# Vietnamese Music Classification
## 1. Data preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
import os
import glob
import pandas as pd

# Directory that holds the individual CSV files.
directory = './data'

# Collect every CSV file in the directory. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the merged row order (and
# with it the seeded train/test split performed later) reproducible.
all_files = sorted(glob.glob(os.path.join(directory, '*.csv')))
if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in %r" % directory)

# Read the files lazily and stack them into one DataFrame with a fresh index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
merged_df = pd.concat(df_from_each_file, ignore_index=True)

# Save the merged DataFrame as a single CSV file (index column omitted).
merged_df.to_csv('all_data.csv', index=False)

In [None]:
# Load the merged CSV (written by the merge cell above) as the working DataFrame.
dataset = pd.read_csv('./all_data.csv')

In [None]:
# Preview the first rows of the merged data.
dataset.head()

In [None]:
# Summarise column dtypes and non-null counts.
dataset.info()

In [None]:
# Report, per column position, how many entries are missing and what share
# of all rows that represents.
n_rows = len(dataset)
for col_pos, col_name in enumerate(dataset.columns):
    n_missing = dataset[col_name].isna().sum()
    pct_missing = n_missing / n_rows * 100
    print('>%d,  missing entries: %d, percentage %.2f' % (col_pos, n_missing, pct_missing))

In [None]:
# Draw the missing-value mask as a heatmap on a new 4x4 figure
# (colour bar and row tick labels switched off).
plt.figure(figsize=(4, 4))
missing_mask = dataset.isna()
sns.heatmap(missing_mask, cbar=False, cmap='viridis', yticklabels=False)

In [None]:
# Features: every column from the third up to (but excluding) the last one.
# Target: the last column. `.values` hands back the underlying arrays, which
# is optional for scikit-learn but kept as a personal preference.
feature_frame = dataset.iloc[:, 2:-1]
target_column = dataset.iloc[:, -1]
X = feature_frame.values
y = target_column.values

In [None]:
# Show the raw feature array.
print(X)

In [None]:
# Show the raw target values taken from the last column.
print(y)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Keep the fitted encoder in `le` so the label mapping stays available.
le = LabelEncoder()
# fit_transform already returns a NumPy array, so no further conversion is needed.
y = le.fit_transform(y)

In [None]:
# Show the target after label encoding.
print(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Training features, before scaling.
print(X_train)

In [None]:
# Test features, before scaling.
print(X_test)

In [None]:
# Encoded training labels.
print(y_train)

In [None]:
# Encoded test labels.
print(y_test)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training set only and rebind the name to the scaled
# array. Assigning into X_train[:, :] instead would cast the scaled values
# back to the original array's dtype (e.g. truncating them for integer
# features, or storing them as Python objects for an object array).
X_train = sc.fit_transform(X_train)
# Only transform the test set, so it is scaled with the SAME statistics
# that were learned from the training set.
X_test = sc.transform(X_test)

In [None]:
# Scaled training features. This cell used to print X_test, duplicating the
# next cell; the pre-scaling cells show X_train first and X_test second.
print(X_train)

In [None]:
# Test features after scaling.
print(X_test)

## 2. Train

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Baseline estimators to compare, keyed by the display name used in the results.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "GBM": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB(),
}

# Helper that fits every model and records its test score
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit each given model on the training data and score it on the test data.

    models : a dict mapping a display name to a Scikit-Learn style model
             (each model is fitted in place)
    X_train : training data
    X_test : testing data
    y_train : labels associated with the training data
    y_test : labels associated with the test data

    Returns a dict mapping each model name to the value of its ``score``
    method on the test data, in the same order as ``models``.
    """
    # Seed NumPy's global random generator before any model is fitted
    np.random.seed(42)
    scores_by_name = {}
    for label in models:
        estimator = models[label]
        # Train on the training split...
        estimator.fit(X_train, y_train)
        # ...then record the score on the held-out split under the model's name
        scores_by_name[label] = estimator.score(X_test, y_test)
    return scores_by_name

In [None]:

# Fit every baseline model and collect its score on the test split.
model_score = fit_and_score(models=models,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test)
# Echo the name -> score dict as the cell output.
model_score

In [None]:
# One-row table (row label "accuracy") with one column per model.
model_compare = pd.DataFrame(model_score, index=["accuracy"])
model_compare

In [None]:
# Transpose so each model becomes a row, then draw one bar per model.
model_compare.T.plot.bar();

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
# NOTE(review): the next cell reassigns all four grid names, so when the cells
# are run in order these values are not the ones used by the search.

# Search space for LogisticRegression()
log_reg_grid = dict(
    C=np.logspace(-4, 4, 20),  # the most valuable setting for the Log Reg model
    solver=["liblinear"],
)

# Search space for RandomForestClassifier()
rf_grid = dict(
    n_estimators=np.arange(10, 1000, 50),
    max_depth=[None, 3, 5, 10],
    min_samples_split=np.arange(2, 20, 2),
    min_samples_leaf=np.arange(1, 20, 2),
)

# Search space for SVM
svm_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Search space for GradientBoostingClassifier
gbm_grid = dict(
    n_estimators=np.arange(50, 251, 50),
    learning_rate=np.linspace(0.01, 0.2, 10),
    max_depth=np.arange(3, 8),
)

In [None]:
# Search spaces used for the randomized search, one dict per estimator.

# LogisticRegression: 20 log-spaced values of C with the liblinear solver.
log_reg_grid = {
    "C": np.logspace(-4, 4, 20),
    "solver": ["liblinear"],
}

# RandomForestClassifier: forest size, depth limit and split/leaf minimums.
rf_grid = {
    "n_estimators": np.arange(10, 1000, 50),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
}

# SVC: 10 log-spaced values of C, both gamma heuristics and four kernels.
svm_grid = {
    'C': np.logspace(-3, 3, 10),
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# GradientBoostingClassifier: learning rate, ensemble size and tree shape.
gbm_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [None]:
# Pair each estimator to tune with its search space, keyed by display name.
params_grid = {
    "Logistic Regression": (LogisticRegression(max_iter=10000), log_reg_grid),
    "Random Forest": (RandomForestClassifier(), rf_grid),
    "SVM": (SVC(), svm_grid),
    "GBM": (GradientBoostingClassifier(), gbm_grid),
}

# Helper that tunes every estimator with RandomizedSearchCV
def tune(params_grid, X_train, X_test, y_train, y_test):
    """
    Run a randomized hyper-parameter search for each estimator and score it.

    params_grid : a dict mapping a display name to an (estimator, search space) pair
    X_train : training data used by the 5-fold search
    X_test : testing data
    y_train : labels associated with the training data
    y_test : labels associated with the test data

    Returns a dict mapping each name to the score of the fitted search
    object on the test data.
    """
    # Seed NumPy's global random generator before the searches start
    np.random.seed(42)
    test_scores = {}
    for label, pair in params_grid.items():
        estimator, search_space = pair
        # 20 sampled candidates, 5-fold cross-validation, all available cores
        search = RandomizedSearchCV(
            estimator,
            param_distributions=search_space,
            cv=5,
            n_iter=20,
            verbose=True,
            n_jobs=-1,
        )
        search.fit(X_train, y_train)
        test_scores[label] = search.score(X_test, y_test)
    return test_scores

In [None]:
# Tune every estimator in params_grid and collect the test score of each search.
rs_scores = tune(params_grid=params_grid,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test)
# Echo the name -> score dict as the cell output.
rs_scores

In [None]:
# Finer search space for LogisticRegression: 30 log-spaced values of C
log_reg_grid = {"C": np.logspace(-4, 4, 30),
                "solver": ["liblinear"]}

# Exhaustive 5-fold grid search over the space above. max_iter matches the
# other LogisticRegression instances in this notebook, so the solver is not
# cut off at the much lower default iteration limit for weakly regularised
# (large C) candidates.
gs_log_reg = GridSearchCV(LogisticRegression(max_iter=10000),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)

# Fit the grid search on the training data
gs_log_reg.fit(X_train, y_train);

In [None]:
# Evaluate the grid search LogisticRegression model
# Score the fitted grid search on the held-out test split.
gs_log_reg.score(X_test, y_test)


## 3. Evaluation

In [None]:
## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score #Classification Problem

In [None]:
# Predict the test-set labels with the fitted grid search.
y_preds = gs_log_reg.predict(X_test)

In [None]:
# Confusion matrix of true test labels against the predictions.
print(confusion_matrix(y_test, y_preds))

In [None]:
sns.set(font_scale=1.5)  # Increase font size


def plot_conf_mat(y_test, y_preds):
    """
    Plot the confusion matrix of the predictions as a Seaborn heatmap.

    y_test : true labels
    y_preds : predicted labels, aligned with y_test
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, y_preds),
                annot=True,  # Annotate the boxes with the counts
                fmt="d",  # Integer counts; the default format shows e.g. 150 as 1.5e+02
                cbar=False,
                cmap="Blues",
                ax=ax)  # Draw on the Axes created above rather than the implicit current one
    ax.set_xlabel("Predicted label")  # predictions go on the x-axis
    ax.set_ylabel("True label")  # true labels go on the y-axis


plot_conf_mat(y_test, y_preds)

In [None]:
# Classification report for the test predictions.
# These figures come from a single test split only.
print(classification_report(y_test, y_preds)) #Based on only 1 test set