In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import warnings
warnings.filterwarnings('ignore')
import os
import time

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import average_precision_score,precision_score,f1_score,recall_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
# from sklearn.datasets import load_breast_cancer

In [None]:
# Read the complete CSV file; its first column becomes the DataFrame index.
df = pd.read_csv(filepath_or_buffer='./finalcsv/finalcsv30s.csv', index_col=0)
df

In [None]:
# Encode the genre strings in 'label' as integer class ids 0-9.
# An explicit mapping keeps each genre next to its id, and the final astype
# guarantees an integer column instead of relying on pandas' implicit
# downcasting inside `replace` (deprecated in recent pandas releases).
# It also fails loudly if an unexpected genre string is present.
# The cell stays re-runnable: already-encoded integers pass through unchanged.
genre_to_id = {
    'blues': 0,
    'classical': 1,
    'country': 2,
    'disco': 3,
    'hiphop': 4,
    'jazz': 5,
    'metal': 6,
    'pop': 7,
    'reggae': 8,
    'rock': 9,
}
df['label'] = df['label'].replace(genre_to_id).astype('int64')

In [None]:
# Remove the identifier / metadata columns (song name, video name, URL, song id).
df = df.drop(columns=['song_name', 'videoname', 'url', 'songid'])
# Display the frame to confirm that only numeric columns remain.
df

In [None]:
# Separate the prediction target from the feature columns.
y = df['label']

# Alternative: every column except the label.
# X = df.loc[:, df.columns != 'label']
# NOTE(review): the positional selection below assumes the first eight columns
# are the features and that 'label' is not among them -- confirm against the CSV.
X = df.iloc[:, list(range(8))]

#### NORMALIZE X ####

# Rescale every feature with a MinMaxScaler (default settings).
cols = X.columns
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit(X).transform(X)

# Rebuild X as a DataFrame holding the rescaled values under the original column names.
X = pd.DataFrame(data=np_scaled, columns=cols)

In [None]:
# Display the rescaled feature matrix as a visual sanity check.
X

In [None]:
# Display the encoded target column as a visual sanity check.
y

In [None]:
# Split into a training set (70%) and a held-out test set (30%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Avoid preprocessing leakage: X was min-max scaled using statistics from ALL
# rows, test rows included. Min-max scaling is an affine map, so re-fitting a
# scaler on the training partition alone yields exactly the values that scaling
# the raw features with train-only minima/maxima would have produced.
# Test values may now fall slightly outside [0, 1]; that is expected.
train_scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(train_scaler.transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(train_scaler.transform(X_test),
                      columns=X_test.columns, index=X_test.index)

# KNN超參數優化器

In [None]:
# Time a 5-fold grid search over k-nearest-neighbours settings and report the
# best parameter set together with its accuracy on the held-out test set.
start = time.perf_counter()

kn = KNeighborsClassifier()
params = dict(
    n_neighbors=[10],                                     # number of neighbours
    weights=['uniform', 'distance'],                      # unweighted vs. distance-weighted votes
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],  # neighbour-search algorithm
)

# cv=5 -> 5-fold cross-validation; n_jobs=-1 -> use every CPU core;
# refit=True -> retrain the best candidate on the whole training set.
grid_kn = GridSearchCV(kn, params, scoring='accuracy', cv=5, refit=True, n_jobs=-1)
grid_kn.fit(X_train, y_train)

# Predictions of the refitted best estimator on the test set.
y_pred = grid_kn.predict(X_test)

print(grid_kn.best_params_)
print(grid_kn.score(X_test, y_test))
print("This time is being calculated")

# Elapsed wall-clock seconds for the search, the prediction and the scoring.
end = time.perf_counter()
print(end - start)

# Random Forest超參數優化器

In [None]:
# Time a 5-fold grid search over random-forest settings and report the best
# parameter set together with its accuracy on the held-out test set.
start = time.perf_counter()


rfc = RandomForestClassifier(random_state=42)
params = {
    'n_estimators': [500, 1000],    # number of trees in the forest
    # 'auto' was dropped from this list: for classifiers it is only an alias of
    # 'sqrt' (duplicated work) and newer scikit-learn releases reject it, which
    # would make a third of the candidates fail to fit.
    'max_features': ['sqrt', 'log2'],    # features considered at each split
    'max_depth': [8, 10, 15, 20],    # maximum depth of each tree
    'criterion': ['gini', 'entropy'],    # split quality measure
}

# scoring is stated explicitly so this search matches the other model searches
# (accuracy is also the classifier's default score).
grid_rfc = GridSearchCV(estimator=rfc,
                        param_grid=params,
                        scoring='accuracy',
                        cv=5,
                        refit=True,
                        n_jobs=-1)

grid_rfc.fit(X_train, y_train)

# Predictions of the refitted best estimator on the test set.
y_pred = grid_rfc.predict(X_test)

print(grid_rfc.best_params_)
print(grid_rfc.score(X_test, y_test))


print("This time is being calculated")

# Elapsed wall-clock seconds for the search, the prediction and the scoring.
end = time.perf_counter()

print(end - start)

# SVC超參數優化器

In [None]:
# Time a 5-fold grid search over support-vector-classifier settings and report
# the best parameter set together with its accuracy on the held-out test set.
start = time.perf_counter()


svc = SVC()
# One sub-grid per kernel so that only parameters the kernel actually uses are
# searched:
#   * gamma is a kernel coefficient that the linear kernel ignores;
#   * decision_function_shape only changes the shape of decision_function()
#     output, not the predicted labels, so it is left at its default.
# The former single grid fitted 64 candidates that covered only these 20
# distinct models.
params = [
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],    # penalty (inverse regularisation) parameter
    },
    {
        'kernel': ['rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],    # RBF kernel coefficient
    },
]
grid_svc = GridSearchCV(estimator=svc,
                        param_grid=params,
                        scoring='accuracy',
                        cv=5,
                        refit=True,
                        n_jobs=-1)

grid_svc.fit(X_train, y_train)

# Predictions of the refitted best estimator on the test set.
y_pred = grid_svc.predict(X_test)

print(grid_svc.best_params_)
print(grid_svc.score(X_test, y_test))

print("This time is being calculated")

# Elapsed wall-clock seconds for the search, the prediction and the scoring.
end = time.perf_counter()

print(end - start)

# Logistic Regression超參數優化器

In [None]:
# Time a 5-fold grid search over logistic-regression settings and report the
# best parameter set together with its accuracy on the held-out test set.
start = time.perf_counter()


# No default solver supports this whole grid (lbfgs rejects the l1 penalty,
# liblinear rejects the multinomial formulation), so part of the candidates
# used to fail silently and be scored as NaN. 'saga' handles l1 and l2 with
# both multinomial and one-vs-rest. It is a stochastic solver, hence the fixed
# random_state and the larger iteration budget for weakly regularised fits.
lg = LogisticRegression(solver='saga', max_iter=5000, random_state=42)
params = {
    "C": [1, 10, 100, 1000],    # inverse regularisation strength; smaller = stronger
    "penalty": ["l1", "l2"],    # regularisation norm
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # drop this entry when upgrading.
    'multi_class': ['multinomial', 'ovr'],    # multiclass formulation
}
grid_lg = GridSearchCV(estimator=lg,
                       param_grid=params,
                       scoring='accuracy',
                       cv=5,
                       refit=True,
                       n_jobs=-1)

grid_lg.fit(X_train, y_train)

# Predictions of the refitted best estimator on the test set.
y_pred = grid_lg.predict(X_test)

print(grid_lg.best_params_)
print(grid_lg.score(X_test, y_test))

print("This time is being calculated")

# Elapsed wall-clock seconds for the search, the prediction and the scoring.
end = time.perf_counter()

print(end - start)

# XGBoost超參數優化器

In [None]:
# Time a 5-fold grid search over XGBoost settings and report the best
# parameter set together with its accuracy on the held-out test set.
start = time.perf_counter()


xgb = XGBClassifier()
params = {
    # The target has ten genre classes, so a multiclass objective is required;
    # 'binary:logistic' is at best silently overridden by the wrapper.
    'objective': ['multi:softprob'],    # per-class probabilities
    'learning_rate': [0.3, 0.1],    # shrinkage step applied at each boosting round (0-1)
    'max_depth': [6, 24],    # maximum tree depth
    'min_child_weight': [1, 10],    # minimum summed instance weight needed to split a leaf further
    'subsample': [0.6, 0.8],    # fraction of rows sampled for each tree (0-1)
    'colsample_bytree': [0.6, 0.8],    # fraction of columns sampled for each tree (0-1)
    'n_estimators': [10, 100],    # number of boosting rounds (weak learners)
    # scikit-learn style name for the seed, fixed at 42 for reproducibility.
    'random_state': [42],
}

# refit=True is the default; it is spelled out to match the other searches.
grid_xgb = GridSearchCV(estimator=xgb,
                        param_grid=params,
                        scoring='accuracy',
                        cv=5,
                        refit=True,
                        n_jobs=-1)

grid_xgb.fit(X_train, y_train)

# Predictions of the refitted best estimator on the test set.
y_pred = grid_xgb.predict(X_test)

print(grid_xgb.best_params_)
print(grid_xgb.score(X_test, y_test))

print("This time is being calculated")

# Elapsed wall-clock seconds for the search, the prediction and the scoring.
end = time.perf_counter()

print(end - start)

# 混淆矩陣及分數

In [None]:
# Confusion matrix of the most recently computed `y_pred`, i.e. whichever
# grid-search cell was executed last.
genre_names = ["blues", "classical", "country", "disco", "hiphop",
               "jazz", "metal", "pop", "reggae", "rock"]

# Passing `labels` pins the matrix to all ten classes in id order, so rows and
# columns always line up with the tick labels even if a class is absent from
# y_test / y_pred.
confusion_matr = confusion_matrix(y_test, y_pred,
                                  labels=list(range(len(genre_names))))  # normalize = 'true'
plt.figure(figsize=(16, 9))
# fmt="d" prints the counts as plain integers; the default format switches to
# scientific notation for larger counts. Change it to e.g. ".2f" if the
# commented-out normalize option is enabled, since the cells are then floats.
ax = sns.heatmap(confusion_matr, cmap="Blues", annot=True, fmt="d",
                 xticklabels=genre_names,
                 yticklabels=genre_names)
# Rows of a scikit-learn confusion matrix are true classes, columns are predictions.
ax.set_xlabel("Predicted genre")
ax.set_ylabel("True genre");
# plt.savefig("conf matrix")

In [None]:
# Print precision, recall and F1 for the current `y_pred` under each averaging
# scheme (weighted, macro, micro), one titled section per scheme.
scorers = (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1-score', f1_score),
)
for averaging in ('weighted', 'macro', 'micro'):
    heading = averaging.capitalize()
    print('------' + heading + '------')
    for metric_name, scorer in scorers:
        print(heading + ' ' + metric_name, scorer(y_test, y_pred, average=averaging))