In [1]:
import pandas as pd
from scipy.stats import skew
from scipy.stats import kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, confusion_matrix, classification_report

In [2]:
#load data
df1=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/alternative.csv",keep_default_na=False,na_values=['?'])
df2=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/blues.csv",keep_default_na=False,na_values=['?'])
df3=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/childrens music.csv",keep_default_na=False,na_values=['?'])
df4=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/comedy.csv",keep_default_na=False,na_values=['?'])
df5=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/electronic.csv",keep_default_na=False,na_values=['?'])
df6=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/folk.csv",keep_default_na=False,na_values=['?'])
df7=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/hip-hop.csv",keep_default_na=False,na_values=['?'])
df8=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/movie.csv",keep_default_na=False,na_values=['?'])
df9=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/ska.csv",keep_default_na=False,na_values=['?'])
df10=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/training-data/soul.csv",keep_default_na=False,na_values=['?'])


In [3]:
#check if the 10 datasets have same structure
training_dfs=[df2,df3,df4,df5,df6,df7,df8,df9,df10]
if all([set(df1.columns)==set(tdf.columns) for tdf in training_dfs]):
    print("structure of training sets are identical")

structure of training sets are identical


In [4]:
#merge to one dataset
df=pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10])
df.shape

(50000, 19)

In [5]:
#preprocess the data
#1) split the dataset to a training set(90%) and avalidation set(10%) to avoid data leakage
x=df.drop("genre", axis = 1)
y=df["genre"]
train_x,val_x,train_y,val_y=train_test_split(x, y, test_size=0.1, random_state=309)

In [6]:
#2) drop irrelavant features and encode categorical features
train_x_clean=train_x.drop(["instance_id","artist_name","track_name","track_id"], axis = 1)
key_encoder=LabelEncoder()
train_x_clean["key"]=key_encoder.fit_transform(train_x_clean["key"])
mode_encoder=LabelEncoder()
train_x_clean["mode"]=mode_encoder.fit_transform(train_x_clean["mode"])
ts_encoder=LabelEncoder()
train_x_clean["time_signature"]=ts_encoder.fit_transform(train_x_clean["time_signature"])

#3) encode target variable
genre_encoder=LabelEncoder()
train_y_encoded=genre_encoder.fit_transform(train_y)

train_x_clean.dtypes

popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
speechiness         float64
tempo               float64
time_signature        int64
valence             float64
dtype: object

In [23]:
#check missing values
print(train_x_clean.isnull().sum())

popularity             0
acousticness           0
danceability           0
duration_ms         9015
energy                 0
instrumentalness       0
key                    0
liveness               0
loudness               0
mode                   0
speechiness            0
tempo                  0
time_signature         0
valence                0
dtype: int64


In [24]:
#4) impute missing values

# duration_ms
from sklearn.impute import KNNImputer
dms_imputer = KNNImputer(n_neighbors=2, weights="uniform")
train_x_clean["duration_ms"]=train_x_clean["duration_ms"].replace(-1,np.NaN)
train_x_clean["duration_ms"] = dms_imputer.fit_transform(train_x_clean[["duration_ms"]])
# tempo
tempo_imputer = KNNImputer(n_neighbors=2, weights="uniform")
train_x_clean["tempo"] = tempo_imputer.fit_transform(train_x_clean[["tempo"]])

print("The number of missing values ")
print(train_x_clean.isnull().sum())

The number of missing values 
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64


In [25]:
#5) remove outliers
from scipy import stats
train_x_clean[(np.abs(stats.zscore(train_x_clean)) < 3).all(axis=1)]

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
3874,27,0.0405,0.513,301213.000000,0.848,0.000015,5,0.3840,-9.462,0,0.0563,118.856081,2,0.685
981,48,0.0126,0.843,344120.000000,0.448,0.000944,3,0.0826,-10.398,0,0.3160,119.564000,2,0.172
4966,36,0.7870,0.817,227367.444685,0.189,0.000000,3,0.0843,-14.537,0,0.0713,127.846000,2,0.566
3760,25,0.0267,0.367,122000.000000,0.931,0.000000,2,0.2400,-11.898,1,0.0442,125.761000,2,0.925
3438,50,0.0362,0.889,227367.444685,0.557,0.000000,7,0.3320,-8.148,0,0.3520,97.176000,2,0.805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3475,55,0.0611,0.865,227367.444685,0.641,0.000107,1,0.2180,-6.652,1,0.1620,95.307000,2,0.762
4617,31,0.6880,0.633,164853.000000,0.537,0.000000,7,0.2870,-12.827,1,0.0306,131.639000,2,0.736
89,37,0.7010,0.502,150667.000000,0.659,0.000225,5,0.1150,-7.473,0,0.0416,101.382000,2,0.841
319,50,0.0719,0.771,164516.000000,0.834,0.013400,5,0.1630,-4.883,1,0.0481,118.856081,2,0.280


In [26]:
#6) Scale the data
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(train_x_clean)
print(x_scaled[1,:])

[0.5        0.01264933 0.84255365 0.0677637  0.44832917 0.00095065
 0.27272727 0.07108141 0.66479414 0.         0.31162495 0.46465345
 0.66666667 0.172     ]


In [36]:
#7) perform feature selection
from sklearn.feature_selection import SelectKBest, chi2
selector=SelectKBest(chi2, k=12)
train_x_final = selector.fit_transform(x_scaled, train_y_encoded)

In [37]:
train_x_final[1,:]

array([0.5       , 0.01264933, 0.84255365, 0.44832917, 0.00095065,
       0.07108141, 0.66479414, 0.        , 0.31162495, 0.46465345,
       0.66666667, 0.172     ])

In [46]:
#7) Transform the validation set
val_x_clean=val_x.drop(["instance_id","artist_name","track_name","track_id"], axis = 1)
val_x_clean["key"]=key_encoder.transform(val_x_clean["key"])
val_x_clean["mode"]=mode_encoder.transform(val_x_clean["mode"])
val_x_clean["time_signature"]=ts_encoder.transform(val_x_clean["time_signature"])
val_y_encoded=genre_encoder.transform(val_y)
val_x_clean["duration_ms"]=val_x_clean["duration_ms"].replace(-1,np.NaN)
val_x_clean["duration_ms"] = dms_imputer.transform(val_x_clean[["duration_ms"]])
val_x_clean["tempo"] = tempo_imputer.transform(val_x_clean[["tempo"]])
val_x_scaled=min_max_scaler.transform(val_x_clean)
val_x_final=selector.transform(val_x_scaled)


Modelling

In [47]:
%%time
#knn
from sklearn.neighbors import KNeighborsClassifier
knnmodel = KNeighborsClassifier(n_neighbors=5)
knnmodel.fit(train_x_final, train_y_encoded)
pred_knn = knnmodel.predict(val_x_final)
acc_knn=accuracy_score(val_y_encoded,pred_knn)
print(acc_knn)
print(classification_report(val_y_encoded,pred_knn))

0.5342
              precision    recall  f1-score   support

           0       0.29      0.42      0.35       507
           1       0.40      0.43      0.42       507
           2       0.32      0.27      0.30       452
           3       0.94      0.95      0.94       504
           4       0.59      0.59      0.59       473
           5       0.41      0.44      0.42       493
           6       0.65      0.68      0.67       542
           7       0.76      0.66      0.71       524
           8       0.67      0.61      0.64       493
           9       0.35      0.24      0.28       505

    accuracy                           0.53      5000
   macro avg       0.54      0.53      0.53      5000
weighted avg       0.54      0.53      0.54      5000

CPU times: user 1.31 s, sys: 11.3 ms, total: 1.32 s
Wall time: 1.34 s


In [49]:
%%time
#random forest
#rfmodel = RandomForestClassifier(n_estimators=1200,min_samples_split=10,min_samples_leaf=2,max_features='auto',
#                                 max_depth=70,bootstrap= True)
rfmodel = RandomForestClassifier(max_depth=15)
rfmodel.fit(train_x_final, train_y_encoded)
pred_rf = rfmodel.predict(val_x_final)
acc_rf=accuracy_score(val_y_encoded,pred_rf)
print(acc_rf)
print(classification_report(val_y_encoded,pred_rf))

0.6174
              precision    recall  f1-score   support

           0       0.36      0.37      0.36       507
           1       0.56      0.51      0.54       507
           2       0.41      0.25      0.31       452
           3       0.99      0.94      0.96       504
           4       0.70      0.67      0.68       473
           5       0.49      0.62      0.54       493
           6       0.70      0.86      0.77       542
           7       0.77      0.80      0.78       524
           8       0.74      0.69      0.71       493
           9       0.41      0.41      0.41       505

    accuracy                           0.62      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.61      0.62      0.61      5000

CPU times: user 8.96 s, sys: 89.2 ms, total: 9.05 s
Wall time: 9.09 s


In [45]:
%%time
#decision tree
from sklearn.tree import DecisionTreeClassifier
dtmodel = DecisionTreeClassifier(max_depth=10)
dtmodel.fit(train_x_final, train_y_encoded)
pred_dt = dtmodel.predict(val_x_final)
acc_dt=accuracy_score(val_y_encoded,pred_dt)
print(acc_dt)
print(classification_report(val_y_encoded,pred_dt))

0.5752
              precision    recall  f1-score   support

           0       0.36      0.40      0.38       507
           1       0.48      0.48      0.48       507
           2       0.48      0.31      0.37       452
           3       0.96      0.95      0.95       504
           4       0.58      0.51      0.54       473
           5       0.43      0.60      0.50       493
           6       0.66      0.83      0.74       542
           7       0.77      0.71      0.74       524
           8       0.68      0.60      0.64       493
           9       0.35      0.31      0.33       505

    accuracy                           0.58      5000
   macro avg       0.58      0.57      0.57      5000
weighted avg       0.58      0.58      0.57      5000

CPU times: user 381 ms, sys: 5.97 ms, total: 387 ms
Wall time: 412 ms


In [17]:
%%time
#LR
from sklearn.linear_model import LogisticRegression
lrmodel = LogisticRegression(max_iter=400)
lrmodel.fit(train_x_final, train_y_encoded)
pred_lr = lrmodel.predict(val_x_final)
acc_lr=accuracy_score(val_y_encoded,pred_lr)
print(acc_lr)
print(classification_report(val_y_encoded,pred_lr))

0.5472
              precision    recall  f1-score   support

           0       0.36      0.43      0.39       507
           1       0.46      0.31      0.37       507
           2       0.09      0.05      0.06       452
           3       0.94      0.95      0.95       504
           4       0.59      0.63      0.61       473
           5       0.48      0.53      0.50       493
           6       0.68      0.80      0.74       542
           7       0.67      0.72      0.69       524
           8       0.50      0.67      0.57       493
           9       0.40      0.32      0.35       505

    accuracy                           0.55      5000
   macro avg       0.52      0.54      0.52      5000
weighted avg       0.52      0.55      0.53      5000

CPU times: user 17.9 s, sys: 580 ms, total: 18.5 s
Wall time: 5.18 s


In [18]:
%%time
#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
gbmodel = GradientBoostingClassifier(max_depth=10)
gbmodel.fit(train_x_final, train_y_encoded)
pred_gb = gbmodel.predict(val_x_final)
acc_gb=accuracy_score(val_y_encoded,pred_gb)
print(acc_gb)
print(classification_report(val_y_encoded,pred_gb))

0.5988
              precision    recall  f1-score   support

           0       0.34      0.36      0.35       507
           1       0.53      0.52      0.53       507
           2       0.33      0.30      0.31       452
           3       0.98      0.95      0.96       504
           4       0.70      0.66      0.68       473
           5       0.49      0.54      0.51       493
           6       0.73      0.77      0.75       542
           7       0.78      0.77      0.78       524
           8       0.72      0.68      0.70       493
           9       0.39      0.40      0.40       505

    accuracy                           0.60      5000
   macro avg       0.60      0.59      0.60      5000
weighted avg       0.60      0.60      0.60      5000

CPU times: user 5min 27s, sys: 3.65 s, total: 5min 31s
Wall time: 5min 35s


In [19]:
%%time
#MLPClassifier  - Multi layer perceptron
from sklearn.neural_network import MLPClassifier
mlpmodel = MLPClassifier()
mlpmodel.fit(train_x_final, train_y_encoded)
pred_mlp = mlpmodel.predict(val_x_final)
acc_mlp=accuracy_score(val_y_encoded,pred_mlp)
print(acc_mlp)
print(classification_report(val_y_encoded,pred_mlp))

0.6236
              precision    recall  f1-score   support

           0       0.41      0.46      0.43       507
           1       0.55      0.53      0.54       507
           2       0.57      0.23      0.33       452
           3       0.98      0.95      0.96       504
           4       0.69      0.65      0.67       473
           5       0.48      0.63      0.55       493
           6       0.69      0.85      0.76       542
           7       0.77      0.79      0.78       524
           8       0.73      0.65      0.69       493
           9       0.40      0.43      0.42       505

    accuracy                           0.62      5000
   macro avg       0.63      0.62      0.61      5000
weighted avg       0.63      0.62      0.62      5000

CPU times: user 2min 49s, sys: 2.93 s, total: 2min 52s
Wall time: 48.5 s




In [20]:
#these codes are to find the best parameter of random forest model
#reference: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth}
print(random_grid)
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, n_jobs = -1)
rf_random.fit(train_x_final, train_y_encoded)
print(rf_random.best_params_)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]}
{'n_estimators': 1000, 'max_depth': 10}


In [50]:
# Load and transform the test set
df_test=pd.read_csv("/Users/Jessie/Documents/JupyterNotebook/ass3data/test-data/test.csv",keep_default_na=False,na_values=['?'])
test_x=df_test.drop(["instance_id","artist_name","track_name","track_id"], axis = 1)
test_x["key"]=key_encoder.transform(test_x["key"])
test_x["mode"]=mode_encoder.transform(test_x["mode"])
test_x["time_signature"] = test_x["time_signature"] .map(lambda s: '<unknown>'  if s not in ts_encoder.classes_ else s)
ts_encoder.classes_ = np.append(ts_encoder.classes_, '<unknown>' )
test_x["time_signature"]=ts_encoder.transform(test_x["time_signature"])
print("The number of missing values ")
print(test_x.isnull().sum())


The number of missing values 
popularity             0
acousticness           0
danceability           0
duration_ms            0
energy                 0
instrumentalness       0
key                    0
liveness               0
loudness               0
mode                   0
speechiness            0
tempo               4598
time_signature         0
valence                0
dtype: int64


In [51]:
#impute missing values
test_x["duration_ms"]=test_x["duration_ms"].replace(-1,np.NaN)
test_x["duration_ms"] = dms_imputer.transform(test_x[["duration_ms"]])
test_x["tempo"] = tempo_imputer.transform(test_x[["tempo"]])
test_x_final=min_max_scaler.transform(test_x)
test_x_final=selector.transform(test_x_final)

print(test_x_final[:10])

[[4.27083333e-01 2.56023140e-01 6.87264100e-01 9.49939126e-01
  2.57804632e-01 8.69785338e-02 8.13837730e-01 1.00000000e+00
  3.89053882e-01 6.05667154e-01 6.66666667e-01 3.65000000e-01]
 [3.95833333e-01 3.65461032e-01 6.33344117e-01 2.96144111e-01
  7.81470292e-06 1.21101661e-01 6.88710327e-01 0.00000000e+00
  1.18795078e-02 2.64291606e-01 6.66666667e-01 6.00000000e-01]
 [5.72916667e-01 4.46786438e-01 4.26291384e-01 4.25301166e-01
  0.00000000e+00 5.97407857e-02 6.62376449e-01 1.00000000e+00
  2.34196012e-01 9.00508181e-01 6.66666667e-01 4.98000000e-01]
 [7.29166667e-02 3.33332477e-01 7.48732880e-01 5.43444829e-01
  0.00000000e+00 3.24625354e-01 7.81435919e-01 0.00000000e+00
  6.04582096e-03 3.01907676e-01 6.66666667e-01 8.39000000e-01]
 [5.62500000e-01 5.84056975e-04 5.75110536e-01 8.94872165e-01
  1.24874119e-06 3.17537465e-01 8.13719216e-01 0.00000000e+00
  1.07127705e-02 3.71121458e-01 6.66666667e-01 6.43000000e-01]
 [3.02083333e-01 1.81598627e-03 3.90704195e-01 9.76971998e-01
  1

In [53]:
#apply the trained model to test data
y_pred=rfmodel.predict(test_x_final) #random forest
#y_pred=mlpmodel.predict(test_x_final) #mlp
y_pred=genre_encoder.inverse_transform(y_pred)
print(y_pred[:10])

['Electronic' 'Soul' "Children's Music" 'Movie' "Children's Music" 'Ska'
 'Alternative' 'Movie' 'Alternative' 'Ska']


In [54]:
#save prediction to csv - ready to upload to kaggle
instance_id=df_test["instance_id"]
prediction=pd.DataFrame(instance_id)
prediction["genre"]=y_pred
prediction.to_csv("ass3data/preds.csv", index=False)
prediction.head

<bound method NDFrame.head of        instance_id             genre
0                1        Electronic
1                2              Soul
2                3  Children's Music
3                4             Movie
4                5  Children's Music
...            ...               ...
30926        30927             Movie
30927        30928       Alternative
30928        30929              Folk
30929        30930              Folk
30930        30931       Alternative

[30931 rows x 2 columns]>

Further Evaluation

In [55]:
#Feature importance
from sklearn.feature_selection import RFE
ranker = RFE(rfmodel, n_features_to_select=1)
ranker.fit(train_x_final, train_y_encoded)
print(f"Model's Feature Importance")
for i in range(len(ranker.ranking_)):
    print(f"#{i+1}: {train_x_clean.columns[ranker.ranking_[i]-1]} ")
    

Model's Feature Importance
#1: acousticness 
#2: danceability 
#3: duration_ms 
#4: energy 
#5: instrumentalness 
#6: key 
#7: loudness 
#8: speechiness 
#9: popularity 
#10: mode 
#11: tempo 
#12: liveness 
