In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.neural_network
from sklearn.metrics import confusion_matrix, classification_report
import itertools
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
import csv


In [3]:
data=pd.read_csv("../project_train.csv")
features=data.drop("Label", axis=1)
targets=data["Label"]
X_train, X_test, Y_train, Y_test=train_test_split(features, targets, test_size=0.2, random_state=20)


exclude=['mode', 'key']
columns_to_standardize=X_train.drop(exclude, axis=1)
excluded_cols=X_train[exclude]
scaler = StandardScaler()
scaled_columns = pd.DataFrame(scaler.fit_transform(columns_to_standardize), columns=columns_to_standardize.columns)

X_train_scaled = pd.concat([excluded_cols.reset_index(drop=True), scaled_columns], axis=1)

exclude=['mode', 'key']
columns_to_standardize=X_test.drop(exclude, axis=1)
excluded_cols=X_test[exclude]
X_test_scaled=scaler.transform(columns_to_standardize)
X_test_scaled = pd.DataFrame(scaler.transform(columns_to_standardize), columns=columns_to_standardize.columns)
X_test_scaled = pd.concat([excluded_cols.reset_index(drop=True), X_test_scaled], axis=1)


X_train_scaled.head()



Unnamed: 0,mode,key,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,1,5,1.06159,-0.046587,0.300757,-0.464073,-0.079908,-0.537923,-0.563034,-0.303212,-0.294702
1,0,11,-1.433846,1.154419,0.315478,3.035649,-0.851176,-0.422346,3.592413,-1.500278,-0.814173
2,0,1,0.633348,0.383282,0.244817,0.22407,-0.582249,-0.538041,-0.000919,0.554792,-0.367778
3,1,0,-0.99503,0.734395,0.595879,-0.54304,-0.844138,-0.535957,-0.531514,-1.191322,0.641569
4,1,9,0.987573,0.429222,0.623919,-0.67928,-0.65666,-0.538041,0.041108,1.710086,-0.114839


In [4]:

knn_model=KNeighborsClassifier(n_neighbors=3,p=1,weights='distance')
knn_model.fit(X_train_scaled, Y_train)
print(knn_model.score(X_test_scaled,Y_test))

0.8118811881188119


In [5]:
layer_sizes=[10, 25, 50,75]
max_layers=4
hidden_layer_sizes = [tuple(layer_size) for n in range(1, max_layers + 1) for layer_size in itertools.product(layer_sizes, repeat=n)]
hidden_layer_sizes=[(25,10,75,10),(10,25),(25,75),(50,75),(75,25),(50,25),(10,25,75)]
params={'hidden_layer_sizes' : hidden_layer_sizes, 'alpha' : [0, 0.0001, 0.001],
        'learning_rate_init': [0.00001, 0.0001, 0.001],  'max_iter': [10000]  }


#neural_network=GridSearchCV(sklearn.neural_network.MLPClassifier(), params, n_jobs=-1)
neural_network=sklearn.neural_network.MLPClassifier(alpha=0.001, hidden_layer_sizes=(50, 75), learning_rate_init=0.0001, max_iter=15000)
neural_network.fit(X_train_scaled, Y_train)

y_pred=neural_network.predict(X_test_scaled)
#print(neural_network.best_params_)
Y_test.reset_index(drop=True)


con_matrix=confusion_matrix(Y_test, y_pred)
report = classification_report(Y_test, y_pred)
print(con_matrix)
print(report)

#{'alpha': 0.0001, 'hidden_layer_sizes': (25,), 'learning_rate_init': 0.001, 'max_iter': 10000} NOT STANDARDIZED
#{'alpha': 0.0001, 'hidden_layer_sizes': (25, 10, 75, 10), 'learning_rate_init': 0.0001, 'max_iter': 10000} STANDARDIZED


[[46 11]
 [15 29]]
              precision    recall  f1-score   support

           0       0.75      0.81      0.78        57
           1       0.72      0.66      0.69        44

    accuracy                           0.74       101
   macro avg       0.74      0.73      0.74       101
weighted avg       0.74      0.74      0.74       101



In [6]:
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

#svm_params={'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf'], 'probability': [True]}
#svm_model=GridSearchCV(SVC(),svm_params, n_jobs=-1) #{'C': 10, 'gamma': 1, 'kernel': 'linear', 'probability': True}
svm_model=SVC(C=200,kernel='linear', probability=True)
svm_model.fit(X_train_scaled,Y_train)
#print(svm_model.best_params_)

#rfc_params={'n_estimators':[100, 200, 400, 800], 'criterion':["gini", "entropy"]}
#rfc_model=GridSearchCV(RandomForestClassifier(), rfc_params, n_jobs=-1)
rfc_model=RandomForestClassifier(criterion='gini',n_estimators=400, max_depth=40)
rfc_model.fit(X_train_scaled,Y_train)


knn_model=KNeighborsClassifier(n_neighbors=3,p=1,weights='distance')
knn_model.fit(X_train_scaled, Y_train)

lda_model=LinearDiscriminantAnalysis()
lda_model.fit(X_train_scaled,Y_train)

meta_model=LogisticRegression()

stacked_model = StackingClassifier(
    estimators=[('svm', svm_model), ('ann', neural_network),('knn',knn_model) ,('rf', rfc_model), ('lda',lda_model)],
    final_estimator=meta_model,
    n_jobs=-1,
    cv="prefit"
)
stacked_model.fit(X_train_scaled,Y_train)


accuracy=stacked_model.score(X_test_scaled,Y_test)

print(f"Accuracy: {accuracy}")
""""
knn_vec_acc=np.zeros(10)
accuracy_vec=np.zeros(10)
for l in range(0,10):
    print(l)
    X_train, X_test, Y_train, Y_test=train_test_split(features, targets, test_size=0.2)

    exclude=['mode', 'key']
    columns_to_standardize=X_train.drop(exclude, axis=1)
    excluded_cols=X_train[exclude]
    scaler = StandardScaler()
    scaled_columns = pd.DataFrame(scaler.fit_transform(columns_to_standardize), columns=columns_to_standardize.columns)

    X_train_scaled = pd.concat([excluded_cols.reset_index(drop=True), scaled_columns], axis=1)

    exclude=['mode', 'key']
    columns_to_standardize=X_test.drop(exclude, axis=1)
    excluded_cols=X_test[exclude]
    X_test_scaled=scaler.transform(columns_to_standardize)
    X_test_scaled = pd.DataFrame(scaler.transform(columns_to_standardize), columns=columns_to_standardize.columns)
    X_test_scaled = pd.concat([excluded_cols.reset_index(drop=True), X_test_scaled], axis=1)

    stacked_model.fit(X_train_scaled,Y_train)
    accuracy_vec[l] = stacked_model.score(X_test_scaled, Y_test)
    print(f"stacked {accuracy_vec[l]}")
    #print(f"Accuracy: {accuracy}")

print(np.mean(accuracy_vec))
print(np.std(accuracy_vec))"""





0.7425742574257426
Accuracy: 0.8316831683168316


'"\nknn_vec_acc=np.zeros(10)\naccuracy_vec=np.zeros(10)\nfor l in range(0,10):\n    print(l)\n    X_train, X_test, Y_train, Y_test=train_test_split(features, targets, test_size=0.2)\n\n    exclude=[\'mode\', \'key\']\n    columns_to_standardize=X_train.drop(exclude, axis=1)\n    excluded_cols=X_train[exclude]\n    scaler = StandardScaler()\n    scaled_columns = pd.DataFrame(scaler.fit_transform(columns_to_standardize), columns=columns_to_standardize.columns)\n\n    X_train_scaled = pd.concat([excluded_cols.reset_index(drop=True), scaled_columns], axis=1)\n\n    exclude=[\'mode\', \'key\']\n    columns_to_standardize=X_test.drop(exclude, axis=1)\n    excluded_cols=X_test[exclude]\n    X_test_scaled=scaler.transform(columns_to_standardize)\n    X_test_scaled = pd.DataFrame(scaler.transform(columns_to_standardize), columns=columns_to_standardize.columns)\n    X_test_scaled = pd.concat([excluded_cols.reset_index(drop=True), X_test_scaled], axis=1)\n\n    stacked_model.fit(X_train_scaled,Y_

In [86]:
from sklearn.ensemble import RandomForestClassifier
rfc_params={'n_estimators':[100, 200, 400, 800], 'criterion':["gini", "entropy"]}
rfc_model=GridSearchCV(RandomForestClassifier(), rfc_params, n_jobs=-1)
rfc_model.fit(X_train_scaled,Y_train)
print(rfc_model.score(X_test_scaled,Y_test))
print(rfc_model.best_params_)

0.8217821782178217
{'criterion': 'entropy', 'n_estimators': 200}


In [33]:
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
svm_params={'C':[100,200,400,500,800],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf', 'poly', 'sigmoid'],
             'probability': [True]}
svm_model=GridSearchCV(SVC(),svm_params, n_jobs=-1)
svm_model.fit(X_train_scaled,Y_train)
print(svm_model.score(X_test_scaled,Y_test))


0.7425742574257426


In [54]:
print(svm_model.best_params_)

{'C': 200, 'degree': 2, 'gamma': 1, 'kernel': 'linear', 'probability': True}


In [43]:
from sklearn.neighbors import KNeighborsClassifier

knn_params={'n_neighbors': [3,5,7,9,11,13,15,17,19,21], 'weights':['uniform','distance'], 'p':[1,2,3]}
knn_model=GridSearchCV(KNeighborsClassifier(), knn_params, n_jobs=-1)
knn_model.fit(X_train_scaled, Y_train)
print(knn_model.score(X_test_scaled,Y_test))
print(knn_model.best_params_)


0.7425742574257426
{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}


In [26]:
rfc_params={'n_estimators':[100, 200,300, 400, 800], 'criterion':["gini", "entropy"], 'max_depth':[10,20,30,40,50]}
rfc_model=GridSearchCV(RandomForestClassifier(), rfc_params, n_jobs=-1)
rfc_model.fit(X_train_scaled,Y_train)
rfc_model.best_params_

{'criterion': 'gini', 'max_depth': 40, 'n_estimators': 400}

In [11]:
test_data=pd.read_csv("../project_test.csv")

exclude=['mode', 'key']
columns_to_standardize=test_data.drop(exclude, axis=1)
excluded_cols=test_data[exclude]
test_data_scaled=scaler.transform(columns_to_standardize)
test_data_scaled = pd.DataFrame(scaler.transform(columns_to_standardize), columns=columns_to_standardize.columns)
test_data_scaled = pd.concat([excluded_cols.reset_index(drop=True), test_data_scaled], axis=1)


test_data_scaled.head()

predictions_final=stacked_model.predict(test_data_scaled)
pd.DataFrame(predictions_final).to_csv('FINAL_PREDICTION.csv',index=False)