In [1]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Download data
!wget -O training_data.csv http://handsonml.control.lth.se/data/training_data.csv
!wget -O songs_to_classify.csv http://handsonml.control.lth.se/data/songs_to_classify.csv

In [43]:
# Load data
train = pd.read_csv("training_data.csv")
test = pd.read_csv("songs_to_classify.csv")


qtrain, qtest,= train_test_split( train, test_size=0.4, random_state=42)

train.shape, test.shape, qtrain.shape, qtest.shape

((750, 14), (200, 13), (450, 14), (300, 14))

In [None]:
# Inspect data
train.sample(5)

In [None]:
pd.plotting.scatter_matrix(train, figsize=(10, 10));

In [None]:
corr = train.corr()
corr

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
selector  = SelectKBest(chi2, k=10)
selector.fit(abs(train.iloc[:,:12]),train.loc[:,'label'].values)
cols = selector.get_support(indices=True)
features_df_new = abs(train.iloc[:,:12]).iloc[:,cols]
features_df_new

In [44]:
# select which features to use
features = ['danceability','energy','instrumentalness','tempo','acousticness','liveness','speechiness','valence','loudness']
X_train = qtrain.loc[:,features].values
y_train = qtrain.loc[:,'label'].values
X_test = qtest.loc[:,features].values
y_test = qtest.loc[:,'label'].values
X_test2 = test.loc[:,features].values


In [49]:
# Normalize data. Can also be done using sklearn methods such as
# MinMaxScaler() or StandardScaler()
#X_trainn = X_train*1/np.max(np.abs(X_train), axis=0)
#X_testn = X_test*1/np.max(np.abs(X_train), axis=0)
scaler = StandardScaler()
scaler.fit(X_train)
scaler.fit(X_test)
#scaler.transform(X_train)
#scaler.transform(X_test)
X_trainn = scaler.transform(X_train)
X_testn = scaler.transform(X_test2)
X_qtest = scaler.transform(X_test)
X_trainn.shape, y_train.shape, X_qtest.shape, y_test.shape, X_testn.shape


((450, 9), (450,), (300, 9), (300,), (200, 9))

In [None]:
# note: all inputs/features are treated as quantitative/numeric
# some of the features are perhaps more sensible to treat as
# qualitative/cathegorical. For that sklearn preprocessing methods
# such as OneHotEncoder() can be used

# define the k-NN model. To set n_neighbors in a systematic way, use cross validation!
#knnmodel = KNeighborsClassifier(n_neighbors=5)
# feed it with data and train it
#knnmodel.fit(X_trainn, y_train)
#clf = knnmodel.fit(X_trainn, y_train)
#clf = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
#clf = svm.SVC()
#model=LogisticRegression()
#clf = model.fit(X_trainn, y_train)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the param grid
param_grid = {'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_depth': list(range(10, 15)),         
        }

rf = RandomForestClassifier()
rf_Grid = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 10)
rf_Grid.fit(X_trainn, y_train)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",rf_Grid.best_estimator_)
print("\n The best score across ALL searched params:\n",rf_Grid.best_score_)
print("\n The best parameters across ALL searched params:\n",rf_Grid.best_params_)

#print (f'Train Accuracy - : {clf.score(X_trainn,y_train):.3f}')
#print("Best: %f using %s" % (clf.best_score_,rf_Grid.best_params_))

#scores = cross_val_score(clf, X_trainn, y_train, cv=5)
#print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# make predictions
#predictions = clf.predict(X=X_testn)
#print(predictions)
#"".join([str(int(elem)) for elem in predictions.tolist()])

 Results from Grid Search 

 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=13, min_samples_leaf=2)

 The best score across ALL searched params:
 0.8266666666666665

 The best parameters across ALL searched params:
 {'max_depth': 13, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [None]:
#GradientBoostingClassifier for boosting
#learning_rate = [0.01,0.1]
#n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
#min_samples_split = [2,3,4]
#min_samples_leaf = [1,2]
#max_depth = [10,20,30]
param_grid = {'learning_rate': [0.01,0.02,0.03],
        'n_estimators': [100,500,1000],
        'max_depth': [4,6,8],
        'subsample': [0.9,0.5,0.2]
        }
gbc = GradientBoostingClassifier()
boost_Grid = GridSearchCV(estimator = gbc, param_grid = param_grid, cv = 5, n_jobs=-1)
boost_Grid.fit(X_trainn, y_train)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",boost_Grid.best_estimator_)
print("\n The best score across ALL searched params:\n",boost_Grid.best_score_)
print("\n The best parameters across ALL searched params:\n",boost_Grid.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingClassifier(learning_rate=0.01, max_depth=4, n_estimators=1000,
                           subsample=0.2)

 The best score across ALL searched params:
 0.8244444444444445

 The best parameters across ALL searched params:
 {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 1000, 'subsample': 0.2}


In [None]:
clf = boost_Grid.best_estimator_
y_pred = clf.predict(X_qtest)
print(accuracy_score(y_test, y_pred))
predictions = clf.predict(X=X_testn)
"".join([str(int(elem)) for elem in predictions.tolist()])
#scores = cross_val_score(clf, X_qtest, y_test, cv=5)
#print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.8433333333333334


'00010011001101101011001000001111011111010101110110001101100010101111101111110110110101110000001011111010010111111011101001101110101011111111101011001010001111101101111111111001111011111110100111110111'

In [None]:
clf = RandomForestClassifier()
clf.fit(X_trainn, y_train)
y_pred = clf.predict(X_qtest)
print(accuracy_score(y_test, y_pred))
predictions = clf.predict(X=X_testn)
"".join([str(int(elem)) for elem in predictions.tolist()])

0.84


'00010011001101101011001100000111011111010101110110001101100011100111101011110110110111110000011011111010010111110010101001101110101011111111101011001010001111101101101111111001111011111110100111110111'

In [None]:
clf = rf_Grid.best_estimator_
y_pred = clf.predict(X_qtest)
print(accuracy_score(y_test, y_pred))
#scores = cross_val_score(clf, X_test, y_test, cv=5)
#print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.8333333333333334
