## Proyecto Complementario #1: Evaluando modelos ML

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn import preprocessing
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [3]:
# Load the raw track table from the CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer = 'Spotify_Track_Dataset.csv')

## Data Cleaning

In [4]:
# Remove every row that has at least one missing value.
df = df.dropna(axis = 0, how = 'any')
# Keep only the rows whose duration z-score is below 3 in absolute value.
df_filtro = df.loc[np.abs(stats.zscore(df['duration_ms'])) < 3]
# Work on an independent copy so later edits do not touch the filtered frame.
df1 = df_filtro.copy()

## Data Wrangling

In [5]:
# Derive the modelling frame from the filtered data without mutating anything
# in place, so the cell can be re-executed (the former in-place drops raised
# KeyError on a second run because the columns were already gone).
df1 = (
    df_filtro
    .drop(columns = ['Unnamed: 0'])
    .assign(
        # Duration converted from milliseconds to minutes, two decimals.
        duration_min = lambda d: (d['duration_ms'] / (60 * 1000)).round(2),
        # Sign-flipped copy of the loudness column.
        loudness_db = lambda d: -d['loudness'],
    )
    .drop(columns = ['duration_ms', 'loudness'])
)

# Split the ';'-separated artist string into at most three parts.
# `n` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
df1[['artist_1', 'artist_2', 'artist_3']] = df1['artists'].str.split(';', n = 2, expand = True)
df1 = df1.drop(columns = ['artists'])

## Preprocesado

In [6]:
# Replace the genre names with ordinal codes; the fitted encoder is kept in
# `ord_enc` so the codes can be mapped back to names later.
ord_enc = preprocessing.OrdinalEncoder()
df1['track_genre'] = ord_enc.fit_transform(df1[['track_genre']])

# Feature matrix: everything except the target and the columns listed below.
X = df1.drop(
    columns = [
        'track_genre',
        'track_id', 'album_name', 'track_name',
        'explicit', 'mode', 'key', 'time_signature',
        'artist_1', 'artist_2', 'artist_3',
    ]
)

# Target: the encoded genre.
y = df1['track_genre']

# Every 15th row, used as a smaller sample for the feature-selection cells.
X2 = X.iloc[::15, :]
y2 = y.iloc[::15]

## Selección de modelos de ML

A continuación se probarán distintos modelos de clasificación y se evaluarán las métricas obtenidas. Los algoritmos empleados son KNeighbors, Gaussian Naive Bayes, Decision Tree y Random Forest. Se probaron otros algoritmos, tales como SVC, Quadratic Discriminant, Ada Boost, etcétera, pero no funcionaron adecuadamente o eran extremadamente lentos.

Para cada algoritmo se empleó el Forward Feature Selector para determinar las 3, 5, 7, 9 y 10 variables que el algoritmo considera más relevantes (Sprint 3, Sprint 5, Sprint 7, Sprint 9 y Sprint 10, respectivamente). Luego, para cada Sprint se aplicó el algoritmo y se calcularon las métricas.

### KNeighbors Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [8]:
# Estimator the selector uses to score each candidate feature subset.
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)

# 'sfs_<k>'     -> fitted selector that keeps k features
# 'sfs_<k>_var' -> names of the k selected columns
forw = {}

for i in [3, 5, 7, 9, 10]:
    selector = SequentialFeatureSelector(knn, n_features_to_select = i, direction = 'forward')
    selector.fit(X2, y2)
    forw[f'sfs_{i}'] = selector
    forw[f'sfs_{i}_var'] = X2.columns[selector.get_support()]
    # Restored: this print produces the output recorded below the cell.
    print(f'Las {i} variables seleccionadas son:', list(forw[f'sfs_{i}_var']))

Las 3 variables seleccionadas son: ['popularity', 'danceability', 'acousticness']
Las 5 variables seleccionadas son: ['popularity', 'danceability', 'speechiness', 'acousticness', 'instrumentalness']
Las 7 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence']
Las 9 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'duration_min']
Las 10 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'duration_min', 'loudness_db']


In [44]:
# Fit and score KNN once per feature-subset size chosen by the forward selector.
# NOTE(review): X2/y2 (used for selection) are every 15th row of X/y, so some
# of those rows land in the test split below - scores may be slightly optimistic.
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)

sprint_sizes = [3, 5, 7, 9, 10]
accuracy_forw, recall_forw, f1_forw = [], [], []

for i in sprint_sizes:
    # Full dataset restricted to the columns selected for this size.
    X_i = X[list(forw[f'sfs_{i}_var'])]

    X_train, X_test, y_train, y_test = train_test_split(X_i, y, random_state = 42)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    accuracy_forw.append(accuracy_score(y_test, y_pred))
    recall_forw.append(recall_score(y_test, y_pred, average = 'macro'))
    f1_forw.append(f1_score(y_test, y_pred, average = 'macro'))

# Labels are generated from the sizes so they cannot drift or pick up stray
# whitespace (the hand-typed list contained ' Sprint 7').
comparacion_forw = pd.DataFrame({
    'Modelo': [f'Sprint {k}' for k in sprint_sizes],
    'Accuracy': accuracy_forw,
    'Recall': recall_forw,
    'F1': f1_forw,
})

comparacion_forw.round(2)

Unnamed: 0,Modelo,Accuracy,Recall,F1
0,Sprint 3,0.13,0.13,0.13
1,Sprint 5,0.16,0.16,0.16
2,Sprint 7,0.19,0.19,0.19
3,Sprint 9,0.2,0.2,0.2
4,Sprint 10,0.18,0.18,0.18


### Gaussian Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

In [15]:
# Estimator the selector uses to score each candidate feature subset.
nb = GaussianNB()

# 'sfs_<k>'     -> fitted selector that keeps k features
# 'sfs_<k>_var' -> names of the k selected columns
forw_nb = {}

for i in [3, 5, 7, 9, 10]:
    selector = SequentialFeatureSelector(nb, n_features_to_select = i, direction = 'forward')
    selector.fit(X2, y2)
    forw_nb[f'sfs_{i}'] = selector
    forw_nb[f'sfs_{i}_var'] = X2.columns[selector.get_support()]
    # Restored: this print produces the output recorded below the cell.
    print(f'Las {i} variables seleccionadas son:', list(forw_nb[f'sfs_{i}_var']))

Las 3 variables seleccionadas son: ['popularity', 'danceability', 'loudness_db']
Las 5 variables seleccionadas son: ['popularity', 'danceability', 'valence', 'duration_min', 'loudness_db']
Las 7 variables seleccionadas son: ['popularity', 'danceability', 'speechiness', 'acousticness', 'valence', 'duration_min', 'loudness_db']
Las 9 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'valence', 'tempo', 'duration_min', 'loudness_db']
Las 10 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo', 'duration_min', 'loudness_db']


In [45]:
# Fit and score Gaussian Naive Bayes once per selected feature-subset size.
# NOTE(review): X2/y2 (used for selection) are every 15th row of X/y, so some
# of those rows land in the test split below - scores may be slightly optimistic.
nb = GaussianNB()

sprint_sizes = [3, 5, 7, 9, 10]
accuracy_forw_nb, recall_forw_nb, f1_forw_nb = [], [], []

for i in sprint_sizes:
    # Full dataset restricted to the columns selected for this size.
    X_i = X[list(forw_nb[f'sfs_{i}_var'])]

    X_train, X_test, y_train, y_test = train_test_split(X_i, y, random_state = 42)
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    accuracy_forw_nb.append(accuracy_score(y_test, y_pred))
    recall_forw_nb.append(recall_score(y_test, y_pred, average = 'macro'))
    f1_forw_nb.append(f1_score(y_test, y_pred, average = 'macro'))

# Labels are generated from the sizes so they cannot drift or pick up stray
# whitespace (the hand-typed list contained ' Sprint 7').
comparacion_forw_nb = pd.DataFrame({
    'Modelo': [f'Sprint {k}' for k in sprint_sizes],
    'Accuracy': accuracy_forw_nb,
    'Recall': recall_forw_nb,
    'F1': f1_forw_nb,
})

comparacion_forw_nb.round(2)

Unnamed: 0,Modelo,Accuracy,Recall,F1
0,Sprint 3,0.11,0.11,0.07
1,Sprint 5,0.15,0.15,0.11
2,Sprint 7,0.17,0.17,0.13
3,Sprint 9,0.18,0.19,0.14
4,Sprint 10,0.18,0.18,0.14


### Decision Tree Classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Estimator the selector uses to score each candidate feature subset.
# random_state is fixed so the selected features are the same on every run.
dtc = DecisionTreeClassifier(min_samples_split = 10, random_state = 42)

# 'sfs_<k>'     -> fitted selector that keeps k features
# 'sfs_<k>_var' -> names of the k selected columns
forw_dtc = {}

for i in [3, 5, 7, 9, 10]:
    selector = SequentialFeatureSelector(dtc, n_features_to_select = i, direction = 'forward')
    selector.fit(X2, y2)
    forw_dtc[f'sfs_{i}'] = selector
    forw_dtc[f'sfs_{i}_var'] = X2.columns[selector.get_support()]
    # Restored: this print produces the output recorded below the cell.
    print(f'Las {i} variables seleccionadas son:', list(forw_dtc[f'sfs_{i}_var']))

Las 3 variables seleccionadas son: ['popularity', 'danceability', 'acousticness']
Las 5 variables seleccionadas son: ['popularity', 'danceability', 'acousticness', 'duration_min', 'loudness_db']
Las 7 variables seleccionadas son: ['popularity', 'danceability', 'acousticness', 'instrumentalness', 'tempo', 'duration_min', 'loudness_db']
Las 9 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'tempo', 'duration_min', 'loudness_db']
Las 10 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo', 'duration_min', 'loudness_db']


In [46]:
# Fit and score the decision tree once per selected feature-subset size.
# random_state is fixed so the reported metrics are reproducible.
# NOTE(review): X2/y2 (used for selection) are every 15th row of X/y, so some
# of those rows land in the test split below - scores may be slightly optimistic.
dtc = DecisionTreeClassifier(min_samples_split = 10, random_state = 42)

sprint_sizes = [3, 5, 7, 9, 10]
accuracy_forw_dtc, recall_forw_dtc, f1_forw_dtc = [], [], []

for i in sprint_sizes:
    # Full dataset restricted to the columns selected for this size.
    X_i = X[list(forw_dtc[f'sfs_{i}_var'])]

    X_train, X_test, y_train, y_test = train_test_split(X_i, y, random_state = 42)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)

    accuracy_forw_dtc.append(accuracy_score(y_test, y_pred))
    recall_forw_dtc.append(recall_score(y_test, y_pred, average = 'macro'))
    f1_forw_dtc.append(f1_score(y_test, y_pred, average = 'macro'))

# Labels are generated from the sizes so they cannot drift or pick up stray
# whitespace (the hand-typed list contained ' Sprint 7').
comparacion_forw_dtc = pd.DataFrame({
    'Modelo': [f'Sprint {k}' for k in sprint_sizes],
    'Accuracy': accuracy_forw_dtc,
    'Recall': recall_forw_dtc,
    'F1': f1_forw_dtc,
})

comparacion_forw_dtc.round(2)

Unnamed: 0,Modelo,Accuracy,Recall,F1
0,Sprint 3,0.13,0.14,0.13
1,Sprint 5,0.18,0.18,0.18
2,Sprint 7,0.2,0.2,0.2
3,Sprint 9,0.22,0.22,0.22
4,Sprint 10,0.22,0.22,0.22


### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Estimator the selector uses to score each candidate feature subset.
# random_state is fixed so the selected features are the same on every run.
rfc = RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1, random_state = 42)

# 'sfs_<k>'     -> fitted selector that keeps k features
# 'sfs_<k>_var' -> names of the k selected columns
forw_rfc = {}

for i in [3, 5, 7, 9, 10]:
    selector = SequentialFeatureSelector(rfc, n_features_to_select = i, direction = 'forward')
    selector.fit(X2, y2)
    forw_rfc[f'sfs_{i}'] = selector
    forw_rfc[f'sfs_{i}_var'] = X2.columns[selector.get_support()]
    # Restored: this print produces the output recorded below the cell.
    print(f'Las {i} variables seleccionadas son:', list(forw_rfc[f'sfs_{i}_var']))

Las 3 variables seleccionadas son: ['popularity', 'acousticness', 'tempo']
Las 5 variables seleccionadas son: ['popularity', 'acousticness', 'instrumentalness', 'tempo', 'duration_min']
Las 7 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'acousticness', 'instrumentalness', 'tempo', 'duration_min']
Las 9 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'acousticness', 'instrumentalness', 'valence', 'tempo', 'duration_min', 'loudness_db']
Las 10 variables seleccionadas son: ['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo', 'duration_min', 'loudness_db']


In [47]:
# Fit and score the random forest once per selected feature-subset size.
# random_state is fixed so the reported metrics are reproducible.
# NOTE(review): X2/y2 (used for selection) are every 15th row of X/y, so some
# of those rows land in the test split below - scores may be slightly optimistic.
rfc = RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1, random_state = 42)

sprint_sizes = [3, 5, 7, 9, 10]
accuracy_forw_rfc, recall_forw_rfc, f1_forw_rfc = [], [], []

for i in sprint_sizes:
    # Full dataset restricted to the columns selected for this size.
    X_i = X[list(forw_rfc[f'sfs_{i}_var'])]

    X_train, X_test, y_train, y_test = train_test_split(X_i, y, random_state = 42)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)

    accuracy_forw_rfc.append(accuracy_score(y_test, y_pred))
    recall_forw_rfc.append(recall_score(y_test, y_pred, average = 'macro'))
    f1_forw_rfc.append(f1_score(y_test, y_pred, average = 'macro'))

# Labels are generated from the sizes so they cannot drift or pick up stray
# whitespace (the hand-typed list contained ' Sprint 7').
comparacion_forw_rfc = pd.DataFrame({
    'Modelo': [f'Sprint {k}' for k in sprint_sizes],
    'Accuracy': accuracy_forw_rfc,
    'Recall': recall_forw_rfc,
    'F1': f1_forw_rfc,
})

comparacion_forw_rfc.round(2)

Unnamed: 0,Modelo,Accuracy,Recall,F1
0,Sprint 3,0.13,0.13,0.08
1,Sprint 5,0.14,0.14,0.09
2,Sprint 7,0.16,0.16,0.11
3,Sprint 9,0.14,0.15,0.09
4,Sprint 10,0.15,0.15,0.1


### Ada Boost Classifier

Este algoritmo resulta extremadamente lento a la hora de realizar el Forward Feature Selector; es por ello que se tomaron los features determinados por Random Forest.

In [35]:
from sklearn.ensemble import AdaBoostClassifier

In [40]:
# Single AdaBoost run on the 9-feature subset.
# random_state is fixed so the reported metrics are reproducible.
abc = AdaBoostClassifier(random_state = 42)

# Use the columns chosen by the Random Forest selector, as the accompanying
# text states (the cell previously read them from the KNN selector `forw`).
X_9 = X[list(forw_rfc['sfs_9_var'])]
X_train, X_test, y_train, y_test = train_test_split(X_9, y, random_state = 42)

abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)

accuracy_abc = accuracy_score(y_test, y_pred)
recall_abc = recall_score(y_test, y_pred, average = 'macro')
f1_abc = f1_score(y_test, y_pred, average = 'macro')

In [41]:
# Fit and score AdaBoost once per feature-subset size, reusing the columns
# selected by the Random Forest selector (`forw_rfc`).
# random_state is fixed so the reported metrics are reproducible.
# NOTE(review): X2/y2 (used for selection) are every 15th row of X/y, so some
# of those rows land in the test split below - scores may be slightly optimistic.
abc = AdaBoostClassifier(random_state = 42)

sprint_sizes = [3, 5, 7, 9, 10]
accuracy_forw_abc, recall_forw_abc, f1_forw_abc = [], [], []

for i in sprint_sizes:
    # Full dataset restricted to the columns selected for this size.
    X_i = X[list(forw_rfc[f'sfs_{i}_var'])]

    X_train, X_test, y_train, y_test = train_test_split(X_i, y, random_state = 42)
    abc.fit(X_train, y_train)
    y_pred = abc.predict(X_test)

    accuracy_forw_abc.append(accuracy_score(y_test, y_pred))
    recall_forw_abc.append(recall_score(y_test, y_pred, average = 'macro'))
    f1_forw_abc.append(f1_score(y_test, y_pred, average = 'macro'))

# Labels are generated from the sizes so they cannot drift or pick up stray
# whitespace (the hand-typed list contained ' Sprint 7').
comparacion_forw_abc = pd.DataFrame({
    'Modelo': [f'Sprint {k}' for k in sprint_sizes],
    'Accuracy': accuracy_forw_abc,
    'Recall': recall_forw_abc,
    'F1': f1_forw_abc,
})

comparacion_forw_abc.round(2)