In [1]:
import os
import pandas as pd 
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics         import accuracy_score



In [2]:
### Curación de datos

In [3]:
df = pd.read_csv('df_comm_genres.csv')
df = df.drop(columns=["Unnamed: 0", 'liveness', 'id', "uri", "num_artists", "track_name", "artist_name", 'genero'])

df.most_common_genre = df.most_common_genre.astype('str')
generos = df.most_common_genre.values



In [4]:
### Encoding del género (most common)
le = LabelEncoder()
generos_encoded = le.fit_transform(generos)

data_final = df.copy()
data_final['encoded_genres'] = generos_encoded

data_final.rename(columns={'most_common_genre':'genero'}, inplace=True)
data_final


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,valence,tempo,duration_ms,time_signature,genero,encoded_genres
0,0.161,0.482,1,-11.210,0,0.0472,0.4860,0.409000,0.0926,161.092,233079,5,dance pop,17
1,0.230,0.434,10,-12.402,0,0.0551,0.5700,0.000000,0.0834,80.793,216905,5,dance pop,17
2,0.289,0.280,9,-15.335,1,0.0433,0.8740,0.004430,0.0391,124.835,215773,3,dance pop,17
3,0.499,0.648,7,-5.812,1,0.0331,0.7230,0.000000,0.4640,88.140,219107,4,latin,24
4,0.528,0.383,2,-11.170,1,0.0258,0.3760,0.000000,0.4600,89.089,328320,4,latin,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1750,0.690,0.519,4,-7.241,1,0.0246,0.3140,0.000021,0.5230,90.978,230053,4,argentine indie,2
1751,0.616,0.395,2,-6.017,1,0.0301,0.2890,0.000006,0.3980,120.758,230867,3,argentine indie,2
1752,0.432,0.456,4,-7.259,1,0.0290,0.1320,0.000205,0.2310,98.843,208579,3,argentine indie,2
1753,0.557,0.656,7,-6.915,1,0.0295,0.0953,0.000043,0.2590,134.982,204675,4,argentine indie,2


In [5]:
### Creación del dataset binario para cada género
path = 'data/'
generos_de_interes = ['argentine rock', 'latin', 'cuarteto', 'cumbia villera', 'argentine indie']
data_final.drop(columns={'encoded_genres'}, inplace=True)

if not os.path.isdir(f'{path}'):
    os.mkdir(f'{path}')

for genero in generos_de_interes:
    if not os.path.isdir(f'{path}{genero}'):
        os.mkdir(f'{path}{genero}')
    
    df_pos = data_final[data_final['genero']==genero]
    df_pos.genero = 1
    count = df_pos.shape[0]
    df_neg = data_final[data_final['genero']!=genero].sample(count)
    df_neg.genero = 0
    
    df_final_genre = pd.concat([df_pos, df_neg])
    
    df_train, df_test = train_test_split(df_final_genre, test_size=0.3, random_state=42)
    
    df_train.to_csv(f'{path}{genero}/{genero}_train.csv')
    df_test.to_csv(f'{path}{genero}/{genero}_test.csv')                 

In [8]:
dict_tree_grid = {'criterion'         : ['gini', 'entropy'],
                  'max_features'      : [None, 'auto'],
                  'min_samples_split' : [25, 50, 75, 100, 150, 200, 300, 500],
                  'ccp_alpha'         : [0, 0.5, 1]
                  }

##TRAINING

for genero in generos_de_interes:
    df_train = pd.read_csv(f'{path}{genero}/{genero}_train.csv')
    columns = df_train.columns.to_list()
    columns.remove('genero')
    
    X = df_train[columns]
    y = df_train['genero']
    print(f"Training {genero} model")
    
    model =      RandomForestClassifier(random_state=42)
    clf =        GridSearchCV(model, dict_tree_grid, scoring='accuracy', cv=3, n_jobs=-1)
    search =     clf.fit(X, y)
    best_tree =  search.best_estimator_

    with open(f'{path}{genero}/{genero}_model', 'wb') as f:
        pickle.dump(best_tree, f)

Training argentine rock model
Training latin model
Training cuarteto model
Training cumbia villera model
Training argentine indie model


In [9]:
### Testing...  el accuracy se calcula como promedio del accuracy de cada clasificador individual
count = len(generos_de_interes)
acc = 0
acc_total = 0
files = ['argentine rock', 'latin', 'cuarteto', 'cumbia villera', 'argentine indie']
for genero in generos_de_interes:
    tran = le.transform([genero])[0]

    df_test = pd.read_csv(f'{path}{genero}/{genero}_test.csv')
    columns = df_train.columns.to_list()
    columns.remove('genero')
    model = pickle.load(open(f'{path}{genero}/{genero}_model', 'rb'))
    X = df_test[columns]
    y = df_test['genero']
    print(f"\n---------Testing {genero} model---------")
    y_pred = model.predict(X)
    acc += accuracy_score(y, y_pred)
    print(f'{genero} model accuracy: {str(accuracy_score(y, y_pred))}')
acc_total = acc/count
print(f'Total accuracy: {acc_total}')


---------Testing argentine rock model---------
argentine rock model accuracy: 0.8708010335917312

---------Testing latin model---------
latin model accuracy: 0.8930817610062893

---------Testing cuarteto model---------
cuarteto model accuracy: 0.9571428571428572

---------Testing cumbia villera model---------
cumbia villera model accuracy: 0.8091603053435115

---------Testing argentine indie model---------
argentine indie model accuracy: 0.8543689320388349
Total accuracy: 0.8769109778246447
