<a href="https://colab.research.google.com/github/JorgeSedek/tp3_orga_datos/blob/main/tp3_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
df_train = pd.read_parquet("/content/drive/MyDrive/orgaDatos/TP3 dataset music/train.parquet")

In [None]:
df_test = pd.read_parquet("/content/drive/MyDrive/orgaDatos/TP3 dataset music/test.parquet")

In [None]:
df_train.count()

track_name          31383
lyric               31380
genre               31383
language            24021
popularity          31383
artist              31383
a_genres            31383
a_songs             31383
a_popularity        31383
did                  7004
s-label              7004
acousticness        31383
danceability        31383
duration_ms         31383
energy              31383
instrumentalness    31383
key                 31383
liveness            31383
loudness            31383
mode                31383
speechiness         31383
tempo               31383
time_signature      31383
valence             31383
dtype: int64

In [None]:
def fix_generos(x):
    if x in ["Children’s Music"]:
        return "Children's Music"
    return x
 

df_train.genre = df_train.genre.map(lambda x:fix_generos(x))
df_test.genre = df_test.genre.map(lambda x:fix_generos(x))

In [None]:
df_train.drop(['track_name','a_genres','did','time_signature'], axis=1, inplace= True)
df_test.drop(['track_name','a_genres','did','time_signature'], axis=1, inplace= True)

In [None]:
idioma_mas_popular = df_train['language'].value_counts().reset_index()['index'][0]

In [None]:
df_train['language'] = df_train['language'].fillna(idioma_mas_popular)
df_test['language'] = df_test['language'].fillna(idioma_mas_popular)

In [None]:
df_train['s-label'] = df_train['s-label'].fillna(df_train['s-label'].mean())
df_test['s-label'] = df_test['s-label'].fillna(df_test['s-label'].mean())

In [None]:
df_train.count()

lyric               31380
genre               31383
language            31383
popularity          31383
artist              31383
a_songs             31383
a_popularity        31383
s-label             31383
acousticness        31383
danceability        31383
duration_ms         31383
energy              31383
instrumentalness    31383
key                 31383
liveness            31383
loudness            31383
mode                31383
speechiness         31383
tempo               31383
valence             31383
dtype: int64

# Separación de Test y Valid

Para separar el test y valid se utiliza la feature artist:

*   Se mezcla al azar la lista de artistas sin repetir
*   Se toma el 80% de esos artistas
*   se divide el set de entrenamiento:

    1.  las filas que tienen uno de los artistas dentro de la muestra del 80% pasan a ser el nuevo set de entrenamiento.
    2.  el resto pasa a ser el set de validación.

In [None]:
artistas_shuffled = shuffle(df_train['artist'].unique(), random_state = 8)
artistas_80 = artistas_shuffled[:8*(len(artistas_shuffled)//10)]
len(artistas_80) / len(artistas_shuffled)

0.7985074626865671

In [None]:
df_train_final = df_train[df_train['artist'].map(lambda x: x in artistas_80)]
df_valid_final = df_train[df_train['artist'].map(lambda x: not (x in artistas_80))]

df_train_final.drop(['artist'], axis=1, inplace= True)
df_valid_final.drop(['artist'], axis=1, inplace= True)
df_test.drop(['artist'], axis=1, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
y_train = df_train_final.genre
y_test = df_test.genre
x_train = df_train_final.drop("genre", axis = 1)
x_test = df_test.drop("genre", axis = 1)

In [None]:
y_valid = df_valid_final.genre
x_valid = df_valid_final.drop("genre", axis = 1)

In [None]:
sets_trabajo = [x_train, x_valid, x_test]

# Encoding para lyric

In [None]:
from nltk.corpus import stopwords

import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer

In [None]:
stopwordsEN = stopwords.words('english')

In [None]:
lemma = WordNetLemmatizer()

def lematizar(texto):
  texto_lematizado= ""
  for palabra in word_tokenize(texto):
      texto_lematizado = texto_lematizado + " " + lemma.lemmatize(palabra)
  return texto_lematizado

def filtrar(texto):
  caracteres_eliminar = "./,``,’?!#$%&/()]_-[{}0123456789²'"
  
  for caracter in caracteres_eliminar:
    texto = texto.replace(caracter,"")
    
  return texto

In [None]:
vectorizer = CountVectorizer(lowercase=True, stop_words=stopwordsEN)
textos = ""
for line in x_test.lyric:
    textos = textos + filtrar(lematizar(str(line)).lower().strip(',.!"#$%/(=?)¡[]{}"'))

In [None]:
matrix = vectorizer.fit_transform([textos])
matrix

<1x13755 sparse matrix of type '<class 'numpy.int64'>'
	with 13755 stored elements in Compressed Sparse Row format>

In [None]:
counts = pd.DataFrame(matrix.toarray(),
                      columns=vectorizer.get_feature_names())
counts



Unnamed: 0,aaaaaaaaaaaaaaaaooooh,aaaaaaaaaaah,aah,aahh,aba,abandon,abandoned,abba,aberration,abide,...,zonia,zoo,zooky,zoom,zoot,zu,zulu,zutto,às,ëround
0,1,3,8,10,1,13,6,2,1,6,...,1,4,1,3,2,6,3,8,2,2


In [None]:
palabras_freq = counts.T.sort_values(by=0, ascending=False).head(20)
palabras_freq2 = palabras_freq[0].reset_index().rename(columns={"index": "palabras"})
palabras_frecuentes = palabras_freq2['palabras'].tolist()
palabras_frecuentes

['nt',
 'like',
 'know',
 'love',
 'got',
 'na',
 'get',
 'go',
 'yeah',
 'oh',
 'time',
 'never',
 'want',
 'wa',
 'baby',
 'come',
 'let',
 'ca',
 'one',
 'cause']

In [None]:
for set_trabajo in sets_trabajo:
  for palabra in palabras_frecuentes:
    set_trabajo[str(palabra)] = set_trabajo['lyric'].map(lambda x: 1 if (str(palabra) in str(x)) else 0)

In [None]:
x_train = x_train.reset_index().drop(['lyric'],axis=1)
x_valid = x_valid.reset_index().drop(['lyric'],axis=1)
x_test = x_test.reset_index().drop(['lyric'],axis=1)

# One-hot encoding de variables categoricas

In [None]:
features_ohe = ['language','mode']

In [None]:
def cant_columns_num(x_set):
  cant_columns_num = 0
  for columna in x_set.columns:
    if isinstance(columna, int):
      cant_columns_num += 1
  return cant_columns_num

def renombrar(nombre_feature, x_set):
  for i in range(cant_columns_num(x_set)):
    x_set = x_set.rename(columns = {i: (nombre_feature + str(i))})
  return x_set

In [None]:
for nombre_feature in features_ohe:
  ohe = OneHotEncoder(drop='first', handle_unknown='ignore')
  encoded = ohe.fit_transform(x_train[[nombre_feature]]).todense().astype(int)
  encoded_valid = ohe.transform(x_valid[[nombre_feature]]).todense().astype(int)
  encoded_test = ohe.transform(x_test[[nombre_feature]]).todense().astype(int)

  x_train = x_train.reset_index().drop(['index', nombre_feature],axis=1).join(pd.DataFrame(encoded))
  x_valid = x_valid.reset_index().drop(['index', nombre_feature],axis=1).join(pd.DataFrame(encoded_valid))
  x_test = x_test.reset_index().drop(['index', nombre_feature],axis=1).join(pd.DataFrame(encoded_test))
  
  x_train = renombrar(nombre_feature,x_train)
  x_valid = renombrar(nombre_feature,x_valid)
  x_test = renombrar(nombre_feature,x_test)



In [None]:
x_train.columns

Index(['level_0', 'popularity', 'a_songs', 'a_popularity', 's-label',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'nt', 'like', 'know', 'love', 'got', 'na', 'get',
       'go', 'yeah', 'oh', 'time', 'never', 'want', 'wa', 'baby', 'come',
       'let', 'ca', 'one', 'cause', 'language0', 'language1', 'language2',
       'language3', 'language4', 'language5', 'language6', 'language7',
       'language8', 'language9', 'mode0'],
      dtype='object')

In [None]:
ohe_genre= OneHotEncoder(drop='first', handle_unknown='ignore')
encoded_genre = ohe_genre.fit_transform(pd.DataFrame(y_train)).todense().astype(int)
encoded_genre_valid = ohe_genre.transform(pd.DataFrame(y_valid)).todense().astype(int)
encoded_genre_test = ohe_genre.transform(pd.DataFrame(y_test)).todense().astype(int)

y_train2 = y_train.reset_index().drop(['index', 'genre'],axis=1).join(pd.DataFrame(encoded_genre))
y_valid2 = y_valid.reset_index().drop(['index', 'genre'],axis=1).join(pd.DataFrame(encoded_genre_valid))
y_test2 = y_test.reset_index().drop(['index', 'genre'],axis=1).join(pd.DataFrame(encoded_genre_test))

# Mean Encoding

In [None]:
key_mode = x_train.loc[:,["mode0", 'key']].groupby(['key']).mean().sort_values(by="mode0",ascending=False).reset_index()
key_mode

Unnamed: 0,key,mode0
0,B,0.512075
1,A#,0.489946
2,E,0.48731
3,F#,0.483851
4,F,0.419652
5,D#,0.331558
6,A,0.320377
7,C#,0.3
8,D,0.202594
9,G#,0.198119


In [None]:
set_features = [x_train, x_valid, x_test]

In [None]:
def mean_encoding(key):
  return key_mode[key_mode['key'].map(lambda x: x == key)]['mode0'].values[0]

In [None]:
for features in set_features:
  features['key'] = features['key'].map(lambda x: mean_encoding(x))

In [None]:
x_train['key']

0        0.512075
1        0.512075
2        0.512075
3        0.512075
4        0.512075
           ...   
25237    0.487310
25238    0.487310
25239    0.483851
25240    0.154978
25241    0.419652
Name: key, Length: 25242, dtype: float64

In [None]:
x_train = x_train.reset_index().drop(['index', 'level_0'],axis=1)
x_valid = x_valid.reset_index().drop(['index', 'level_0'],axis=1)
x_test = x_test.reset_index().drop(['index', 'level_0'],axis=1)

## XGBoost

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import top_k_accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [None]:
model2 = XGBClassifier(criterion = 'squared_error', random_state= 8)

In [None]:
model2.fit(x_train, y_train)

XGBClassifier(criterion='squared_error', objective='multi:softprob',
              random_state=8)

In [None]:
# define search space
space = {}
space['criterion'] = ['squared_error', 'friedman_mse']

In [None]:
search = RandomizedSearchCV(model2, space, n_iter=2, scoring='accuracy', n_jobs= 1, cv=2, random_state= 8, verbose =10)

In [None]:
# execute search
result = search.fit(x_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START criterion=squared_error.....................................
[CV 1/2; 1/2] END ......criterion=squared_error;, score=0.387 total time=  42.8s
[CV 2/2; 1/2] START criterion=squared_error.....................................
[CV 2/2; 1/2] END ......criterion=squared_error;, score=0.348 total time=  43.3s
[CV 1/2; 2/2] START criterion=friedman_mse......................................
[CV 1/2; 2/2] END .......criterion=friedman_mse;, score=0.387 total time=  43.1s
[CV 2/2; 2/2] START criterion=friedman_mse......................................
[CV 2/2; 2/2] END .......criterion=friedman_mse;, score=0.348 total time=  43.9s
Best Score: 0.3676412328658585
Best Hyperparameters: {'criterion': 'squared_error'}


In [None]:
top_k_accuracy_score(y_valid, model2.predict_proba(x_valid), k=2, labels = model2.classes_)

0.5145741735873636

In [None]:
model2.feature_importances_

array([0.07354306, 0.02746116, 0.03724257, 0.0056091 , 0.03999484,
       0.02328701, 0.02042293, 0.042092  , 0.02869448, 0.00297652,
       0.00297111, 0.02745401, 0.0456745 , 0.0060045 , 0.01251139,
       0.00958427, 0.03323746, 0.01414283, 0.02212904, 0.02822384,
       0.01241139, 0.05374084, 0.02314712, 0.02179286, 0.02190975,
       0.01008213, 0.00342545, 0.00492957, 0.01215132, 0.04237849,
       0.00678233, 0.01284242, 0.00975294, 0.00813112, 0.01470057,
       0.        , 0.01673773, 0.1284536 , 0.        , 0.        ,
       0.        , 0.01311692, 0.02348051, 0.02437901, 0.        ,
       0.03239731], dtype=float32)

El mejor modelo es xboost con 0.51457 de top-2 acuracy

In [None]:
top_k_accuracy_score(y_test, model2.predict_proba(x_test), k=2, labels = model2.classes_)

0.5016926201760324

El top-acuracy 2 para test es 0.50169

In [None]:
prediciones = model2.predict_proba(x_train)

In [None]:
x_train2 = x_train.copy()

In [None]:
i = 0
for genero in model2.classes_:
  x_train2[genero] = prediciones[:,i]
  i+= 1

In [None]:
x_train2

Unnamed: 0,popularity,a_songs,a_popularity,s-label,acousticness,danceability,duration_ms,energy,instrumentalness,key,...,Pop,R&B,Rap,Reggae,Reggaeton,Rock,Ska,Soul,Soundtrack,World
0,79,276.0,205.5,0.490309,0.2720,0.508,261640,0.720,0.000000,0.512075,...,0.553333,0.177568,0.008410,0.000717,0.000178,0.010977,0.000398,0.011000,0.000134,0.000196
1,80,276.0,205.5,0.490309,0.2720,0.508,261640,0.720,0.000000,0.512075,...,0.553333,0.177568,0.008410,0.000717,0.000178,0.010977,0.000398,0.011000,0.000134,0.000196
2,80,276.0,205.5,0.490309,0.2720,0.508,261640,0.720,0.000000,0.512075,...,0.553333,0.177568,0.008410,0.000717,0.000178,0.010977,0.000398,0.011000,0.000134,0.000196
3,71,276.0,205.5,0.490309,0.0296,0.412,319467,0.441,0.072600,0.512075,...,0.438401,0.284703,0.012876,0.000507,0.000476,0.005177,0.000578,0.012395,0.000174,0.000328
4,71,276.0,205.5,0.490309,0.0296,0.412,319467,0.441,0.072600,0.512075,...,0.438401,0.284703,0.012876,0.000507,0.000476,0.005177,0.000578,0.012395,0.000174,0.000328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25237,19,89.0,1.0,0.748000,0.6120,0.616,190733,0.822,0.000003,0.487310,...,0.000687,0.006196,0.001212,0.151739,0.001696,0.010597,0.163120,0.024430,0.000374,0.396969
25238,34,89.0,1.0,0.868000,0.8890,0.457,230200,0.369,0.011100,0.487310,...,0.000259,0.001079,0.000415,0.011510,0.002137,0.002107,0.004187,0.025857,0.000185,0.640679
25239,18,89.0,1.0,0.793000,0.7630,0.717,275640,0.566,0.812000,0.483851,...,0.000212,0.001937,0.000380,0.060163,0.004647,0.003574,0.007282,0.007125,0.000260,0.469107
25240,28,89.0,1.0,0.738000,0.8040,0.633,204373,0.553,0.000866,0.154978,...,0.000457,0.003131,0.000789,0.095290,0.009663,0.003000,0.024867,0.022123,0.000259,0.529096


In [None]:
x_train2.to_csv('predicciones_Xgboost.csv', index=False)