# Ejercicio 1

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import glob
import os

plt.rcParams['figure.figsize'] = (12, 6)
plt.style.use('seaborn')

In [2]:
dir_actual = os.getcwd()
archivos_dump = list(map(lambda x: f'{dir_actual}\{x}', glob.glob('input/dump/*.csv')))

dfs = (pd.read_csv(f) for f in archivos_dump)
df = pd.concat(dfs).drop(columns='Unnamed: 0')
df.columns = ['artist', 'genre', 'song_name', 'lyrics']

# Ejercicio 2

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english', max_features=5000)
vectorizer_fit = vectorizer.fit_transform(df['lyrics'])

words = vectorizer.get_feature_names_out()
freqs = vectorizer_fit.toarray().sum(axis=0)

print(f'Las 5000 palabras mas frecuentes en la base de datos son: {words}')

Las 5000 palabras mas frecuentes en la base de datos son: ['000' '10' '100' ... 'zoo' 'zoom' 'zulu']


# Ejercicio 3

In [4]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [5]:
n_components = [5, 10, 15]
learning_decay = [0.7, 0.5]

grid = {
    'n_components': n_components,
    'learning_decay': learning_decay
}

modelo_grilla = GridSearchCV(LatentDirichletAllocation(), param_grid=grid, verbose=5, cv=5)

In [6]:
X = CountVectorizer().fit_transform(df['lyrics'])
y = df['genre']

In [7]:
modelo_grilla.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END learning_decay=0.7, n_components=5;, score=-3751249.138 total time=  24.7s
[CV 2/5] END learning_decay=0.7, n_components=5;, score=-6299729.669 total time=  20.8s
[CV 3/5] END learning_decay=0.7, n_components=5;, score=-4487252.411 total time=  27.1s
[CV 4/5] END learning_decay=0.7, n_components=5;, score=-4031393.216 total time=  24.7s
[CV 5/5] END learning_decay=0.7, n_components=5;, score=-2625375.751 total time=  29.1s
[CV 1/5] END learning_decay=0.7, n_components=10;, score=-3838785.645 total time= 1.2min
[CV 2/5] END learning_decay=0.7, n_components=10;, score=-6440895.379 total time=  27.9s
[CV 3/5] END learning_decay=0.7, n_components=10;, score=-4559192.805 total time=  26.5s
[CV 4/5] END learning_decay=0.7, n_components=10;, score=-4120974.446 total time=  26.5s
[CV 5/5] END learning_decay=0.7, n_components=10;, score=-2676440.389 total time= 1.4min
[CV 1/5] END learning_decay=0.7, n_components=15;, scor

GridSearchCV(cv=5, estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.7, 0.5],
                         'n_components': [5, 10, 15]},
             verbose=5)

In [9]:
import pickle

pickle.dump(modelo_grilla, open('modelo.sav', 'wb'))

In [11]:
modelo_grilla_2 = GridSearchCV(LatentDirichletAllocation(), param_grid=grid, verbose=5, cv=5)
X = CountVectorizer(stop_words='english', max_features=5000).fit_transform(df['lyrics'])
y = df['genre']
modelo_grilla_2.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END learning_decay=0.7, n_components=5;, score=-1615459.244 total time=  18.8s
[CV 2/5] END learning_decay=0.7, n_components=5;, score=-2789973.387 total time=  16.7s
[CV 3/5] END learning_decay=0.7, n_components=5;, score=-2009414.504 total time=  18.6s
[CV 4/5] END learning_decay=0.7, n_components=5;, score=-1779393.654 total time=  58.3s
[CV 5/5] END learning_decay=0.7, n_components=5;, score=-1091020.225 total time= 1.0min
[CV 1/5] END learning_decay=0.7, n_components=10;, score=-1648282.244 total time= 1.0min
[CV 2/5] END learning_decay=0.7, n_components=10;, score=-2816453.863 total time=  53.2s
[CV 3/5] END learning_decay=0.7, n_components=10;, score=-2033092.891 total time= 1.0min
[CV 4/5] END learning_decay=0.7, n_components=10;, score=-1803781.249 total time= 1.0min
[CV 5/5] END learning_decay=0.7, n_components=10;, score=-1118936.949 total time= 1.1min
[CV 1/5] END learning_decay=0.7, n_components=15;, scor

GridSearchCV(cv=5, estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.7, 0.5],
                         'n_components': [5, 10, 15]},
             verbose=5)

In [None]:
pickle.dump(modelo_grilla_2, open('modelo_grilla_2.sav', 'wb'))

In [13]:
print(modelo_grilla.best_estimator_)

LatentDirichletAllocation(learning_decay=0.5, n_components=5)


In [14]:
print(modelo_grilla_2.best_estimator_)

LatentDirichletAllocation(learning_decay=0.5, n_components=5)
