# Práctica Final de NLP

# 3. Entrenamiento de los modelos

* Cargamos los datos.

In [48]:
x_train = np.load("x_train_NLP.npy", allow_pickle=True)
y_train = np.load("y_train_NLP.npy", allow_pickle=True)
x_test = np.load("x_test_NLP.npy", allow_pickle=True)
y_test = np.load("y_test_NLP.npy", allow_pickle=True)
x_val = np.load("x_val_NLP.npy", allow_pickle=True)
y_val = np.load("y_val_NLP.npy", allow_pickle=True)

* Extracción de características con TF-IDF

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(
    max_df=0.9,
    min_df=4,
    max_features=2000,
    strip_accents='ascii',
    ngram_range=(1, 1)
)
cv.fit(x_train)

Los parámetros ajustados son los siguientes:
* *max_df=0.95* : para aquellas palabras que aparezcan en más del 90% de los documentos, serán consideradas irrelevantes y eliminadas.
* *min_df=3* : para que una palabra sea incluida deberá aparecer en al menos 4 reviews.
* *max_features=2000* : número máximo de palabras del vocabulario, decido limitarlo un poco para evitar que la RAM se sature.
* *strip_accents='ascii'* : para eliminar acentos de las palabras.
* *ngram_range=(1, 1)* : decidimos no incluir n-gramas para evitar aumentar la dimensionalidad.

In [50]:
print(list(cv.vocabulary_.items())[:20])

[('wow', 1609), ('love', 847), ('sorry', 1337), ('circus', 223), ('life', 819), ('relate', 1170), ('person', 1027), ('nursing', 963), ('home', 684), ('surprise', 1412), ('end', 448), ('right', 1208), ('like', 822), ('happy', 648), ('ending', 449), ('writer', 1612), ('fabulous', 510), ('want', 1566), ('read', 1144), ('work', 1604)]


In [51]:
print(len(cv.vocabulary_))

1622


In [52]:
x_train = cv.transform(x_train)
x_val = cv.transform(x_val)
x_test = cv.transform(x_test)

* Guardamos los datos.

In [53]:
import numpy as np
np.save("x_train_NLP_TR.npy", x_train)
np.save("y_train_NLP_TR.npy", y_train)
np.save("x_test_NLP_TR.npy", x_test)
np.save("y_test_NLP_TR.npy", y_test)
np.save("x_val_NLP_TR.npy", x_val)
np.save("y_val_NLP_TR.npy", y_val)

## 3.a. Modelo Logistic Regression

* Primer buscamos los parámetros óptimos con GridSearch

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

model = LogisticRegression(max_iter=1000, solver='lbfgs')

params_grid = {
    'penalty':['l1', 'l2'],           # Tipo de regularización
    'C': [0.01, 0.1, 1, 10, 100],           # Inverso de la fuerza de regularización
    'class_weight': [None, 'balanced']}

grid_search = GridSearchCV(
    estimator = model,
    param_grid = params_grid,
    scoring = 'accuracy',
    cv = 5,
    verbose = 1
)

grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 67, in _check_solver
   

In [55]:
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor puntuación en validación cruzada:", grid_search.best_score_)

Mejores hiperparámetros: {'C': 10, 'class_weight': None, 'penalty': 'l2'}
Mejor puntuación en validación cruzada: 0.8985714285714284


* A continuación entrenamos el modelo con los parámetros obtenidos

In [56]:
best_model = grid_search.best_estimator_

# Predecir en el conjunto de validación
y_val_pred = best_model.predict(x_val)

# Evaluar
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy en validación:", accuracy_score(y_val, y_val_pred))
print("\nReporte de clasificación:\n", classification_report(y_val, y_val_pred))

Accuracy en validación: 0.8933333333333333

Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89        75
           1       0.89      0.89      0.89        75

    accuracy                           0.89       150
   macro avg       0.89      0.89      0.89       150
weighted avg       0.89      0.89      0.89       150



* Guardamos el modelo.

In [58]:
from joblib import dump

cache_dir = '/content/drive/My Drive/0. KEEPCODING/NLP/'
model_file = os.path.join(cache_dir, "ML_model.joblib")
dump(model, model_file)

['/content/drive/My Drive/0. KEEPCODING/NLP/ML_model.joblib']

## 3.b. Modelo Deep Learning con LSTM

Calculamos la longitud de las reviews para seleccionar los parámetros del modelo

In [59]:
import numpy as np

# Supongamos que tienes tus textos en una lista
df_reviews['text_length'] = df_reviews['tokens'].apply(len)

# Calcular el percentil deseado (e.g., 95%)
max_words = int(np.percentile(df_reviews['text_length'], 95))

print(f'El 95% de las reseñas tienen menos de {max_words} palabras.')

El 95% de las reseñas tienen menos de 3954 palabras.


* Entrenamos el modelo

In [60]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, GRU, SimpleRNN

vocabulary_size = len(cv.vocabulary_)
embedding_size = 100
max_words = 485

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())



None


In [61]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

* Ajustamos el modelo con los datos de train y validación

In [63]:
batch_size = 64
num_epochs = 20

model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          batch_size=batch_size, epochs=num_epochs)

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4s/step - accuracy: 0.5241 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6934
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4s/step - accuracy: 0.5372 - loss: 0.6923 - val_accuracy: 0.5000 - val_loss: 0.6935
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 4s/step - accuracy: 0.5041 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 4s/step - accuracy: 0.5135 - loss: 0.6930 - val_accuracy: 0.5000 - val_loss: 0.6934
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 4s/step - accuracy: 0.5037 - loss: 0.6933 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 4s/step - accuracy: 0.4651 - loss: 0.6935 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 7/20
[1m11/11[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e9f45e85030>

* Guardamos el modelo

In [64]:
cache_dir='/content/drive/My Drive/0. KEEPCODING/NLP/'
model_file = "DL_model.h5"
model.save(os.path.join(cache_dir, model_file))

