# Clasificador neuronal liviano
Entrenaremos una red neuronal pequeña (MLP) para distinguir reseñas positivas (`1`) y negativas (`0`) a partir del texto preprocesado disponible.

In [1]:
import pathlib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
DATA_PATH = pathlib.Path("..") / "data" / "processed" / "cleaned_sentiment_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,cleaned_review,sentiment,review_length
0,teenager martha moxley maggie grace move high ...,1,125
1,ok really like kris kristofferson usual easy g...,0,111
2,spoiler read think watching movie although wou...,0,147
3,hi people seen wonderful movie im sure thet wo...,1,36
4,recently bought dvd forgetting much hated movi...,0,61


In [3]:
texts = df["cleaned_review"].astype(str).values
labels = df["sentiment"].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
 )

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape

(39665, 5000)

In [4]:
X_train_dense = X_train_vec.toarray().astype(np.float32)
X_test_dense = X_test_vec.toarray().astype(np.float32)

mlp = MLPClassifier(
    hidden_layer_sizes=(64,),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    random_state=42,
    max_iter=30,
    verbose=True
 )
mlp.fit(X_train_dense, y_train)

Iteration 1, loss = 0.44395599
Iteration 2, loss = 0.25175763
Iteration 3, loss = 0.21950913
Iteration 4, loss = 0.20565465
Iteration 5, loss = 0.19764695
Iteration 6, loss = 0.19202587
Iteration 7, loss = 0.18809856
Iteration 8, loss = 0.18519127
Iteration 9, loss = 0.18231393
Iteration 10, loss = 0.17999150
Iteration 11, loss = 0.17819056
Iteration 12, loss = 0.17559048
Iteration 13, loss = 0.17344132
Iteration 14, loss = 0.17105182
Iteration 15, loss = 0.16807162
Iteration 16, loss = 0.16554279
Iteration 17, loss = 0.16197683
Iteration 18, loss = 0.15853645
Iteration 19, loss = 0.15478281
Iteration 20, loss = 0.15106374
Iteration 21, loss = 0.14602240
Iteration 22, loss = 0.14133245
Iteration 23, loss = 0.13557861
Iteration 24, loss = 0.12982583
Iteration 25, loss = 0.12381841
Iteration 26, loss = 0.11705749
Iteration 27, loss = 0.10994764
Iteration 28, loss = 0.10303381
Iteration 29, loss = 0.09611391
Iteration 30, loss = 0.08849636




0,1,2
,"hidden_layer_sizes  hidden_layer_sizes: array-like of shape(n_layers - 2,), default=(100,) The ith element represents the number of neurons in the ith hidden layer.","(64,)"
,"activation  activation: {'identity', 'logistic', 'tanh', 'relu'}, default='relu' Activation function for the hidden layer. - 'identity', no-op activation, useful to implement linear bottleneck,  returns f(x) = x - 'logistic', the logistic sigmoid function,  returns f(x) = 1 / (1 + exp(-x)). - 'tanh', the hyperbolic tan function,  returns f(x) = tanh(x). - 'relu', the rectified linear unit function,  returns f(x) = max(0, x)",'relu'
,"solver  solver: {'lbfgs', 'sgd', 'adam'}, default='adam' The solver for weight optimization. - 'lbfgs' is an optimizer in the family of quasi-Newton methods. - 'sgd' refers to stochastic gradient descent. - 'adam' refers to a stochastic gradient-based optimizer proposed  by Kingma, Diederik, and Jimmy Ba For a comparison between Adam optimizer and SGD, see :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`. Note: The default solver 'adam' works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, 'lbfgs' can converge faster and perform better.",'adam'
,"alpha  alpha: float, default=0.0001 Strength of the L2 regularization term. The L2 regularization term is divided by the sample size when added to the loss. For an example usage and visualization of varying regularization, see :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`.",0.0001
,"batch_size  batch_size: int, default='auto' Size of minibatches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use minibatch. When set to ""auto"", `batch_size=min(200, n_samples)`.",'auto'
,"learning_rate  learning_rate: {'constant', 'invscaling', 'adaptive'}, default='constant' Learning rate schedule for weight updates. - 'constant' is a constant learning rate given by  'learning_rate_init'. - 'invscaling' gradually decreases the learning rate at each  time step 't' using an inverse scaling exponent of 'power_t'.  effective_learning_rate = learning_rate_init / pow(t, power_t) - 'adaptive' keeps the learning rate constant to  'learning_rate_init' as long as training loss keeps decreasing.  Each time two consecutive epochs fail to decrease training loss by at  least tol, or fail to increase validation score by at least tol if  'early_stopping' is on, the current learning rate is divided by 5. Only used when ``solver='sgd'``.",'constant'
,"learning_rate_init  learning_rate_init: float, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'.",0.001
,"power_t  power_t: float, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'.",0.5
,"max_iter  max_iter: int, default=200 Maximum number of iterations. The solver iterates until convergence (determined by 'tol') or this number of iterations. For stochastic solvers ('sgd', 'adam'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.",30
,"shuffle  shuffle: bool, default=True Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'.",True


In [5]:
y_pred = mlp.predict(X_test_dense)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")
print(classification_report(y_test, y_pred, digits=3))

Accuracy: 0.866
              precision    recall  f1-score   support

           0      0.859     0.876     0.867      4940
           1      0.874     0.858     0.866      4977

    accuracy                          0.866      9917
   macro avg      0.867     0.867     0.866      9917
weighted avg      0.867     0.866     0.866      9917



In [6]:
def predict_sentiment(review: str) -> tuple[int, float]:
    vec = vectorizer.transform([review]).toarray().astype(np.float32)
    pos_prob = mlp.predict_proba(vec)[0, 1]
    label = int(pos_prob >= 0.5)
    return label, pos_prob

example_review = "really enjoyed the pacing and performances"
predict_sentiment(example_review)

(1, np.float32(0.8323484))

In [9]:
# Guardar el modelo y el vectorizador
import joblib
MODEL_PATH = pathlib.Path("..") / "models" / "sentiment_mlp_model.joblib"
joblib.dump(mlp, MODEL_PATH.with_suffix(".joblib"))
VECTORIZER_PATH = pathlib.Path("..") / "models" / "tfidf_vectorizer.joblib"
joblib.dump(vectorizer, VECTORIZER_PATH.with_suffix(".joblib"))


['../models/tfidf_vectorizer.joblib']