In [5]:
import pandas as pd
import streamlit as st
# Load the dataset
dataset = pd.read_csv('Data/dataset_reviews_cleaned.csv')

# Display the first few rows of the dataset to understand its structure
dataset


Unnamed: 0.1,Unnamed: 0,Contenido,Valoración,Recomendado_binario
0,0,2 marzo so bad,No recomendado,0
1,1,10 febrero actualmente recomiendo juego contab...,No recomendado,0
2,2,9 febrero increíblemente gracioso ver cómo cdp...,No recomendado,0
3,3,the world in this game is extremely static the...,No recomendado,0
4,4,zero replayability i finished this game in abo...,No recomendado,0
...,...,...,...,...
19995,19995,si,Recomendado,1
19996,19996,cojonudo,Recomendado,1
19997,19997,reostia historia guapisima graficos impresiona...,Recomendado,1
19998,19998,basicamente sublime obra maestra,Recomendado,1


In [22]:
# Check for missing values
missing_values = dataset.isnull().sum()

# Basic statistics of the dataset
basic_stats = dataset.describe()

missing_values, basic_stats

(Unnamed: 0               0
 Contenido              288
 Valoración               0
 Recomendado_binario      0
 dtype: int64,
          Unnamed: 0  Recomendado_binario
 count  20000.000000         20000.000000
 mean    9999.500000             0.500000
 std     5773.647028             0.500013
 min        0.000000             0.000000
 25%     4999.750000             0.000000
 50%     9999.500000             0.500000
 75%    14999.250000             1.000000
 max    19999.000000             1.000000)

In [23]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Descargar recursos necesarios para nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Eliminar registros con valores nulos en `Contenido`
dataset = dataset.dropna(subset=['Contenido'])

# Convertir el texto a minúsculas
dataset['Contenido'] = dataset['Contenido'].str.lower()

# Eliminar caracteres especiales, puntuación y dígitos
dataset['Contenido'] = dataset['Contenido'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

# Tokenización y lematización
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('spanish'))

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

dataset['Contenido'] = dataset['Contenido'].apply(preprocess_text)

dataset.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\buque\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\buque\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\buque\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Contenido'] = dataset['Contenido'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0.1,Unnamed: 0,Contenido,Valoración,Recomendado_binario
0,0,marzo so bad,No recomendado,0
1,1,febrero actualmente recomiendo juego contaba r...,No recomendado,0
2,2,febrero increblemente gracioso ver cmo cdpr de...,No recomendado,0
3,3,the world in this game is extremely static the...,No recomendado,0
4,4,zero replayability i finished this game in abo...,No recomendado,0


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Vectorización del texto usando TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(dataset['Contenido']).toarray()
y = dataset['Recomendado_binario']


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# # Guardar los datos vectorizados y divididos
# import numpy as np

# np.save('X_train.npy', X_train)
# np.save('X_test.npy', X_test)
# np.save('y_train.npy', y_train)
# np.save('y_test.npy', y_test)

In [12]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# # Cargar los datos divididos
# X_train = np.load('X_train.npy')
# X_test = np.load('X_test.npy')
# y_train = np.load('y_train.npy')
# y_test = np.load('y_test.npy')

In [27]:
# Entrenamiento de modelos
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

In [14]:
# for model_name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f"Results for {model_name}:")
#     print(classification_report(y_test, y_pred))
#     print(confusion_matrix(y_test, y_pred))
#     print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")

In [15]:
# # Evaluar el rendimiento en el conjunto de entrenamiento
# for model_name, model in models.items():
#     y_train_pred = model.predict(X_train)
#     print(f"Training Results for {model_name}:")
#     print(classification_report(y_train, y_train_pred))
#     print(confusion_matrix(y_train, y_train_pred))
#     print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred)}\n")


In [28]:
from sklearn.model_selection import GridSearchCV

# # Ajuste de Hiperparámetros para SVM
# svm_params = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
# svm_grid = GridSearchCV(SVC(), svm_params, cv=5, scoring='accuracy')
# svm_grid.fit(X_train, y_train)
# best_svm = svm_grid.best_estimator_

# Ajuste de Hiperparámetros para Random Forest
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

# Evaluación de los mejores modelos
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # "Best SVM": best_svm,
    "Best Random Forest": best_rf
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Results for {model_name}:")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")

    # Evaluación en el conjunto de entrenamiento
    y_train_pred = model.predict(X_train)
    print(f"Training Results for {model_name}:")
    print(classification_report(y_train, y_train_pred))
    print(confusion_matrix(y_train, y_train_pred))
    print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred)}\n")


In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
 # Validación cruzada
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores for {model_name}: {cv_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores)}\n")


Cross-Validation Scores for Best Random Forest: [0.85859226 0.84686113 0.85447051 0.85890932 0.85981605]
Mean Cross-Validation Score: 0.8557298551710975



In [18]:
best_rf = rf_grid.best_estimator_

In [19]:
import pickle
best_model = best_rf if np.mean(cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')) > np.mean(cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')) else best_rf

with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)



In [20]:
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)