### LIBRERÍAS

In [1]:
import pandas as pd 
import numpy as np 

# Procesamiento del lenguaje
import nltk

# Procesamiento datos
import ast

# OVERSAMPLING
from imblearn.over_sampling import SMOTE 

# Normalizacion
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Clasificadores
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Metricas para Clasificadores
from collections import Counter

from sklearn.preprocessing import label_binarize

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import jaccard_score

# GridSearch
from sklearn.model_selection import GridSearchCV

# Guradar archivos y abrirlos
import pickle

# Escalador
from sklearn.preprocessing import StandardScaler

# PCA
from sklearn.decomposition import PCA

# Rede Neuronales
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Guardar modelo entrenado de red neuronal
from keras.models import save_model

# Guardar scaler y PCA
import joblib

# Cargar un modelo entrenado y guardado
from keras.models import load_model




In [2]:
# Para descargar paquetes

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('porter_test')
nltk.download('english_grammars')
nltk.download('vader_lexicon')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\enaat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\enaat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package porter_test to
[nltk_data]     C:\Users\enaat\AppData\Roaming\nltk_data...
[nltk_data]   Package porter_test is already up-to-date!
[nltk_data] Error loading english_grammars: Package 'english_grammars'
[nltk_data]     not found in index
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\enaat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\enaat\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\enaat\AppData\Roaming\nl

True

In [3]:
df = pd.read_csv("Data/Restaurant_reviews.csv")

In [4]:
df = df[["Restaurant", "Review", "Rating"]].copy()

### PROCESAMIENTO

In [5]:
df.duplicated().sum()

260

In [6]:
df = df.drop_duplicates()

In [7]:
df.isna().sum()

Restaurant    0
Review        8
Rating        2
dtype: int64

In [8]:
df = df.dropna()

### TOKENIZANDO Y ELIMINANDO STOPSWORDS

In [9]:
# Recolectando stopwords del inglés
stopwords = nltk.corpus.stopwords.words("english")

for indice in df.index:
    # Recogiendo el texto de la review
    texto = df["Review"][indice]
    # Tokenizandolo
    tokens = nltk.word_tokenize(text = texto , language = "english")
    tokens = [token.lower( ) for token in tokens]

    tokens_limpios = list() 

    for token in tokens: 
        # Si el token es una stopwords no lo añadimos a tokens_limpios
        # Si el token no es una stopwords verificamos que su longitud sea mayor que 2
        if token not in stopwords: 
            if len(token) > 2: 
                tokens_limpios.append(token)

    # Utiliza el método 'at' para agregar el valor al indice correspondiente
    # Usamos set para no repetir palabras
    df.at[indice, "Tokens"] = str(set(tokens_limpios))
    
# Eliminamos llaves creadas
df["Tokens"] = df["Tokens"].str.strip("{}")

df.drop(["Restaurant", "Review"], axis=1, inplace=True)

### DUMMIES MANUAL A LA TOKENIZACIÓN

In [20]:
# Función para fusionar conjuntos
def fusionar_conjuntos(cadena, conjunto_actual):
    try:
        # Utilizar ast.literal_eval para convertir la cadena en una lista
        lista = ast.literal_eval(cadena)
        # Fusionar la lista con el conjunto actual
        conjunto_actual.update(lista)
        return conjunto_actual
    
    except (SyntaxError, ValueError):
        # Manejar errores si la cadena no es válida
        return conjunto_actual

In [11]:
array_original = df.Tokens.values
# Inicializar un conjunto vacío
conjunto_total = set()
# Aplicar la función de fusión a cada elemento del array
conjunto_total = np.vectorize(fusionar_conjuntos, otypes=[set])(array_original, conjunto_total)
# el conjunto total resultante
conjunto_total = conjunto_total[0]

In [12]:
# Crear columnas para cada palabra y rellenarlas con 1 y 0 si está presente en la columna Tokens
for palabra in conjunto_total:
    df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)

  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[palabra] = df['Tokens'].apply(lambda x: 1 if palabra in x else 0)
  df[p

In [None]:
df.drop("Tokens", axis=1, inplace=True)

df = df[df["Rating"] != "Like"]

df.reset_index(drop=True, inplace=True)

df["Rating"] = df["Rating"].astype(float)
df["Rating"] = df["Rating"].astype(int)

In [19]:
# Guardado en formato feather para más rapidez y menos carga
df.to_feather('data/df_reviews_comprimido.feather')

In [3]:
df = pd.read_feather('data/df_reviews_comprimido.feather')

### SEPARACIÓN TRAIN Y TEST

In [4]:
X = df.drop("Rating", axis=1)
y = df["Rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

print(f"Conjunto de Train: {X_train.shape, y_train.shape}")
print(f"Conjunto de Test: {X_test.shape, y_test.shape}")

Conjunto de Train: ((7784, 17750), (7784,))
Conjunto de Test: ((1947, 17750), (1947,))


### BÚSQUEDA DEL MEJOR MODELO

In [23]:
modelos = [DecisionTreeClassifier(random_state=42),RandomForestClassifier(random_state=42), AdaBoostClassifier(random_state=42), GradientBoostingClassifier(random_state=42)]

df_metricas_clasificador = list()

for model in modelos:
    
    print(str(model))
    
    model.fit(X_train, y_train)
    
    yhat = model.predict(X_test)
    
    acc = accuracy_score(y_test, yhat)
    rec = recall_score(y_test, yhat, average = "macro")
    f1s = f1_score(y_test, yhat, average = "macro")
     # Tiene sentido cuando es clasificacion binaria
    mat = confusion_matrix(y_test, yhat)
    params = model.get_params()

    y_test_bin = label_binarize(y_test, classes=model.classes_)
    yhat_bin = label_binarize(yhat, classes=model.classes_)
    roc = roc_auc_score(y_test_bin,yhat_bin,multi_class='ovr')
    
    df_metricas_clasificador.append([str(model), model, acc, rec, f1s, mat, roc, params])

DecisionTreeClassifier(random_state=42)
RandomForestClassifier(random_state=42)
AdaBoostClassifier(random_state=42)
GradientBoostingClassifier(random_state=42)


In [24]:
df_metricas_clasificador = pd.DataFrame(data = df_metricas_clasificador, columns = ["name", "model", "accuracy", "recall", "f1_score", "cm", "roc_auc_score", "params"])

df_metricas_clasificador.sort_values(by = "recall", ascending = False)

Unnamed: 0,name,model,accuracy,recall,f1_score,cm,roc_auc_score,params
3,GradientBoostingClassifier(random_state=42),([DecisionTreeRegressor(criterion='friedman_ms...,0.569594,0.445809,0.440828,"[[254, 10, 4, 24, 49], [50, 5, 19, 31, 27], [2...",0.661799,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
1,RandomForestClassifier(random_state=42),"(DecisionTreeClassifier(max_features='sqrt', r...",0.576271,0.437829,0.41207,"[[274, 4, 3, 27, 33], [57, 4, 11, 37, 23], [50...",0.658453,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
2,AdaBoostClassifier(random_state=42),"(DecisionTreeClassifier(max_depth=1, random_st...",0.541346,0.431093,0.424149,"[[253, 14, 18, 7, 49], [55, 4, 24, 15, 34], [3...",0.651334,"{'algorithm': 'SAMME.R', 'base_estimator': 'de..."
0,DecisionTreeClassifier(random_state=42),DecisionTreeClassifier(random_state=42),0.485362,0.400609,0.401535,"[[209, 34, 42, 23, 33], [42, 15, 25, 29, 21], ...",0.630684,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit..."


In [25]:
df_metricas_clasificador.to_csv("data/df_metricas.csv", index = False)

In [None]:
model = RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=None, min_samples_split=10, min_samples_leaf=1, max_features="sqrt", random_state=42)
# Entrenar el modelo
model.fit(X_train, y_train)
# Hacer predicciones en el conjunto de prueba
yhat = model.predict(X_test)

# Calcular la métricas en este caso
metrica = {"Jaccard Index" : jaccard_score(y_test, yhat, average = "macro"),
            "Accuracy" : accuracy_score(y_test, yhat),
            "Recall" : recall_score(y_test, yhat, average = "macro"),
            "F1-score" : f1_score(y_test, yhat, average = "macro")}

In [53]:
metrica

{'Jaccard Index': 0.2904449337318322,
 'Accuracy': 0.5742167437082691,
 'Recall': 0.4294045463662116,
 'F1-score': 0.3956195663425309}

In [None]:
# Guardando modelo ganador randon forest
with open('data/random_entrenado.pkl', 'wb') as archivo:
    pickle.dump(model, archivo)

### BUSQUEDA DE MEJORES PARÁMETROS

In [24]:
# No normalizamos porque ya están escalados los datos
# Cogemos una muestra de los datos para buscar parámetros
X_grid_train = np.array(X_train)[100:200]
y_grid_train = np.array(y_train)[100:200].reshape(-1,1).flatten()
X_grid_test = np.array(X_test)[100:200]
y_grid_test =  np.array(y_test)[100:200].reshape(-1,1).flatten()

In [None]:
# Definir el modelo RandomForestClassifier
model = GradientBoostingClassifier(random_state=42)

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'n_estimators': [100, 120, 140],
    'max_depth': [None, 3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Inicializar GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')

# Ajustar el modelo a los datos
grid_search.fit(X_grid_train, y_grid_train)

# Obtener los mejores hiperparámetros
best_params = grid_search.best_params_
print("Mejores hiperparámetros:", best_params)

# Obtener la métrica de validación del mejor modelo
best_score = grid_search.best_score_
print("Métrica de validación del mejor modelo:", best_score)

# Obtener el mejor modelo
best_model = grid_search.best_estimator_

# Evaluar el mejor modelo en el conjunto de prueba
accuracy = best_model.score(X_test, y_test)
print("Exactitud del mejor modelo en el conjunto de prueba:", accuracy)

In [68]:
model = GradientBoostingClassifier(n_estimators = 100, max_depth= None, min_samples_split = 2, min_samples_leaf= 1, max_features = "sqrt", random_state=42)
model.fit(X_train, y_train)
yhat = model.predict(X_test)

In [69]:
# Calcular la métricas en este caso
metrica = {"Jaccard Index" : jaccard_score(y_test, yhat, average = "macro"),
            "Accuracy" : accuracy_score(y_test, yhat),
            "Recall" : recall_score(y_test, yhat, average = "macro"),
            "F1-score" : f1_score(y_test, yhat, average = "macro")}

In [70]:
metrica

{'Jaccard Index': 0.3033896590329511,
 'Accuracy': 0.578839239856189,
 'Recall': 0.4389142839404654,
 'F1-score': 0.41207362433909644}

In [71]:
with open('data/gradient_entrenado.pkl', 'wb') as archivo:
    pickle.dump(model, archivo)

### EJEMPLO REAL

In [72]:
with open('data/gradient_entrenado.pkl', 'rb') as archivo:
    modelo_cargado = pickle.load(archivo)

In [73]:
# Review con un rating de 3
data = {"Review": "Came for lunch with my sister. We loved our Thai-style mains which were amazing with lots of flavour, very impressive for a vegetarian restaurant. But the service was below average and the chips were too terrible to finish. When we arrived at 1.40, we had to wait 20 minutes while they got our table ready. OK, so we didn't have a reservation, but the restaurant was only half full. There was no reason to make us wait at all. We ordered the chips as a side dish and they looked delicious. But, when we tasted them, they were overcooked and swimming in oil so we left most of them. We expected a lot more for $10! When the waiter asked if everything was ok, we said we really didn't like the chips and he said 'That's funny, I love them' and that was it. He didn't offer us anything else or take them off our bill. Also, when we didn't leave a tip, he looked annoyed. I was really excited about visiting Vega, and the mains were just fantastic, but the rest of the experience was really disappointing."}

In [74]:
# Procesamiento de la review
df_ejemplo = pd.DataFrame(data, index=[0])

stopwords = nltk.corpus.stopwords.words("english")

for indice in df_ejemplo.index:
    texto = df_ejemplo["Review"][indice]

    tokens = nltk.word_tokenize(text = texto , language = "english")

    tokens_limpios = list() 

    tokens = [token.lower( ) for token in tokens]

    for token in tokens: 
        if token not in stopwords: 
            if len(token) > 2: 
                tokens_limpios.append(token)
    # Utiliza el método 'at' para agregar el valor
    df_ejemplo.at[indice, "Tokens"] = str(set(tokens_limpios))

df_ejemplo["Tokens"] = df_ejemplo["Tokens"].str.strip("{}")

array_original = df_ejemplo.Tokens.values
# Inicializar un conjunto vacío
conjunto_total = set()
# Aplicar la función de fusión a cada elemento del array
conjunto_total = np.vectorize(fusionar_conjuntos, otypes=[set])(array_original, conjunto_total)
# el conjunto total resultante
conjunto_total = conjunto_total[0]

# Crear columnas para cada clave del conjunto
for palabra in conjunto_total:
    columna_comida = f"{palabra}"
    df_ejemplo[palabra] = df_ejemplo['Tokens'].apply(lambda x: 1 if palabra in x else 0)

df_ejemplo.drop(["Review", "Tokens"], axis=1, inplace=True)

# Buscando columnas identicas a nuestro modelo
conindicencias = []

for col in df.columns:
    for columna in df_ejemplo.columns:
        if col == columna:
            conindicencias.append(col)

df_ejemplo = df_ejemplo[conindicencias]

df_prueba = pd.DataFrame(columns=df.columns)

df_prueba = pd.concat([df_prueba, df_ejemplo])
df_prueba = df_prueba.fillna(0)
df_prueba.drop("Rating", axis=1, inplace=True)

In [75]:
# Prediccion
rating = modelo_cargado.predict(df_prueba)
print(rating)

[1]


### RED NEURONAL

In [12]:
# Estandarizar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Aplicar PCA para reducir a 10 componentes principales
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Convertir las etiquetas a one-hot encoding
y_train_encoded = to_categorical(y_train - 1, num_classes=5)  # Restamos 1 para que las etiquetas vayan de 0 a 4
y_test_encoded = to_categorical(y_test - 1, num_classes=5)

# Construir el modelo de red neuronal
model = Sequential()
model.add(Dense(128, input_dim=10, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(5, activation='softmax'))  # 5 neuronas para las 5 clases
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Entrenar el modelo
history = model.fit(X_train_pca, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Evaluar el modelo en el conjunto de prueba
accuracy = model.evaluate(X_test_pca, y_test_encoded)[1]
print(f'Accuracy on test set: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on test set: 0.4273240864276886


In [14]:
# Guardar el modelo en formato HDF5
model.save("data/modelo_red_neuronal_pca.h5")

  saving_api.save_model(


In [17]:
# Guardar el scaler y el pca
joblib.dump(scaler, 'data/scaler.joblib')
joblib.dump(pca, 'data/pca.joblib')

['data/pca.joblib']

### EJEMPLO REAL

In [None]:
# Cargar el scaler y el pca
scaler = joblib.load('data/scaler.joblib')
pca = joblib.load('data/pca.joblib')

# Cargar el modelo desde el archivo guardado
modelo = load_model("data/modelo_red_neuronal_pca.h5")

In [None]:
# Review cuyo rating es un 3
data = {"Review": "Came for lunch with my sister. We loved our Thai-style mains which were amazing with lots of flavour, very impressive for a vegetarian restaurant. But the service was below average and the chips were too terrible to finish. When we arrived at 1.40, we had to wait 20 minutes while they got our table ready. OK, so we didn't have a reservation, but the restaurant was only half full. There was no reason to make us wait at all. We ordered the chips as a side dish and they looked delicious. But, when we tasted them, they were overcooked and swimming in oil so we left most of them. We expected a lot more for $10! When the waiter asked if everything was ok, we said we really didn't like the chips and he said 'That's funny, I love them' and that was it. He didn't offer us anything else or take them off our bill. Also, when we didn't leave a tip, he looked annoyed. I was really excited about visiting Vega, and the mains were just fantastic, but the rest of the experience was really disappointing."}

In [21]:
# Procesamiento de la review
df_ejemplo = pd.DataFrame(data, index=[0])

stopwords = nltk.corpus.stopwords.words("english")

for indice in df_ejemplo.index:
    texto = df_ejemplo["Review"][indice]

    tokens = nltk.word_tokenize(text = texto , language = "english")

    tokens_limpios = list() 

    tokens = [token.lower( ) for token in tokens]

    for token in tokens: 
        if token not in stopwords: 
            if len(token) > 2: 
                tokens_limpios.append(token)
    # Utiliza el método 'at' para agregar el valor
    df_ejemplo.at[indice, "Tokens"] = str(set(tokens_limpios))

df_ejemplo["Tokens"] = df_ejemplo["Tokens"].str.strip("{}")

array_original = df_ejemplo.Tokens.values
# Inicializar un conjunto vacío
conjunto_total = set()
# Aplicar la función de fusión a cada elemento del array
conjunto_total = np.vectorize(fusionar_conjuntos, otypes=[set])(array_original, conjunto_total)
# el conjunto total resultante
conjunto_total = conjunto_total[0]

# Crear columnas para cada clave del conjunto
for palabra in conjunto_total:
    columna_comida = f"{palabra}"
    df_ejemplo[palabra] = df_ejemplo['Tokens'].apply(lambda x: 1 if palabra in x else 0)

df_ejemplo.drop(["Review", "Tokens"], axis=1, inplace=True)

# Buscando columnas identicas a nuestro modelo
conindicencias = []

for col in df.columns:
    for columna in df_ejemplo.columns:
        if col == columna:
            conindicencias.append(col)

df_ejemplo = df_ejemplo[conindicencias]

df_prueba = pd.DataFrame(columns=df.columns)

df_prueba = pd.concat([df_prueba, df_ejemplo])
df_prueba = df_prueba.fillna(0)
df_prueba.drop("Rating", axis=1, inplace=True)

# Preprocesar nuevos datos
X_new_scaled = scaler.transform(df_prueba)
X_new_pca = pca.transform(X_new_scaled)

# Hacer predicciones en los nuevos datos preprocesados
predicciones = modelo.predict(X_new_pca)

# Si estás trabajando con un problema de clasificación, puedes obtener las clases predichas
clases_predichas = np.argmax(predicciones, axis=1)



In [23]:
# Clase 0 = rating 1
# Clase 1 = rating 2
# Clase 2 = rating 3
# Clase 3 = rating 4
# Clase 4 = rating 5
clases_predichas

array([0], dtype=int64)