In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 22 08:37:24 2024

@author: jaimeunriza
"""
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import r2_score, roc_auc_score

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.optimizers import SGD, Adam

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
import keras.optimizers as opts
from keras.callbacks import EarlyStopping


In [2]:


# Descargar los recursos necesarios de NLTK
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /Users/robertogb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/robertogb/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/robertogb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

# Carga de datos de archivo .csv
dataTraining = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', encoding='UTF-8', index_col=0)


In [4]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [5]:
dataTraining.shape

(7895, 5)

In [6]:
dataTesting.head()

Unnamed: 0,year,title,plot
1,1999,Message in a Bottle,"who meets by fate , shall be sealed by fate ...."
4,1978,Midnight Express,"the true story of billy hayes , an american c..."
5,1996,Primal Fear,martin vail left the chicago da ' s office to ...
6,1950,Crisis,husband and wife americans dr . eugene and mr...
7,1959,The Tingler,the coroner and scientist dr . warren chapin ...


In [7]:
dataTesting.shape

(3383, 3)

In [8]:
# Función para lematizar palabras con su respectiva POS
def lemmatize_with_pos(text):
    lemmatizer = WordNetLemmatizer()
    # Tokenizar las palabras
    tokens = word_tokenize(text)
    # Etiquetar POS de cada palabra
    tagged_tokens = nltk.pos_tag(tokens)
    # Lematizar cada palabra según su POS
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos=pos_tag[0].lower())
                         if pos_tag[0].lower() in ['n', 'v', 'a', 'r'] else word
                         for word, pos_tag in tagged_tokens]
    return ' '.join(lemmatized_tokens)

# Concatenar título y año con la trama
dataTraining['plot_title_year'] =  dataTraining['plot'] + ' ' + dataTraining['title'] + ' ' + dataTraining['year'].astype(str)
dataTesting['plot_title_year'] =  dataTesting['plot'] + ' ' + dataTesting['title'] + ' ' + dataTesting['year'].astype(str)

# Aplicar lematización con POS a los datos de entrenamiento
X_train_lemmatized = [lemmatize_with_pos(text) for text in dataTraining['plot_title_year']]


# Crear el vectorizador CountVectorizer
vect = CountVectorizer(lowercase=True, strip_accents='ascii', stop_words='english',ngram_range=(1,2), max_features=10000)

# Definición de variables predictoras (X)
X_dtm = vect.fit_transform(X_train_lemmatized)
print("Shape of DTM:", X_dtm.shape)

# Definición de variable de interés (y)
dataTraining['genres'] = dataTraining['genres'].map(lambda x: eval(x))
mlb = MultiLabelBinarizer()
y_genres = mlb.fit_transform(dataTraining['genres'])


Shape of DTM: (7895, 10000)


In [9]:

# Separación de variables predictoras (X) y variable de interés (y) en set de entrenamiento y test usandola función train_test_split
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_dtm, y_genres, test_size=0.20, random_state=42)


# Definición de dimensiones de entrada, varaibles predictoras
dims = X_train.shape[1]
print(dims, 'input variables')

# Definición de dimensiones de salida, varaibles de interés
output_var = y_train_genres.shape[1]
print(output_var, ' output variables')

######################### modelos

10000 input variables
24  output variables


### Modelo de Radom Forest

In [10]:
rf_model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_model.fit(X_train, y_train_genres)
y_pred_rf = rf_model.predict(X_test)
roc_auc_rf = roc_auc_score(y_test_genres, y_pred_rf, average='macro')
print("ROC AUC Score (Random Forest):", roc_auc_rf)

ROC AUC Score (Random Forest): 0.5545072093765353


### Modelo de Logistic Regression

In [13]:
lr_model = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
lr_model.fit(X_train, y_train_genres)
y_pred_lr = lr_model.predict(X_test)
roc_auc_lr = roc_auc_score(y_test_genres, y_pred_lr, average='macro')
print("ROC AUC Score (Logistic Regression):", roc_auc_lr)

ROC AUC Score (Logistic Regression): 0.6538607677270819


### Modelo de Red Neuronal

In [14]:
K.clear_session()

# Definición red neuronal con la función Sequential()
model = Sequential()

# Definición de la capa densa con un tamaño de salida igual a output_var y un input_shape de dims
model.add(Dense(3500, input_shape=(dims,),activation='tanh'))
model.add(Dropout(0.2))
#model.add(Dense(1500,activation='tanh'))
#model.add(Dropout(0.2))


model.add(Dense(output_var))
model.add(Activation('tanh'))


# Impresión de la arquitectura de la red neuronal
print(model.summary())

# Definición de función de perdida. Se usa mean_squared_error dado que es un ejercicio de regresión
from keras.optimizers import SGD,Adam, RMSprop, Adagrad, Adadelta, Adamax, Nadam
model.compile(optimizer=Adadelta(learning_rate=0.020), loss='mean_squared_error')

import numpy as np

# Ordenar los índices de la matriz dispersa X_train
X_train.sort_indices()

# Similarmente, ordenar los índices de la matriz dispersa X_test si es necesario
X_test.sort_indices()

# Definición de la función EarlyStopping con parámetro definido en la función nn_model_params
early_stopping = EarlyStopping(monitor="val_loss", patience = 5)

# Entrenamiento de la red neuronal con ## épocas
model.fit(X_train, y_train_genres,batch_size=10,
          validation_data = (X_test, y_test_genres),
          epochs=5,
          callbacks=[early_stopping])

# Predicción del modelo
y_pred_nn = model.predict(X_test)

# Calcula el área bajo la curva ROC (ROC AUC)
roc_auc_nn = roc_auc_score(y_test_genres, y_pred_nn, average='macro')

# Imprime el desempeño del modelo
print("ROC AUC Score (Red Neuronal):", roc_auc_nn)



None
Epoch 1/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 201ms/step - loss: 0.1204 - val_loss: 0.0988
Epoch 2/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 259ms/step - loss: 0.0966 - val_loss: 0.0912
Epoch 3/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 261ms/step - loss: 0.0872 - val_loss: 0.0869
Epoch 4/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 255ms/step - loss: 0.0807 - val_loss: 0.0841
Epoch 5/5
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 428ms/step - loss: 0.0767 - val_loss: 0.0821
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
ROC AUC Score (Red Neuronal): 0.7493899317868015


### Comparación de Resultados

In [15]:
# Comparación de resultados
print("\nComparación de Modelos")
print(f"Red Neuronal: ROC AUC = {roc_auc_nn}")
print(f"Random Forest: ROC AUC = {roc_auc_rf}")
print(f"Logistic Regression: ROC AUC = {roc_auc_lr}")


Comparación de Modelos
Red Neuronal: ROC AUC = 0.7493899317868015
Random Forest: ROC AUC = 0.5545072093765353
Logistic Regression: ROC AUC = 0.6538607677270819


### Selección Mejor Modelo


Analizando los resultados de las métricas de desempeño, en este caso y específicamente el ROC-AUC (Área bajo la curva) para cada modelo. El ROC-AUC es una medida de la capacidad de un modelo para distinguir entre clases. El valor más alto indica un mejor rendimiento.

**Resultados de los Modelos**
- Red Neuronal: ROC AUC = 0.7494
- Random Forest: ROC AUC = 0.5545
- Logistic Regression: ROC AUC = 0.6539

**Mejor Modelo Red Neuronal (ROC AUC = 0.7494)**
Indica que la red neuronal tiene una buena capacidad para predecir la probabilidad de que una película pertenezca a un género en particular. Si bien es importante considerar el factor tiempo de entrenamiento, la complejidad del modelo y la interpretabilidad, el modelo Red Neuronal permite obtener un mejor desempeño significativamnte superior a los otros dos modelos Radom Forest y Logistic Regression.

Los resultados obtenidos se pueden considerar como esperables en el caso de problemas de NLP (Procesamiento de Lenguaje Natural), las redes neuronales y esquemas LSTM nos demuestran que son muy efectivas.


### Predicción Mejor Modelo

In [16]:
# Predicción del conjunto de test utilizando el mejor modelo (Red Neuronal)

# transformación variables predictoras X del conjunto de test
X_test_dtm = vect.transform(dataTesting[  'plot_title_year'   ])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']


# Predicción del conjunto de test
y_pred_proba = model.predict(X_test_dtm)



# Guardar predicciones en formato exigido en la competencia de kaggle
res = pd.DataFrame(y_pred_proba, index=dataTesting.index, columns=cols)
res.to_csv('pred_genres_text_RN11.csv', index_label='ID')
res.head()

[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step


Unnamed: 0,p_Action,p_Adventure,p_Animation,p_Biography,p_Comedy,p_Crime,p_Documentary,p_Drama,p_Family,p_Fantasy,...,p_Musical,p_Mystery,p_News,p_Romance,p_Sci-Fi,p_Short,p_Sport,p_Thriller,p_War,p_Western
1,0.129089,0.184302,0.060074,0.051592,0.399331,0.090644,0.02479,0.454759,-0.125685,0.091039,...,-0.027958,0.147027,0.05898,0.321345,-0.012792,0.018856,0.013376,0.076395,-0.090675,-0.019367
4,0.098633,-0.137732,-0.019283,0.183989,0.208818,0.100744,0.005444,0.51481,-0.004787,0.044391,...,0.108273,0.122407,-0.01043,0.069012,-0.086326,0.044232,0.001842,0.181614,0.089033,0.047333
5,-0.173774,0.009113,0.040335,0.080175,-0.155478,0.68766,-0.08826,0.841854,-0.195753,-0.203197,...,0.032964,0.168047,-0.272761,0.225255,0.187306,0.011013,0.092102,0.851831,-0.550483,-0.136624
6,0.136025,0.094897,0.010886,0.165577,0.209038,0.104102,0.18737,0.672141,0.111046,-0.04676,...,-0.112807,0.111829,0.050792,-0.042713,0.068157,0.255437,0.237588,0.418159,0.066017,-0.022192
7,-0.079627,-0.201107,-0.088293,-0.135205,0.420597,0.209097,-0.037694,-0.274813,-0.221566,0.285618,...,0.128819,0.20209,-0.022768,0.115525,0.511115,0.022251,-0.128303,0.197575,-0.135037,-0.224616


### Crear API con Flask

In [17]:
# Guardar el modelo entrenado
model.save('model.h5')

import pickle
# Guardar el vectorizador
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vect, f)




In [None]:
from flask import Flask, request, jsonify
import tensorflow as tf
from keras.models import load_model
import pickle

app = Flask(__name__)

# Cargar el modelo y el vectorizador
model = load_model('model.h5')
with open('vectorizer.pkl', 'rb') as f:
    vect = pickle.load(f)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    plot_title_year = data['plot_title_year']
    
    # Preprocesamiento
    X = vect.transform([plot_title_year])
    
    # Predicción
    prediction = model.predict(X)
    
    # Crear respuesta
    response = {
        'predictions': prediction.tolist()
    }
    return jsonify(response)

if __name__ == '__main__':
    app.run(debug=True)


### Ejecutar la API localmente

In [None]:
# Ejecución de la aplicación que disponibiliza el modelo de manera local en el puerto 5000
app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5001)