In [1]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): with no category filter this also hides deprecation and
# convergence warnings raised by the libraries used further down.
import warnings
warnings.filterwarnings('ignore')

In [2]:
pip install --upgrade nltk


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2024.11.6-cp38-cp38-win_amd64.whl (274 kB)Note: you may need to restart the kernel to use updated packages.
Installing collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2021.4.4
    Uninstalling regex-2021.4.4:
      Successfully uninstalled regex-2021.4.4
  Attempting uninstall: nltk
    Found existing installation: nltk 3.6.1

    Uninstalling nltk-3.6.1:
      Successfully uninstalled nltk-3.6.1
Successfully installed nltk-3.9.1 regex-2024.11.6


In [3]:
# Importación librerías
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download the NLTK data used by the tokenizer, POS tagger and lemmatizer.
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# NLTK >= 3.9 looks up for word_tokenize / pos_tag; the legacy names are kept
# so the cell still works on older installs.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    nltk.download(resource)

# Shared by split_lemmas_no_stopwords below.
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unknown or ambiguous tags are treated as nouns.
    return wordnet.NOUN

def split_lemmas_no_stopwords(text):
    """Tokenize ``text`` and return the lemmas of its non-stopword alphabetic tokens.

    The text is lower-cased and tokenized, tokens that are not purely
    alphabetic (punctuation, numbers) are dropped, and the remainder is
    POS-tagged. Stopwords are removed only after tagging, and each surviving
    word is lemmatized with the WordNet POS derived from its tag.
    """
    alpha_tokens = [tok for tok in nltk.word_tokenize(text.lower()) if tok.isalpha()]
    return [
        wordnet_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in nltk.pos_tag(alpha_tokens)
        if word not in stop_words
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\junio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\junio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\junio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\junio\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\junio\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Load the training and testing sets from the zipped CSV files hosted on GitHub; the first column is used as the index
dataTraining = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', encoding='UTF-8', index_col=0)

In [6]:
# Preview the first rows of the training data
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [7]:
# Target variable (y): multi-hot encoding of each movie's genre list
import ast  # cell-local import; only needed to parse the serialized genre lists

# The CSV stores each genre list as the repr of a Python list. ast.literal_eval
# parses that literal without executing arbitrary code (unlike eval), and the
# isinstance guard keeps the cell idempotent when it is re-run on a column
# that has already been parsed.
dataTraining['genres'] = dataTraining['genres'].map(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])

In [10]:
# Fetch the English averaged-perceptron tagger data.
# NOTE(review): presumably the resource nltk.pos_tag looks up under the upgraded NLTK — confirm.
import nltk
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\junio\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [11]:
# Build the TF-IDF document-term matrix from the plot synopses.
# A callable analyzer replaces scikit-learn's whole preprocessing / tokenizing /
# n-gram pipeline, so `ngram_range` has no effect in that mode. The argument was
# dropped to make it explicit that the features are unigram lemmas only; the
# fitted vocabulary is unchanged.
vect = TfidfVectorizer(analyzer=split_lemmas_no_stopwords)
X_dtm = vect.fit_transform(dataTraining['plot'])
X_dtm.shape

(7895, 32373)

In [None]:
# NOTE(review): this cell repeats the vectorizer fit from the preceding cell and
# rebinds `vect` / `X_dtm` to an equivalent result; consider deleting it.
# `ngram_range` is ignored by scikit-learn when `analyzer` is a callable, so it
# is not passed — the features are unigram lemmas either way.
vect = TfidfVectorizer(analyzer=split_lemmas_no_stopwords)
X_dtm = vect.fit_transform(dataTraining['plot'])


In [12]:
# Split the predictors (X) and the target (y) into train and test sets with train_test_split (33% held out, fixed random_state)
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_dtm, y_genres, test_size=0.33, random_state=42)

In [13]:
# One-vs-rest wrapper around a default-configured logistic regression, fitted on the training split
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train_genres)

In [14]:
# Predicted probabilities from the classifier on the held-out split
y_pred_genres = clf.predict_proba(X_test)

# Model performance: macro-averaged ROC AUC against the held-out labels
roc_auc_score(y_test_genres, y_pred_genres, average='macro')

0.8788006027019297

In [16]:
import joblib
import os

# Create the folder that will hold the serialized artifacts
os.makedirs('model_deployment', exist_ok=True)

# Persist the classifier, the vectorizer and the label binarizer
# NOTE(review): `vect` holds a reference to split_lemmas_no_stopwords, which is
# defined in this notebook; presumably the pickle can only be loaded where that
# function is resolvable under the same module name — confirm before deploying.
joblib.dump(clf, 'model_deployment/movie_genre_classifier.pkl', compress=3)
joblib.dump(vect, 'model_deployment/tfidf_vectorizer.pkl', compress=3)
joblib.dump(le, 'model_deployment/label_binarizer.pkl', compress=3)


['model_deployment/label_binarizer.pkl']

In [None]:
pip install flask-restx


In [None]:
from flask import Flask, request
from flask_restx import Api, Resource, fields
import joblib
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Initialize the objects needed for lemmatization.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# NLTK >= 3.9 looks up for word_tokenize / pos_tag; the legacy names are kept
# for older installs, and 'omw-1.4' mirrors the training-time setup cell.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant used for lemmatization.

    The first matching prefix wins (J -> adjective, V -> verb, N -> noun,
    R -> adverb); any other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    return wordnet.NOUN

def split_lemmas_no_stopwords(text):
    """Return the lemmas of the alphabetic, non-stopword tokens in ``text``.

    Steps: lower-case, tokenize, keep purely alphabetic tokens, POS-tag them,
    then drop stopwords and lemmatize each remaining word with the WordNet POS
    derived from its tag.
    """
    alpha_tokens = [tok for tok in nltk.word_tokenize(text.lower()) if tok.isalpha()]
    return [
        wordnet_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in nltk.pos_tag(alpha_tokens)
        if word not in stop_words
    ]

# Load the serialized classifier, vectorizer and label binarizer
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts from a trusted source.
model = joblib.load('model_deployment/movie_genre_classifier.pkl')
vectorizer = joblib.load('model_deployment/tfidf_vectorizer.pkl')
label_binarizer = joblib.load('model_deployment/label_binarizer.pkl')

# Initialize Flask and Flask-RESTX (the API documentation path is set to /docs)
app = Flask(__name__)
api = Api(app, version='1.0', title='🎬 Predicción de Género de Películas',
          description='API para predecir géneros cinematográficos a partir de la sinopsis',
          doc='/docs')

ns = api.namespace('predict', description='Predicción de géneros')

# Input schema: a single required string argument, `plot`, read from location 'args'
parser = ns.parser()
parser.add_argument('plot', type=str, required=True, help='Sinopsis de la película', location='args')

# Output schema: a list of genre names and a list of float probabilities
genre_fields = api.model('Prediction', {
    'genres': fields.List(fields.String),
    'probabilities': fields.List(fields.Float),
})

@ns.route('/')
class GenrePredictor(Resource):
    """Resource that scores a movie synopsis against every known genre."""

    @ns.doc(parser=parser)
    @ns.marshal_with(genre_fields)
    def get(self):
        """Return all genres with their predicted probabilities, highest first.

        Reads the `plot` argument, vectorizes it with the loaded vectorizer,
        and takes the first row of the model's predict_proba output. The
        response holds two parallel lists: genre names and probabilities
        rounded to three decimals, both in descending probability order.
        """
        synopsis = parser.parse_args()['plot']

        # Vectorize the single synopsis and score it
        features = vectorizer.transform([synopsis])
        scores = model.predict_proba(features)[0]

        # Pair every genre with its score and rank in descending order
        ranked = sorted(
            zip(label_binarizer.classes_, scores),
            key=lambda pair: -pair[1],
        )

        genre_names = []
        rounded_scores = []
        for genre_name, score in ranked:
            genre_names.append(genre_name)
            rounded_scores.append(round(score, 3))

        return {'genres': genre_names, 'probabilities': rounded_scores}

if __name__ == '__main__':
    # Start the development server on all interfaces, port 5000.
    # Debug mode is off: with host='0.0.0.0' the interactive debugger would let
    # anyone who can reach the port execute arbitrary code, and the debug
    # reloader also tries to restart the process, which fails inside a notebook kernel.
    app.run(debug=False, host='0.0.0.0', port=5000)


In [None]:
# Inspect the genre labels stored in the loaded label binarizer
label_binarizer.classes_

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from modelo_generos_ML_TDVH import split_lemmas_no_stopwords

import ast  # cell-local import; used to parse the serialized genre lists safely

# Load the data
dataTraining = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', index_col=0)
# ast.literal_eval only accepts Python literals, so a crafted value in the CSV
# cannot execute code the way eval() would.
dataTraining['genres'] = dataTraining['genres'].map(ast.literal_eval)

# Fit a fresh vectorizer using the imported analyzer function.
# `ngram_range` is ignored by scikit-learn when `analyzer` is a callable, so it
# is not passed — the features are unigram lemmas either way.
vect = TfidfVectorizer(analyzer=split_lemmas_no_stopwords)
X_dtm = vect.fit_transform(dataTraining['plot'])

# Save the new vectorizer; the analyzer is now imported from a module rather than defined in __main__
joblib.dump(vect, 'model_deployment/tfidf_vectorizer.pkl', compress=3)
