In [21]:
from __future__ import annotations
import sys; sys.path.insert(0, '..')

%load_ext autoreload
%autoreload 2

# python
import os
import ast
import ssl
import csv

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from inspect import cleandoc

# utils
from utils import Constants
from modules.preprocesing import preprocess

# stat
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score # Métrica común para multilabel
from sklearn.exceptions import ConvergenceWarning

# typings
from pandas import DataFrame as PandasDF
from typing import Dict

# warnings
import warnings
# Keep warning chatter out of the cell outputs.
# NOTE(review): the unqualified filter below already ignores every warning
# category, so the ConvergenceWarning-specific filter adds nothing on top of
# it -- confirm that blanket suppression is really intended, since it also
# hides pandas/sklearn deprecation and copy warnings.
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore')

# Plot theme for every matplotlib/seaborn figure in the notebook.
plt.style.use('seaborn-v0_8')

# pandas display: show up to 200 rows, and never truncate columns, line
# width or cell contents. set_option takes (option, value) pairs.
pd.set_option(
    'display.max_rows', 200,
    'display.max_columns', None,
    'display.width', None,
    'max_colwidth', None,
)

# NumPy arrays are printed with six decimal places.
np.set_printoptions(precision=6)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Outside CI, swap the default HTTPS context factory for the unverified one
# (the original note says this is for loading the sklearn dataset).
# NOTE(review): this switches off TLS certificate verification for the whole
# process through private ssl attributes -- confirm it is still required.
if not os.environ.get('CI'):
    ssl._create_default_https_context = ssl._create_unverified_context

# Absolute paths, anchored at the parent of the current working directory.
here: Path = Path.cwd().absolute().parent
data: Path = here.joinpath('data')
poetry_fundation_cleaned: Path = data.joinpath('CleanedPoetryFoundationData.csv')
cv_poetry: Path = data.joinpath('vallejo_poems_en.csv')

In [23]:
# Keyword arguments passed to both pd.read_csv calls in this cell.
# The separator and encoding come from the project's Constants; with
# QUOTE_NONNUMERIC the csv convention is that unquoted fields are numeric.
setup_load: Dict = {
    'sep': Constants.PIPE_STR,
    'quotechar': '"',
    'quoting': csv.QUOTE_NONNUMERIC,
    'encoding': Constants.ENCODING,
}

# Fail fast when an input CSV is absent, naming exactly the file(s) that are
# missing. The check covers both inputs, so the message must too: blaming the
# Poetry Foundation file when only the Vallejo CSV is absent sends the reader
# to the wrong download.
#
# Raises:
#     FileNotFoundError: if either CSV is not a regular file.
missing_inputs = [
    path
    for path in (poetry_fundation_cleaned, cv_poetry)
    if not path.is_file()
]
if missing_inputs:
    message_lines = [f'El archivo {path} no existe.' for path in missing_inputs]
    # The Kaggle download hint only applies to the Poetry Foundation dataset.
    if poetry_fundation_cleaned in missing_inputs:
        message_lines += [
            'Por favor, descargue el archivo desde:',
            'https://www.kaggle.com/datasets/abhinavwalia95/poetryfoundationorg',
            'y coloquelo en la carpeta data.',
        ]
    raise FileNotFoundError('\n'.join(message_lines))
    
# Read both corpora with the shared CSV options: the Poetry Foundation
# training data and the Vallejo poems to be tagged later.
poetry_df: PandasDF = pd.read_csv(str(poetry_fundation_cleaned), **setup_load)
cv_df: PandasDF = pd.read_csv(str(cv_poetry), **setup_load)

# Run the project's preprocess() over every title and poem of the Vallejo
# frame. Values are cast to str first, so missing entries reach preprocess()
# as text rather than as NaN.
cv_df[['title', 'poem']] = (
    cv_df[['title', 'poem']]
    .astype(str)
    .apply(lambda text_column: text_column.apply(preprocess))
)

# Keep only the rows that have a poem, then parse each `tags` cell from its
# string form into a Python object with ast.literal_eval (which evaluates
# literals only, never arbitrary code).
#
# The result is built with .assign() on the filtered frame instead of
# assigning a column on it afterwards: writing into the output of a .loc
# filter is the chained-assignment pattern behind SettingWithCopyWarning.
# .assign() returns a fresh frame, so the outcome is the same without the
# ambiguity.
# NOTE(review): literal_eval raises on a missing `tags` value -- this
# presumably relies on every remaining row having tags; confirm.
poetry_df = (
    poetry_df
    .loc[poetry_df['poem'].notna()]
    .assign(tags=lambda frame: frame['tags'].apply(ast.literal_eval))
)

In [35]:
# Binarizer that maps each poem's collection of tags to a 0/1 indicator row.
mlb = MultiLabelBinarizer()

# Learn the set of distinct tags and encode every poem in one step.
# The input must be a list of lists (one inner list of tags per poem).
y_mlb = mlb.fit_transform(poetry_df['tags'].tolist())

# y_mlb is a NumPy matrix: one row per poem, one binary column per tag.
# mlb.classes_ gives the tag for each column, in column order.
print("Clases (etiquetas únicas):", mlb.classes_)
print("\nMatriz Multilabel (primeras 2 filas):\n", y_mlb[:2])

Clases (etiquetas únicas): ['activities' 'animals' 'anniversary' 'architecture design'
 'arts sciences' 'birth' 'birth birthdays' 'birthdays'
 'breakups vexed love' 'buddhism' 'christianity' 'christmas'
 'cinco de mayo' 'cities urban life' 'class' 'classic love'
 'coming of age' 'crime punishment' 'death' 'desire'
 'disappointment failure' 'easter' 'eating drinking' 'engagement'
 'fairytales legends' 'faith doubt' 'fall' 'family ancestors'
 'farewells good luck' 'fathers day' 'first love' 'friends enemies'
 'funerals' 'gardening' 'gay' 'gender sexuality' 'get well recovery'
 'ghosts the supernatural' 'god the divine' 'graduation'
 'gratitude apologies' 'greek roman mythology' 'growing old' 'halloween'
 'hanukkah' 'health illness' 'heartache loss' 'heavens'
 'heroes patriotism' 'history politics' 'home life' 'horror'
 'humor satire' 'independence day' 'indoor activities' 'infancy'
 'infatuation crushes' 'islam' 'jobs working' 'judaism' 'kwanzaa'
 'labor day' 'landscapes pastorals' 'lang

In [26]:
# Vectorize the poem text with TF-IDF, capping the vocabulary at 5000 terms.
# NOTE(review): the vectorizer is fitted on every poem here, before the
# train/test split in the next cell, so the vocabulary and IDF weights also
# see the held-out poems -- confirm this is acceptable for the evaluation.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(poetry_df['poem'])

In [27]:
# 1. Hold out 20% of the poems for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_mlb,
    test_size=0.2,
    random_state=42,
)

# 2-3. One-vs-rest: a separate binary logistic regression per tag, fitted
# on the training split. fit() returns the estimator itself, so building
# and training can be chained.
classifier = OneVsRestClassifier(
    LogisticRegression(
        random_state=42,   # reproducible fits
        max_iter=1000,     # raised iteration cap
        solver='liblinear',
    )
).fit(X_train, y_train)

# 4. Predict the tag indicator matrix for the held-out poems.
y_pred = classifier.predict(X_test)

# 5. Evaluate with the micro-averaged Jaccard score, a common multilabel
# metric.
jaccard_micro = jaccard_score(y_true=y_test, y_pred=y_pred, average='micro')

print(f"\nJaccard Score (Micro): {jaccard_micro:.4f}")


Jaccard Score (Micro): 0.1389


In [31]:
# Encode the Vallejo poems with the already-fitted TF-IDF vectorizer, predict
# their tag indicators, and map each indicator row back to tag names.
Xcv = vectorizer.transform(cv_df['poem'])
ycv_pred = mlb.inverse_transform(classifier.predict(Xcv))

In [None]:
# Table of predicted tags per Vallejo poem. Left as the cell's last
# expression so the notebook renders it. Each prediction is converted to a
# plain list for display.
pd.DataFrame({
    'title': cv_df['title'].values,
    'tags': [list(predicted_tags) for predicted_tags in ycv_pred],
})

Unnamed: 0,title,tags
0,black herald,[living]
1,black stone white stone,[living]
2,pari octob poem,[]
3,xiii,[living]
