# Modelado para el history telling

--------------

## Librerias

In [187]:

import os
import re
import pandas as pd 
import numpy as np
import plotly.express as px
from  sqlalchemy import create_engine

#### Modelos 

In [188]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.linear_model import ElasticNet, Lasso, Lars, Ridge
from sklearn.svm import SVR

In [189]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

--------------

### Funciones utiles

In [190]:
def to_secs(duration: str) -> float:

    if not duration or duration == 'Unknown':
        return 0

    match = re.match(r'(\d+)\s*hr\.?\s*(\d+)?\s*min\.?$', duration.strip())
    if match:
        horas = int(match.group(1))
        minutos = int(match.group(2)) if match.group(2) else 0
        return horas * 3600 + minutos * 60

    match = re.match(r'(\d+)\s*min\. per ep\.?$', duration.strip())
    if match:
        cantidad = int(match.group(1))
        return cantidad * 60

    match = re.match(r'(\d+)\s*min\.?$', duration.strip())
    if match:
        cantidad = int(match.group(1))
        return cantidad * 60

    return 0

In [191]:
ANIME = '../DATA/OLTP/ANIME.csv'
TYPES = '../DATA/OLTP/TYPES.csv'
CLASSIFICATION = '../DATA/OLTP/CLASSIFICATION.csv'
SOURCE = '../DATA/OLTP/SOURCE.csv'
GENRE_ANIME  = '../DATA/OLTP/GENRE_ANIME.csv'
GENRE  = '../DATA/OLTP/GENRE.csv'

In [192]:
df_anime = pd.read_csv('../DATA/OLTP/ANIME.csv')
df_genre_anime  = pd.read_csv('../DATA/OLTP/GENRE_ANIME.csv')
df_genre = pd.read_csv('../DATA/OLTP/GENRE.csv')

### Union de las tablas

In [193]:
df_anime = pd.merge( df_anime,pd.read_csv('../DATA/OLTP/TYPES.csv'), on='ID_TYPES',how='inner')
df_anime = pd.merge( df_anime,pd.read_csv('../DATA/OLTP/SOURCE.csv'), on='ID_SOURCE',how='inner')
df_anime = pd.merge( df_anime,pd.read_csv('../DATA/OLTP/CLASSIFICATION.csv'), on='ID_CLASSIFICATION',how='left')

In [194]:
anime_genre_mapping = pd.merge(df_genre_anime, df_anime, how='inner', left_on='ID_ANIME', right_on='ID_ANIME')
anime_genre_mapping = pd.merge(anime_genre_mapping, df_genre, how='inner', left_on='ID_GENRE', right_on='ID_GENRE')


category_counts = anime_genre_mapping['GENRE'].value_counts(normalize=True)
infrequent_categories = category_counts[category_counts < 0.1].index
anime_genre_mapping.loc[anime_genre_mapping['GENRE'].isin(infrequent_categories), 'GENRE'] = 'OTHER'


dummy_variables = pd.get_dummies(anime_genre_mapping[['ID_ANIME', 'GENRE']], columns=['GENRE'], prefix='GENRE_', prefix_sep='')
dummy_variables = dummy_variables.groupby('ID_ANIME').sum().reset_index()
df_anime = pd.merge(df_anime, dummy_variables, how='left', left_on='ID_ANIME', right_on='ID_ANIME').fillna(0, inplace=False)



### Definicion de las variables a utilizar

In [195]:
#'SYNOPSIS'
ls_drop = ['ID_ANIME','URL_IMAGE','START_DATE', 'FINISH_DATE','ID_CLASSIFICATION', 'ID_SOURCE','ID_TYPES']
ls_disc = ['STATUS','TYPES','SOURCES']
ls_cont = ['EPISODES', 'DURATION', 'SCORED','SCORED_BY','RANKS','POPULARITY','FAVORITES'] + [col for col in df_anime.columns if col.startswith('GENRE_')]

In [196]:
df_anime = df_anime.drop(columns=ls_drop)

In [197]:
df_anime['DURATION']=df_anime['DURATION'].apply(lambda x: to_secs(x)) 
df_anime['EPISODES']=df_anime['EPISODES'].apply(lambda x: int(x) if not isinstance(x,str) else None) 

#### Creacion de las dummies

In [198]:
dummy_variables = pd.get_dummies(df_anime[['STATUS','TYPES','SOURCES']])
df_anime = pd.concat([df_anime, dummy_variables], axis=1)
    

#### Uso de un modelo de lenguaje 

In [199]:
modelo = SentimentIntensityAnalyzer()

In [200]:
df_anime['SYNOPSIS'] = df_anime['SYNOPSIS'].apply(lambda text: modelo.polarity_scores(f'{text}')['compound'])

In [201]:
df_anime.head(2)

Unnamed: 0,ANIME_NAME,EPISODES,DURATION,SCORED,SCORED_BY,RANKS,POPULARITY,FAVORITES,SYNOPSIS,STATUS,...,SOURCES_Manga,SOURCES_Music,SOURCES_Novel,SOURCES_Original,SOURCES_Other,SOURCES_Picture book,SOURCES_Radio,SOURCES_Unknown,SOURCES_Visual novel,SOURCES_Web manga
0,Kimi no Na wa.,,6360,9.19,471398,2.0,33,34912,0.4389,Finished Airing,...,False,False,False,True,False,False,False,False,False,False
1,Fullmetal Alchemist: Brotherhood,,1440,9.25,733592,1.0,4,106895,-0.969,Finished Airing,...,True,False,False,False,False,False,False,False,False,False


### Nulos

In [202]:
nulos = pd.DataFrame(df_anime.isna().sum())
nulos[nulos[0] > 0]

Unnamed: 0,0
EPISODES,12174


In [203]:
df_anime['EPISODES'] = df_anime['EPISODES'].fillna(12)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



### Outlayers

In [204]:
for column in ls_cont: 
    px.histogram(df_anime, x=column, title=f'Histograma de {column}').show()

In [205]:
ls_out = ['EPISODES']
percentile_99 = df_anime[ls_out].quantile(0.99)
for column in ls_out:df_anime = df_anime[df_anime[column] <= percentile_99[column]]

In [206]:
df_anime = df_anime.set_index('ANIME_NAME')

In [211]:
lls_x = [col for col in df_anime.columns if col not in [ 'POPULARITY','CLASSIFICATION', 'TYPES', 'SOURCES','STATUS']]

### Entrenamiento del mdelo

In [212]:
X = df_anime[lls_x]
y = df_anime['POPULARITY']

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [214]:
model = ElasticNet()

In [215]:
param_grid = {
    'alpha': [0.1, 0.5, 1.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
}

In [216]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Imprimir los mejores parámetros encontrados
print("Mejores parámetros:", grid_search.best_params_)
print()

# Evaluar el modelo en el conjunto de prueba
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Puntuación en el conjunto de prueba:", test_score)

Mejores parámetros: {'alpha': 0.1, 'l1_ratio': 0.9}
Puntuación en el conjunto de prueba: 0.7516537392601355


In [221]:
coeficientes_por_caracteristica = dict(zip(lls_x, best_model.coef_))

# Imprimir el diccionario que muestra qué coeficiente le pertenece a cada característica
print("Coeficientes por característica:")
print(coeficientes_por_caracteristica)


Coeficientes por característica:
{'EPISODES': 0.0, 'DURATION': 0.12570535260551474, 'SCORED': -617.3107936755141, 'SCORED_BY': -0.02031893905164934, 'RANKS': 0.5230290406763178, 'FAVORITES': 0.2152048782706724, 'SYNOPSIS': -41.01894013842806, 'GENRE_Comedy': -457.6730188347864, 'GENRE_OTHER': -369.7982974734882, 'STATUS_Currently Airing': 247.8694958689432, 'STATUS_Finished Airing': 88.98370400434263, 'STATUS_Not yet aired': -345.8531999120952, 'TYPES_Movie': 148.8426113869865, 'TYPES_Music': 192.3611670799225, 'TYPES_ONA': -296.23037931595854, 'TYPES_OVA': 525.6398273799016, 'TYPES_Special': -281.31960338749406, 'TYPES_TV': -289.2934284495229, 'SOURCES_4-koma manga': -491.8458865261825, 'SOURCES_Book': 306.05821764765017, 'SOURCES_Card game': -201.60133730525305, 'SOURCES_Digital manga': -182.59069226735048, 'SOURCES_Game': -893.3301101317527, 'SOURCES_Light novel': -942.7175836796832, 'SOURCES_Manga': -225.584781517945, 'SOURCES_Music': 886.6349758374001, 'SOURCES_Novel': -16.9089160