# Lectura de los datos iniciales

---

In [1]:
# =====================================
# Importación de librerías
# =====================================

import os
import pandas as pd
from glob import glob
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

In [2]:
# =====================================
# Importación de datos
# =====================================

df_peliculas = pd.read_csv("../../Fase 1 Datos/Parte 1/Puntuaciones/DatosPeliculasSeries.csv")
df_general = pd.read_csv("../../Fase 1 Datos/Parte 3/Puntuaciones/general.csv")

In [3]:
# =====================================
# Unificación de puntuaciones
# =====================================

archivos = glob(os.path.join("../../Fase 1 Datos/Parte 2/Puntuaciones", "*.csv"))

matriz = df_general

for archivo in archivos:
    nombre_base = os.path.basename(archivo)
    partes = nombre_base.split(".")
    if len(partes) == 2 and partes[0].isdigit() and partes[1] == "csv":
        archivo = pd.read_csv(archivo)
        matriz = pd.concat([matriz, archivo])

matriz

Unnamed: 0,userId,Mi Nota,tid
0,1,8,tt0114388
1,1,2,tt0113627
2,1,4,tt0112682
3,1,10,tt0115012
4,1,10,tt0114746
...,...,...,...
885,200949,8,tt9603212
886,200949,8,tt9619824
887,200949,6,tt9760852
888,200949,3,tt9764362


# Selección de usuarios parecidos

---

In [4]:
# =====================================
# Unión con usuarios parecidos
# =====================================

nuevo_usuario = {
    "tt0481369": 8,
    "tt0434409": 8
}

df_nuevo = pd.DataFrame(
    list(nuevo_usuario.items()),
    columns=["tid", "nota_nuevo"]
)

df_juntos = matriz.merge(
    df_nuevo,
    on="tid",
    how="right"
)

df_juntos["parecido"] = ((df_juntos["Mi Nota"] - df_juntos["nota_nuevo"]).abs() <= 1.5)

df_juntos

Unnamed: 0,userId,Mi Nota,tid,nota_nuevo,parecido
0,10,7,tt0481369,8,True
1,28,6,tt0481369,8,False
2,65,4,tt0481369,8,False
3,80,6,tt0481369,8,False
4,109,5,tt0481369,8,False
...,...,...,...,...,...
38842,200937,9,tt0434409,8,True
38843,200943,7,tt0434409,8,True
38844,200947,9,tt0434409,8,True
38845,200948,5,tt0434409,8,False


In [5]:
# =====================================
# Filtrado de usuarios parecidos
# =====================================

similitud = (
    df_juntos
    .groupby("userId")
    .agg(
        coincidencias=("parecido", "sum"),
        comunes=("Mi Nota", lambda x: (x != -1).sum())
    )
)

similitud = similitud[similitud["coincidencias"] >= (similitud["comunes"] * 0.8)]
similitud = similitud[similitud["comunes"] >= (len(nuevo_usuario) * 0.8)]
similitud = similitud.reset_index()

similitud

Unnamed: 0,userId,coincidencias,comunes
0,10,2,2
1,204,2,2
2,240,2,2
3,661,2,2
4,1411,2,2
...,...,...,...
734,199631,2,2
735,200213,2,2
736,200352,2,2
737,200436,2,2


In [6]:
# =====================================
# Recolección puntuaciones de usuarios parecidos
# =====================================

if similitud.empty:
    final = None

else:
    puntuaciones = similitud.merge(matriz, on="userId", how="left")
    puntuaciones = puntuaciones.groupby("tid", as_index=False)["Mi Nota"].mean()
    puntuaciones["Mi Nota"] = (
        puntuaciones["Mi Nota"]
            .round()
            .clip(1, 10)
            .astype(int)
    )

    puntuaciones_datos = puntuaciones.merge(
        df_peliculas,
        on="tid",
        how="left"
    )

    final = puntuaciones_datos[['tid', 'Tipo', 'Titulo', 'Titulo_ES', 'Año', 'Mi Nota', 'Duracion', 'Generos', 'Puntuacion', 'Num_Votos', 'Actores', 'Directores', 'Idioma']]

final

Unnamed: 0,tid,Tipo,Titulo,Titulo_ES,Año,Mi Nota,Duracion,Generos,Puntuacion,Num_Votos,Actores,Directores,Idioma
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,7,70,"Action,Adventure,Biography",6.0,1039,"['Godfrey Cass', 'Bella Cola', 'Will Coyne', '...",['Charles Tait'],"[nan, 'en']"
1,tt0002130,movie,Dante's Inferno,El infierno,1911,6,71,"Adventure,Drama,Fantasy",7.0,3989,"['Emin Belig Belli', 'Giuseppe de Liguoro', 'E...","['Giuseppe de Liguoro', 'Adolfo Padovan', 'Fra...","['ja', 'en', nan, 'sv']"
2,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantómas a la sombra de la guillotina,1913,7,54,"Crime,Drama",6.9,2686,"['André Luguet', 'Georges Melchior', 'Renée Ca...",['Louis Feuillade'],"['sv', 'ja', 'tr', nan, 'en']"
3,tt0003037,movie,Fantomas: The Man in Black,Fantomas 2: Juve contra Fantomas,1913,7,61,"Crime,Drama",6.9,1826,"['Georges Melchior', 'Renée Carl', 'Yvette And...",['Louis Feuillade'],"['sv', nan, 'en']"
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Fantomas 3: El muerto que mata,1913,7,90,"Crime,Drama,Mystery",6.9,1455,"['André Luguet', 'Luitz-Morat', 'Georges Melch...",['Louis Feuillade'],"[nan, 'sv']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31663,tt9902160,movie,Herself,Volver a empezar (Herself),2020,8,97,Drama,7.0,5246,"['Shadaan Felfeli', 'Cathy Belton', 'Art Kearn...",['Phyllida Lloyd'],"['he', 'ca', 'ja', 'fr', nan, 'en']"
31664,tt9902160,movie,Herself,Herself,2020,8,97,Drama,7.0,5246,"['Shadaan Felfeli', 'Cathy Belton', 'Art Kearn...",['Phyllida Lloyd'],"['he', 'ca', 'ja', 'fr', nan, 'en']"
31665,tt9907782,movie,The Cursed,The Cursed,2021,8,111,"Fantasy,Horror,Mystery",6.2,22054,"['Kelly Reilly', 'Tommy Rodger', 'Amelia Crouc...",['Sean Ellis'],"['hi', 'bg', 'tr', 'ja', 'fr', nan, 'es', 'en']"
31666,tt9908390,movie,Le lion,Una misión de locos,2020,6,95,Comedy,5.5,1600,"['Ophélia Kolb', 'Philippe Duquesne', 'Benoît ...",['Ludovic Colbeau-Justin'],"['tr', 'ja', 'fr', nan, 'en']"


# Predicción

---

In [7]:
final["Mi Nota"].value_counts().sort_index()

Mi Nota
1      562
2      514
3      548
4     1814
5     2981
6     8080
7     9571
8     6258
9      889
10     451
Name: count, dtype: int64

In [8]:
# =====================================
# Balanceo de clases
# =====================================

conteo = int((final["Mi Nota"].value_counts().min()) * 1.5)

def eleccion_clases(df, conteo):
    n = len(df)
    if n > conteo:
        return df.sample(n=conteo, random_state=100496072)
    else:
        return df

df_balanceado = (
    final
    .groupby("Mi Nota", group_keys=False)
    .apply(eleccion_clases, conteo)
)

df_balanceado

  .apply(eleccion_clases, conteo)


Unnamed: 0,tid,Tipo,Titulo,Titulo_ES,Año,Mi Nota,Duracion,Generos,Puntuacion,Num_Votos,Actores,Directores,Idioma
28,tt0010726,movie,The Spiders - Episode 1: The Golden Sea,Las arañas 1: El lago de oro,1919,1,130,"Adventure,Drama,Western",6.5,1597,"['Friedrich Kühne', 'Lil Dagover', 'Ressel Orl...",['Fritz Lang'],"[nan, 'bg']"
29,tt0010726,movie,The Spiders - Episode 1: The Golden Sea,Arañas,1919,1,130,"Adventure,Drama,Western",6.5,1597,"['Friedrich Kühne', 'Lil Dagover', 'Ressel Orl...",['Fritz Lang'],"[nan, 'bg']"
125,tt0018806,movie,The Crowd,Y el mundo marcha,1928,1,98,"Drama,Romance",8.0,9873,"['Bert Roach', 'Daniel G. Tomlinson', 'John D....",['King Vidor'],"['bg', 'ru', 'ja', 'tr', 'fr', nan, 'sr', 'en']"
175,tt0021890,movie,The Front Page,Primera plana,1931,1,101,"Comedy,Crime,Drama",6.7,3776,"['Adolphe Menjou', 'Matt Moore', 'Mary Brian',...",['Lewis Milestone'],"['ru', 'ca', 'ja', 'fr', nan, 'en']"
176,tt0021890,movie,The Front Page,Un gran reportaje,1931,1,101,"Comedy,Crime,Drama",6.7,3776,"['Adolphe Menjou', 'Matt Moore', 'Mary Brian',...",['Lewis Milestone'],"['ru', 'ca', 'ja', 'fr', nan, 'en']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31419,tt9239552,movie,Konosuba!: God's Blessing on This Wonderful Wo...,Eiga Kono subarashii sekai ni shukufuku o!: Ku...,2019,10,90,"Action,Adventure,Animation",7.8,6707,"['Yui Horie', 'Rie Takahashi', 'Aki Toyosaki',...",['Takaomi Kanasaki'],"['ja', nan, 'en']"
31428,tt9252468,movie,Mosul,Mosul,2019,10,101,"Action,Drama,Thriller",7.1,32567,"['Tarik Rmili', 'Thaer Al-Shayei', ""Is'haq Eli...",['Matthew Michael Carnahan'],"['hi', 'cmn', 'ja', 'tr', 'fr', nan, 'en']"
31588,tt9707456,movie,Nightshade,Nightshade,2022,10,90,"Crime,Fantasy,Horror",4.0,370,"['Jason Patric', 'Dina Meyer', 'Jaime Gallaghe...",['Landon Williams'],"['tr', 'hi', nan, 'en']"
31622,tt9794630,movie,The Vanished,Sense rastre,2020,10,115,"Action,Crime,Thriller",5.7,12991,"['Kk Heim', 'Jason Patric', 'Mitchell L. Johns...",['Peter Facinelli'],"['tr', 'ca', 'ja', 'fr', nan, 'en']"


In [9]:
# =====================================
# Entrenamiento del modelo para las predicciones
# =====================================

df_datos_reg = df_balanceado[['Tipo', 'Año', 'Duracion', 'Puntuacion', 'Num_Votos', 'Generos', 'Mi Nota']].copy()
dummies = df_datos_reg["Generos"].str.get_dummies(sep=",")
dummies2 = df_datos_reg["Tipo"].str.get_dummies(sep=",")
df_datos_reg = df_datos_reg.join(dummies2).drop(columns=["Tipo"])
df_datos_reg = df_datos_reg.join(dummies).drop(columns=["Generos"])
df_datos_reg['Mi Nota'] /= 10


col_gen = ['Año', 'Duracion', 'Puntuacion', 'Num_Votos', 'Mi Nota', 'movie', 'tvSeries', 'tvMiniSeries',
           'tvSpecial', 'Thriller', 'Horror', 'Documentary', 'Sci-Fi','Romance', 'Drama', 'Sport', 'War', 'Biography',
           'Musical', 'Crime', 'Music', 'Action', 'Short', 'History', 'Comedy', 'Mystery', 'Animation', 'Adventure',
           'Family', 'Fantasy', 'Game-Show', 'Adult', 'Western', 'Talk-Show','Film-Noir']

for col in col_gen:
    if col not in df_datos_reg.columns:
        df_datos_reg[col] = 0

df_datos_reg = df_datos_reg[col_gen]

X_reg = df_datos_reg.drop("Mi Nota", axis=1)
y_reg = df_datos_reg["Mi Nota"]


modelo = Pipeline([
    ('scaler', RobustScaler()),
    ('modelo', RandomForestRegressor(min_samples_leaf=2, min_samples_split=5, random_state=100496072))
])

modelo.fit(X_reg, y_reg)
y_pred_completo_reg = modelo.predict(X_reg)
rmse_rf_reg = root_mean_squared_error(y_reg, y_pred_completo_reg)

print("\nRMSE:", rmse_rf_reg)
print("R2:", r2_score(y_reg, y_pred_completo_reg))


RMSE: 0.11552140856989553
R2: 0.8225031748431688


In [10]:
# =====================================
# Realización de predicción
# =====================================

avatar = "tt0499549"
informacion = df_peliculas[(df_peliculas["tid"] == avatar)]

a_predecir = informacion[['Tipo', 'Año', 'Duracion', 'Puntuacion', 'Num_Votos', 'Generos']].copy()

dummies = a_predecir["Generos"].str.get_dummies(sep=",")
dummies2 = a_predecir["Tipo"].str.get_dummies(sep=",")
a_predecir = a_predecir.join(dummies2).drop(columns=["Tipo"])
a_predecir = a_predecir.join(dummies).drop(columns=["Generos"])

a_predecir = pd.DataFrame(a_predecir)

col_gen = ['Año', 'Duracion', 'Puntuacion', 'Num_Votos', 'movie', 'tvSeries', 'tvMiniSeries',
           'tvSpecial', 'Thriller', 'Horror', 'Documentary', 'Sci-Fi', 'Romance', 'Drama', 'Sport', 'War', 'Biography',
           'Musical', 'Crime', 'Music', 'Action', 'Short', 'History', 'Comedy', 'Mystery', 'Animation', 'Adventure',
           'Family', 'Fantasy', 'Game-Show', 'Adult', 'Western', 'Talk-Show', 'Film-Noir']

missing = [c for c in col_gen if c not in a_predecir.columns]
if missing:
    for c in missing:
        a_predecir[c] = 0

a_predecir = a_predecir[col_gen]

prediccion = modelo.predict(a_predecir)
print(f"La probabilidad de le guste es de un {int(prediccion[0] * 100)}%")

La probabilidad de le guste es de un 70%


# Recomendación

---

In [11]:
# =====================================
# Realización de recomendaciones
# =====================================

recomendacion = []

if final is not None:

    recomendacion = final.merge(df_nuevo["tid"], on="tid", how='left', indicator=True)
    recomendacion = recomendacion[recomendacion['_merge'] == 'left_only'].drop(columns=['_merge'])
    recomendacion = recomendacion[recomendacion['Mi Nota'] >= 7.5]


if len(recomendacion) == 0:
    print("Personal")
    nuevo_df_filtrado = df_nuevo[df_nuevo["nota_nuevo"] > 7.5]

    tipos = nuevo_df_filtrado["Tipo"].value_counts().index.tolist()

    tipo1 = tipos[0]
    tipo2 = tipos[1] if len(tipos) > 1 else tipo1

    tipos = "|".join([tipo1, tipo2])

    conteo_anos_5 = nuevo_df_filtrado["Año"].value_counts().head(5).index.tolist()

    ano_minimo = min(conteo_anos_5)
    ano_maximo = max(conteo_anos_5) + 5
    puntuacion_minima = nuevo_df_filtrado["Puntuacion"].min()

    conteo_generos = (
        nuevo_df_filtrado["Generos"]
        .str.split(",")
        .explode()
        .str.strip()
        .value_counts()
    )

    rows, columns = nuevo_df_filtrado.shape

    generos_mas15 = conteo_generos[conteo_generos > (rows * 0.15)]

    sigeneross = generos_mas15.index.tolist()
    sigeneros = "|".join(sigeneross)

    nogeneross = ["None"]
    nogeneros = "|".join(nogeneross)

    recomendacion = df_peliculas[
        (df_peliculas.Tipo.str.contains(tipos)) &
        (df_peliculas.Año >= ano_minimo) &
        (df_peliculas.Año <= ano_maximo) &
        (df_peliculas.Puntuacion >= puntuacion_minima) &
        (df_peliculas.Generos.str.contains(sigeneros, case=False, na=False)) &
        (~df_peliculas.Generos.str.contains(nogeneros, case=False, na=False))
    ]

    recomendacion = recomendacion.merge(nuevo_df_filtrado["tid"], on="tid", how='left', indicator=True)
    recomendacion = recomendacion[recomendacion['_merge'] == 'left_only'].drop(columns=['_merge'])

recomendacion = recomendacion.sort_values(["Num_Votos", "Puntuacion"], ascending=False)
recomendacion.head(100)

Unnamed: 0,tid,Tipo,Titulo,Titulo_ES,Año,Mi Nota,Duracion,Generos,Puntuacion,Num_Votos,Actores,Directores,Idioma
10029,tt0111161,movie,The Shawshank Redemption,Cadena perpetua,1994,9,142,Drama,9.3,3128869,"['Larry Brandenburg', 'Jeffrey DeMunn', 'Mark ...",['Frank Darabont'],"['hi', 'bg', 'he', 'cmn', 'sv', 'th', 'ru', 'y..."
10030,tt0111161,movie,The Shawshank Redemption,Cadena perpètua,1994,9,142,Drama,9.3,3128869,"['Larry Brandenburg', 'Jeffrey DeMunn', 'Mark ...",['Frank Darabont'],"['hi', 'bg', 'he', 'cmn', 'sv', 'th', 'ru', 'y..."
17462,tt0468569,movie,The Dark Knight,El caballero oscuro,2008,9,152,"Action,Crime,Drama",9.1,3104589,"['Gary Oldman', 'Cillian Murphy', 'Maggie Gyll...",['Christopher Nolan'],"['hi', 'bg', 'cmn', 'ta', 'th', 'ru', 'yue', '..."
17463,tt0468569,movie,The Dark Knight,El cavaller fosc,2008,9,152,"Action,Crime,Drama",9.1,3104589,"['Gary Oldman', 'Cillian Murphy', 'Maggie Gyll...",['Christopher Nolan'],"['hi', 'bg', 'cmn', 'ta', 'th', 'ru', 'yue', '..."
21773,tt1375666,movie,Inception,Origen,2010,9,148,"Action,Adventure,Sci-Fi",8.8,2758879,"['Tom Berenger', 'Joseph Gordon-Levitt', 'Pete...",['Christopher Nolan'],"['hi', 'bg', 'he', 'cmn', 'th', 'ta', 'ru', 'y..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19581,tt1049413,movie,Up,Up,2009,8,96,"Adventure,Animation,Comedy",8.3,1211924,"['David Kaye', 'Elie Docter', 'Jeremy Leary', ...","['Bob Peterson', 'Pete Docter']","['hi', 'bg', 'cmn', 'th', 'ru', 'yue', 'ca', '..."
5579,tt0081505,movie,The Shining,El resplandor,1980,8,146,"Drama,Horror",8.4,1201169,"['Tony Burton', 'Lia Beldam', 'Jack Nicholson'...",['Stanley Kubrick'],"['hi', 'bg', 'he', 'cmn', 'th', 'ru', 'yue', '..."
5580,tt0081505,movie,The Shining,La resplendor,1980,8,146,"Drama,Horror",8.4,1201169,"['Tony Burton', 'Lia Beldam', 'Jack Nicholson'...",['Stanley Kubrick'],"['hi', 'bg', 'he', 'cmn', 'th', 'ru', 'yue', '..."
15669,tt0371746,movie,Iron Man,Iron Man,2008,8,126,"Action,Adventure,Sci-Fi",7.9,1201008,"['Sayed Badreya', 'Faran Tahir', 'Bill Smitrov...",['Jon Favreau'],"['hi', 'bg', 'cmn', 'ru', 'yue', 'ca', 'ja', '..."
