In [None]:
#%pip install pyarrow

import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import polars as pl
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, KBinsDiscretizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from sklearn import set_config


from src.custom_transformers import *


In [None]:
# Load environment configuration; the dataset location is read from URL_CLEANED_DATA.
load_dotenv("config/.env")
DATA_CLEANED = os.getenv("URL_CLEANED_DATA")

# Fail early with an actionable message: os.getenv returns None for an unset
# variable, which would otherwise surface as an obscure error inside the reader.
if not DATA_CLEANED:
    raise RuntimeError(
        "URL_CLEANED_DATA is not set; define it in config/.env or in the environment."
    )

print(f"Loading cleaned data from: {DATA_CLEANED}")
# Read the parquet file with polars, then convert to a pandas DataFrame,
# which is what the following cells operate on.
df = pl.read_parquet(DATA_CLEANED).to_pandas()
print("Data loaded successfully.")

Loading cleaned data from: https://huggingface.co/datasets/wojciechjurewicz/imdb/resolve/main/imdb_us_movies_cleaned.parquet


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(390855, 11)

In [None]:
# Preview five random rows; the fixed seed keeps the displayed sample
# identical across kernel restarts and re-runs.
df.sample(5, random_state=42)

Unnamed: 0,num__isAdult,num__startYear,num__runtimeMinutes,num__averageRating,num__numVotes,cat__title,cat__types,cat__genres,remainder__cast,remainder__directors,remainder__writers
22161,0.0,-1.0,-1.0,-1.0,-1.0,Double Happiness,missing,"comedy,drama,romance","[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Shari Albert', 'birthYear': ...","[{'primaryName': 'Laura LeeLun', 'birthYear': ..."
286560,0.0,2015.0,140.0,7.2,8602.0,Assassination,imdbdisplay,"action,drama,thriller","[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Dong-hoon Choi', 'birthYear'...","[{'primaryName': 'Dong-hoon Choi', 'birthYear'..."
235213,0.0,1980.0,89.0,6.3,104.0,I Hate Blondes,missing,comedy,"[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Giorgio Capitani', 'birthYea...","[{'primaryName': 'Laura Toscano', 'birthYear':..."
388851,0.0,2017.0,83.0,5.6,300.0,Take Off,imdbdisplay,"crime,drama","[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Ehsan Abdipur', 'birthYear':...","[{'primaryName': 'Ehsan Abdipur', 'birthYear':..."
286495,0.0,-1.0,-1.0,-1.0,-1.0,Ghost in the House,imdbdisplay,drama,"[{'category': 'writer', 'job': 'source_materia...","[{'primaryName': 'Frank Megna', 'birthYear': -...","[{'primaryName': 'Ernie Hudson', 'birthYear': ..."


In [None]:
# Summary of column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390855 entries, 0 to 390854
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   num__isAdult          390855 non-null  float64
 1   num__startYear        390855 non-null  float64
 2   num__runtimeMinutes   390855 non-null  float64
 3   num__averageRating    390855 non-null  float64
 4   num__numVotes         390855 non-null  float64
 5   cat__title            390855 non-null  object 
 6   cat__types            390855 non-null  object 
 7   cat__genres           390855 non-null  object 
 8   remainder__cast       390855 non-null  object 
 9   remainder__directors  390855 non-null  object 
 10  remainder__writers    390855 non-null  object 
dtypes: float64(5), object(6)
memory usage: 32.8+ MB


In [None]:
# File names of the persisted artifacts (relative paths, resolved against the working directory).
PREPROCESSOR_FILE = 'preprocessor.joblib'
TFIDF_VECTORIZER_FILE = 'tfidf_vectorizer.joblib'

# joblib.load unpickles arbitrary Python objects -- only load artifacts from a trusted source.
# The preprocessor is loaded first, then the vectorizer.
FULL_PREPROCESSOR, TFIDF_VECTORIZER = (
    joblib.load(artifact_path)
    for artifact_path in (PREPROCESSOR_FILE, TFIDF_VECTORIZER_FILE)
)

print("Preprocessor and TF-IDF Vectorizer loaded successfully.")

Preprocessor and TF-IDF Vectorizer loaded successfully.


In [None]:
# Per-column count of entries equal to +inf or -inf.
infinite_mask = df.isin([np.inf, -np.inf])
print(infinite_mask.sum())

num__isAdult            0
num__startYear          0
num__runtimeMinutes     0
num__averageRating      0
num__numVotes           0
cat__title              0
cat__types              0
cat__genres             0
remainder__cast         0
remainder__directors    0
remainder__writers      0
dtype: int64


In [None]:
# Column groups used by the preprocessing steps.
numeric_cols = [
    'num__isAdult',
    'num__startYear',
    'num__runtimeMinutes',
    'num__averageRating',
    'num__numVotes',
]
corpus_cols = [
    'cat__title',
    'cat__genres',
    'remainder__cast',
    'remainder__directors',
    'remainder__writers',
]
binner_cols = [
    'num__startYear',
    'num__runtimeMinutes',
    'num__averageRating',
]
onehot_cols = ['cat__types']
multilabel_cols = ['cat__genres']

# Ask scikit-learn transformers to return pandas containers where they support it.
set_config(transform_output="pandas")


In [None]:
# -1.0 shows up in the numeric columns as a placeholder value (see the sampled rows);
# turn it into NaN so it is treated as missing rather than as a real measurement.
df[numeric_cols] = df[numeric_cols].replace(
    to_replace=-1.0,
    value=np.nan,
)

In [None]:
# Fit the loaded preprocessor on the full frame and transform it in one pass.
# NOTE(review): FULL_PREPROCESSOR comes from a persisted joblib artifact; if it was
# saved already fitted, fit_transform re-learns its parameters here and `transform`
# may be the intended call -- confirm how the artifact was produced.
df_preprocessed = FULL_PREPROCESSOR.fit_transform(df)

In [None]:
# Dimensions of the preprocessor output as (rows, features).
df_preprocessed.shape

(390855, 72)

In [None]:
# Wrap the preprocessor output in a DataFrame; no explicit column names are passed here.
# NOTE(review): presumably the output is a plain array, so columns end up as
# positional integers -- confirm whether feature names should be attached.
df_processed = pd.DataFrame(df_preprocessed)

In [None]:
# Preview five random processed rows; the fixed seed keeps the displayed
# sample identical across kernel restarts and re-runs.
df_processed.sample(5, random_state=42)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
321139,To the Ends of the Earth drama,-0.150341,0.494668,1.231243,0.951043,-0.464029,-0.614289,-0.788115,-0.788115,1.173559,...,0,0,0,0,0,0,0,0,0,0
374702,Scarface: The Story of Willie Valera crime dra...,-0.150341,-2.153685,-1.460172,-1.194338,2.15504,1.627898,1.26885,1.26885,-1.038056,...,0,0,0,0,0,0,0,0,0,0
151734,Clockwise comedy,-0.150341,0.451381,0.69296,0.919023,-0.464029,-0.614289,-0.788115,-0.788115,1.75632,...,0,0,0,0,0,0,0,0,0,0
67274,A Wizard's Tale adventure animation comedy,-0.150341,0.493356,0.715388,0.27861,-0.464029,-0.614289,-0.788115,-0.788115,0.957269,...,0,0,0,0,0,0,0,0,0,0
276790,Vacaciones en familia comedy,-0.150341,0.474992,0.715388,0.27861,-0.464029,-0.614289,-0.788115,-0.788115,1.61315,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Show only the column labels of the processed frame instead of rendering the
# whole frame; this is the expression whose result (a RangeIndex) the cell reports.
df_processed.columns

RangeIndex(start=0, stop=72, step=1)