In [1]:
#%pip install pyarrow

import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import polars as pl
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, KBinsDiscretizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from sklearn import set_config


from src.custom_transformers import *


In [2]:
# Read the dataset location from the project's .env file / process environment.
load_dotenv("config/.env")
DATA_CLEANED = os.getenv("URL_CLEANED_DATA")
if not DATA_CLEANED:
    # os.getenv returns None when the variable is missing; fail here with a clear
    # message instead of passing None on to the parquet reader.
    raise RuntimeError(
        "URL_CLEANED_DATA is not set; define it in config/.env or in the environment."
    )

print(f"Loading cleaned data from: {DATA_CLEANED}")
# Read with polars, then convert to pandas for the scikit-learn steps below.
df = pl.read_parquet(DATA_CLEANED).to_pandas()
print("Data loaded successfully.")

Loading cleaned data from: https://huggingface.co/datasets/wojciechjurewicz/imdb/resolve/main/imdb_us_movies_cleaned.parquet
Data loaded successfully.


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(390855, 11)

In [4]:
# Preview five random rows; random_state is fixed so a fresh
# Restart & Run All shows the same rows every time.
df.sample(5, random_state=42)

Unnamed: 0,num__isAdult,num__startYear,num__runtimeMinutes,num__averageRating,num__numVotes,cat__title,cat__types,cat__genres,remainder__cast,remainder__directors,remainder__writers
198885,0.0,1937.0,62.0,5.1,15.0,Jolly Paupers,missing,"comedy,musical,romance","[{'category': 'director', 'job': 'missing', 'c...","[{'primaryName': 'Leon Jeannot', 'birthYear': ...","[{'primaryName': 'Jecheskiel Mosze Neuman', 'b..."
29745,0.0,-1.0,-1.0,-1.0,-1.0,Two Faced,missing,missing,"[{'category': 'casting_director', 'job': 'miss...","[{'primaryName': 'Taraji P. Henson', 'birthYea...","[{'primaryName': 'Cat Wilkins', 'birthYear': -..."
252341,0.0,2023.0,91.0,6.1,11.0,Code of Envy,imdbdisplay,drama,"[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Robert L. Parker III', 'birt...","[{'primaryName': 'Kenya Hendricks', 'birthYear..."
159252,0.0,2024.0,140.0,5.4,139.0,Marivillin Gopurangal,imdbdisplay,drama,"[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Arun Bose', 'birthYear': -1,...","[{'primaryName': 'Pramod Mohan', 'birthYear': ..."
216338,0.0,-1.0,-1.0,-1.0,-1.0,Tough As They Come,missing,"action,biography,drama","[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Sylvester Stallone', 'birthY...","[{'primaryName': 'Ian Mackenzie Jeffers', 'bir..."


In [5]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390855 entries, 0 to 390854
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   num__isAdult          390855 non-null  float64
 1   num__startYear        390855 non-null  float64
 2   num__runtimeMinutes   390855 non-null  float64
 3   num__averageRating    390855 non-null  float64
 4   num__numVotes         390855 non-null  float64
 5   cat__title            390855 non-null  object 
 6   cat__types            390855 non-null  object 
 7   cat__genres           390855 non-null  object 
 8   remainder__cast       390855 non-null  object 
 9   remainder__directors  390855 non-null  object 
 10  remainder__writers    390855 non-null  object 
dtypes: float64(5), object(6)
memory usage: 32.8+ MB


In [6]:
# Paths of the persisted pipeline artifacts (relative to the working directory).
PREPROCESSOR_FILE, TFIDF_VECTORIZER_FILE = 'preprocessor.joblib', 'tfidf_vectorizer.joblib'

# joblib.load unpickles the files, so only load artifacts from a trusted source.
# NOTE(review): presumably the classes from src.custom_transformers must be
# importable for unpickling to succeed — confirm.
FULL_PREPROCESSOR, TFIDF_VECTORIZER = (
    joblib.load(artifact_path)
    for artifact_path in (PREPROCESSOR_FILE, TFIDF_VECTORIZER_FILE)
)

print("Preprocessor and TF-IDF Vectorizer loaded successfully.")

Preprocessor and TF-IDF Vectorizer loaded successfully.


In [7]:
# Per-column count of +inf / -inf cells.
print(df.isin([np.inf, -np.inf]).sum())

num__isAdult            0
num__startYear          0
num__runtimeMinutes     0
num__averageRating      0
num__numVotes           0
cat__title              0
cat__types              0
cat__genres             0
remainder__cast         0
remainder__directors    0
remainder__writers      0
dtype: int64


In [8]:
# Column groups of the cleaned frame.
# numeric_cols is used by the sentinel-replacement cell that follows.
numeric_cols = [
    'num__isAdult',
    'num__startYear',
    'num__runtimeMinutes',
    'num__averageRating',
    'num__numVotes',
]
# NOTE(review): the remaining groups are not referenced in the visible cells;
# presumably they mirror the column selection inside the saved preprocessor — confirm.
corpus_cols = [
    'cat__title',
    'cat__genres',
    'remainder__cast',
    'remainder__directors',
    'remainder__writers',
]
binner_cols = [
    'num__startYear',
    'num__runtimeMinutes',
    'num__averageRating',
]
onehot_cols = ['cat__types']
multilabel_cols = ['cat__genres']

# Ask scikit-learn transformers to return pandas DataFrames instead of arrays.
set_config(transform_output="pandas")


In [9]:
# Turn the -1.0 placeholder in the numeric columns into NaN.
# Note: this overwrites df in place, so the earlier displays of df
# (which still show -1.0) no longer reflect its current contents.
df[numeric_cols] = df[numeric_cols].replace(-1.0, np.nan)

In [10]:
# Run the loaded preprocessor over the full frame.
# NOTE(review): FULL_PREPROCESSOR comes from preprocessor.joblib, and fit_transform
# fits it again on df rather than reusing any fitted state stored in that file.
# If the saved object is already fitted and the intent is to reuse that fit,
# this should be .transform(df) — confirm.
df_preprocessed = FULL_PREPROCESSOR.fit_transform(df)

In [11]:
# (rows, columns) of the preprocessor output.
df_preprocessed.shape

(390855, 72)

In [12]:
# Wrap the preprocessor output in a pandas DataFrame.
# NOTE(review): with set_config(transform_output="pandas") active, the output is
# presumably already a DataFrame, which would make this wrap redundant — confirm.
df_processed = pd.DataFrame(df_preprocessed)

In [13]:
# Preview five random processed rows; random_state is fixed so a fresh
# Restart & Run All shows the same rows every time.
df_processed.sample(5, random_state=42)

Unnamed: 0,search_corpus__searchable_text,ranking_numeric__num__isAdult,ranking_numeric__num__startYear,ranking_numeric__num__runtimeMinutes,ranking_numeric__num__averageRating,ranking_numeric__is_missing_num__startYear,ranking_numeric__is_missing_num__runtimeMinutes,ranking_numeric__is_missing_num__averageRating,ranking_numeric__is_missing_num__numVotes,ranking_numeric__num__numVotes_log,...,filter_multilabel_genres__news,filter_multilabel_genres__reality-tv,filter_multilabel_genres__romance,filter_multilabel_genres__sci-fi,filter_multilabel_genres__short,filter_multilabel_genres__sport,filter_multilabel_genres__talk-show,filter_multilabel_genres__thriller,filter_multilabel_genres__war,filter_multilabel_genres__western
152736,What Is True Faith in God?: Faith in God drama,-0.150341,0.492044,-1.460172,1.591456,-0.464029,1.627898,-0.788115,-0.788115,0.351461,...,0,0,0,0,0,0,0,0,0,0
65302,The Last Vampyre on Earth horror,-0.150341,0.486798,0.670531,0.086487,-0.464029,-0.614289,-0.788115,-0.788115,0.314275,...,0,0,0,0,0,0,0,0,0,0
183547,The Ghost Creeps comedy mystery thriller,-0.150341,0.391042,0.064963,0.566796,-0.464029,-0.614289,-0.788115,-0.788115,0.873015,...,0,0,0,0,0,0,0,1,0,0
361636,Pick-up Summer comedy,-0.150341,0.443511,0.760245,0.182548,-0.464029,-0.614289,-0.788115,-0.788115,0.764558,...,0,0,0,0,0,0,0,0,0,0
378326,Journey to Squaxin Island documentary family,-0.150341,0.486798,-0.271464,-1.194338,-0.464029,-0.614289,1.26885,1.26885,-1.038056,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Display the processed frame (pandas abbreviates the rendered rows and columns).
df_processed

Unnamed: 0,search_corpus__searchable_text,ranking_numeric__num__isAdult,ranking_numeric__num__startYear,ranking_numeric__num__runtimeMinutes,ranking_numeric__num__averageRating,ranking_numeric__is_missing_num__startYear,ranking_numeric__is_missing_num__runtimeMinutes,ranking_numeric__is_missing_num__averageRating,ranking_numeric__is_missing_num__numVotes,ranking_numeric__num__numVotes_log,...,filter_multilabel_genres__news,filter_multilabel_genres__reality-tv,filter_multilabel_genres__romance,filter_multilabel_genres__sci-fi,filter_multilabel_genres__short,filter_multilabel_genres__sport,filter_multilabel_genres__talk-show,filter_multilabel_genres__thriller,filter_multilabel_genres__war,filter_multilabel_genres__western
0,Mudflow documentary,-0.150341,0.493356,0.334104,1.303270,-0.464029,-0.614289,-0.788115,-0.788115,0.131212,...,0,0,0,0,0,0,0,0,0,0
1,Nova Seed adventure animation fantasy,-0.150341,0.490733,-0.024751,0.919023,-0.464029,-0.614289,-0.788115,-0.788115,0.735960,...,0,0,0,0,0,0,0,0,0,0
2,Hell Bent drama western,-0.150341,0.362185,-0.271464,0.726899,-0.464029,-0.614289,-0.788115,-0.788115,0.779887,...,0,0,0,0,0,0,0,0,0,1
3,Cocaine Stepdad comedy,-0.150341,0.501226,0.760245,0.887002,-0.464029,-0.614289,-0.788115,-0.788115,0.120404,...,0,0,0,0,0,0,0,0,0,0
4,Dissapointing Luke comedy,-0.150341,-2.153685,-1.460172,-1.194338,2.155040,1.627898,1.268850,1.268850,-1.038056,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390850,Touch Chesi Chudu action,-0.150341,0.493356,1.836811,0.310631,-0.464029,-0.614289,-0.788115,-0.788115,1.042613,...,0,0,0,0,0,0,0,0,0,0
390851,The Ghost of Crossbone Canyon drama western,-0.150341,0.406783,-0.204178,-1.194338,-0.464029,-0.614289,1.268850,1.268850,-1.038056,...,0,0,0,0,0,0,0,0,0,1
390852,Searing Summer drama,-0.150341,0.492044,0.334104,0.598816,-0.464029,-0.614289,-0.788115,-0.788115,0.854490,...,0,0,0,0,0,0,0,0,0,0
390853,An Injury to One documentary,-0.150341,0.472369,-0.271464,1.271249,-0.464029,-0.614289,-0.788115,-0.788115,0.556193,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Vectorise the searchable-text column with the loaded TF-IDF vectorizer.
# NOTE(review): TFIDF_VECTORIZER comes from tfidf_vectorizer.joblib, and fit_transform
# fits it again on this column rather than reusing any fitted vocabulary stored in
# that file. If the saved vocabulary is meant to be reused, this should be
# .transform(...) — confirm.
tfidf_vectors = TFIDF_VECTORIZER.fit_transform(df_processed['search_corpus__searchable_text'])

In [16]:
# (documents, TF-IDF features) of the vectorised corpus.
tfidf_vectors.shape

(390855, 2000)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity(tfidf_vectors) would build a dense n x n float64 matrix.
# For the (390855, 2000) TF-IDF matrix that is 390855**2 * 8 bytes, roughly 1.2 TB,
# which cannot be held in memory. Similarities are therefore computed for a bounded
# number of query rows at a time against the full corpus.
SIMILARITY_PREVIEW_ROWS = 100  # query rows materialised in `cosine_similarities`
SIMILARITY_BATCH_ROWS = 64     # query rows scored per batch inside the helper


def top_k_cosine_neighbors(query_vectors, corpus_vectors, k=10, batch_rows=SIMILARITY_BATCH_ROWS):
    """Find the k most cosine-similar corpus rows for every query row.

    Queries are scored in batches of ``batch_rows`` so that only a
    ``batch_rows x n_corpus`` block of similarities exists at any time.

    Parameters
    ----------
    query_vectors : array-like or sparse matrix, shape (n_queries, n_features)
        Rows to find neighbours for. Must support row slicing.
    corpus_vectors : array-like or sparse matrix, shape (n_corpus, n_features)
        Rows to search in.
    k : int, default 10
        Number of neighbours per query; capped at ``n_corpus``.
    batch_rows : int, default SIMILARITY_BATCH_ROWS
        Number of query rows scored per batch.

    Returns
    -------
    neighbor_idx : ndarray of int, shape (n_queries, k)
        Corpus row positions, most similar first.
    neighbor_sim : ndarray of float, shape (n_queries, k)
        Matching cosine similarities.

    Raises
    ------
    ValueError
        If ``k`` or ``batch_rows`` is smaller than 1.

    Notes
    -----
    When the query rows are themselves part of the corpus, each row is
    compared with itself too, so it normally appears among its own neighbours.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if batch_rows < 1:
        raise ValueError("batch_rows must be at least 1")

    n_queries = query_vectors.shape[0]
    k = min(k, corpus_vectors.shape[0])
    neighbor_idx = np.empty((n_queries, k), dtype=np.intp)
    neighbor_sim = np.empty((n_queries, k), dtype=np.float64)

    for start in range(0, n_queries, batch_rows):
        stop = min(start + batch_rows, n_queries)
        block = cosine_similarity(query_vectors[start:stop], corpus_vectors)
        # argpartition selects the k largest entries per row without sorting
        # every column; only those k candidates are then sorted.
        candidates = np.argpartition(-block, k - 1, axis=1)[:, :k]
        candidate_sims = np.take_along_axis(block, candidates, axis=1)
        order = np.argsort(-candidate_sims, axis=1)
        neighbor_idx[start:stop] = np.take_along_axis(candidates, order, axis=1)
        neighbor_sim[start:stop] = np.take_along_axis(candidate_sims, order, axis=1)

    return neighbor_idx, neighbor_sim


# Similarity of the first SIMILARITY_PREVIEW_ROWS documents against the whole corpus
# (shape: SIMILARITY_PREVIEW_ROWS x n_documents). For neighbours of other rows, call
# top_k_cosine_neighbors(tfidf_vectors[rows], tfidf_vectors).
cosine_similarities = cosine_similarity(tfidf_vectors[:SIMILARITY_PREVIEW_ROWS], tfidf_vectors)