In [1]:
#%pip install pyarrow

import os
import numpy as np
from dotenv import load_dotenv
import polars as pl
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer, FunctionTransformer, KBinsDiscretizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import lightgbm as lgb


from src.custom_transformers import *


In [2]:
# Pull notebook configuration from the project-local .env file.
load_dotenv("config/.env")
DATA_CLEANED = os.getenv("URL_CLEANED_DATA")

# Fail fast with an actionable message: os.getenv returns None when the
# variable is unset, and handing that to the parquet reader would raise an
# error that never mentions the missing configuration.
if not DATA_CLEANED:
    raise RuntimeError(
        "URL_CLEANED_DATA is not set; define it in config/.env or in the environment."
    )

print(f"Loading cleaned data from: {DATA_CLEANED}")
# Read with polars, then convert so `df` is a pandas DataFrame.
df = pl.read_parquet(DATA_CLEANED).to_pandas()
print("Data loaded successfully.")

Loading cleaned data from: https://huggingface.co/datasets/wojciechjurewicz/imdb/resolve/main/imdb_us_movies_cleaned.parquet
Data loaded successfully.


In [3]:
# Sanity-check the size of the loaded frame as (rows, columns).
df.shape

(390855, 11)

In [4]:
# Preview five random rows. The seed is pinned so the same rows are shown on
# every Restart & Run All instead of a different sample each execution.
df.sample(n=5, random_state=42)

Unnamed: 0,num__isAdult,num__startYear,num__runtimeMinutes,num__averageRating,num__numVotes,cat__title,cat__types,cat__genres,remainder__cast,remainder__directors,remainder__writers
115474,0.0,1930.0,81.0,6.1,858.0,Sin Takes a Holiday,imdbdisplay,"comedy,romance","[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Paul L. Stein', 'birthYear':...","[{'primaryName': 'Horace Jackson', 'birthYear'..."
165232,0.0,2022.0,75.0,6.9,76.0,First Time Caller,imdbdisplay,sci-fi,"[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'J.D. Brynn', 'birthYear': -1...","[{'primaryName': 'Mac Rogers', 'birthYear': -1..."
329932,0.0,1995.0,-1.0,-1.0,-1.0,The Grand Prize,imdbdisplay,missing,"[{'category': 'editor', 'job': 'missing', 'cha...","[{'primaryName': 'John M. Toutkaldjian', 'birt...",[]
267611,0.0,1914.0,-1.0,-1.0,-1.0,The Opened Shutters,imdbdisplay,drama,"[{'category': 'actor', 'job': 'missing', 'char...","[{'primaryName': 'Otis Turner', 'birthYear': 1...","[{'primaryName': 'Lois Weber', 'birthYear': 18..."
110091,0.0,2005.0,51.0,8.4,53.0,Brian Wilson Presents Smile,imdbdisplay,"documentary,music","[{'category': 'self', 'job': 'missing', 'chara...","[{'primaryName': 'John Anderson', 'birthYear':...",[]


In [5]:
# Print per-column dtypes, non-null counts and memory usage to confirm the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390855 entries, 0 to 390854
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   num__isAdult          390855 non-null  float64
 1   num__startYear        390855 non-null  float64
 2   num__runtimeMinutes   390855 non-null  float64
 3   num__averageRating    390855 non-null  float64
 4   num__numVotes         390855 non-null  float64
 5   cat__title            390855 non-null  object 
 6   cat__types            390855 non-null  object 
 7   cat__genres           390855 non-null  object 
 8   remainder__cast       390855 non-null  object 
 9   remainder__directors  390855 non-null  object 
 10  remainder__writers    390855 non-null  object 
dtypes: float64(5), object(6)
memory usage: 32.8+ MB


In [6]:
# Locations of the persisted artifacts; relative paths resolve against the
# working directory the notebook is run from.
PREPROCESSOR_FILE = 'preprocessor.joblib'
TFIDF_VECTORIZER_FILE = 'tfidf_vectorizer.joblib'

# NOTE(review): joblib.load unpickles its input, so only point these paths at
# trusted artifacts. Presumably the pickled objects reference classes from
# src.custom_transformers, which would then need to be importable here — confirm.
# The generator is consumed in order: preprocessor first, then the vectorizer.
FULL_PREPROCESSOR, TFIDF_VECTORIZER = (
    joblib.load(artifact_path)
    for artifact_path in (PREPROCESSOR_FILE, TFIDF_VECTORIZER_FILE)
)

print("Preprocessor and TF-IDF Vectorizer loaded successfully.")

Preprocessor and TF-IDF Vectorizer loaded successfully.
