##  Imports

In [1]:
import polars as pl
import pandas as pd
import numpy as np
import os
import joblib
from dotenv import load_dotenv

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sentence_transformers import SentenceTransformer

from sklearn import set_config
set_config(transform_output="pandas") 

  from .autonotebook import tqdm as notebook_tqdm


## 1. Data Loading 

In [2]:
load_dotenv("config/.env")
DATA_CLEANED = os.getenv("URL_CLEANED_DATA")

print(f"Loading cleaned data from: {DATA_CLEANED}")
df = pl.read_parquet(DATA_CLEANED)

df_sample = df.sample(n=1000, seed=42)
print(f"Full dataset shape: {df.shape}")
print(f"Sample dataset shape: {df_sample.shape}")


df_sample_pd = df_sample.to_pandas()

print(f"\nSchema: {df_sample.schema}")

Loading cleaned data from: https://huggingface.co/datasets/wojciechjurewicz/imdb/resolve/main/imdb_us_movies_cleaned.parquet
Full dataset shape: (390855, 11)
Sample dataset shape: (1000, 11)

Schema: Schema([('num__isAdult', Float64), ('num__startYear', Float64), ('num__runtimeMinutes', Float64), ('num__averageRating', Float64), ('num__numVotes', Float64), ('cat__title', String), ('cat__types', String), ('cat__genres', String), ('remainder__cast', List(Struct({'category': String, 'job': String, 'characters': String, 'primaryName': String, 'primaryProfession': String, 'birthYear': Int64, 'deathYear': Int64}))), ('remainder__directors', List(Struct({'primaryName': String, 'birthYear': Int64, 'deathYear': Int64}))), ('remainder__writers', List(Struct({'primaryName': String, 'birthYear': Int64, 'deathYear': Int64})))])


## 2. Define Feature Groups

In [3]:
numeric_features = [c for c in df_sample_pd.columns if c.startswith('num__')]
list_features = [c for c in df_sample_pd.columns if c.startswith('remainder__')]

categorical_features = ['cat__types'] 
text_features = ['cat__title', 'cat__genres'] 

print(f"Numerical Features: {numeric_features}")
print(f"Categorical Features: {categorical_features}")
print(f"Text Features: {text_features}")
print(f"List Features: {list_features}")

Numerical Features: ['num__isAdult', 'num__startYear', 'num__runtimeMinutes', 'num__averageRating', 'num__numVotes']
Categorical Features: ['cat__types']
Text Features: ['cat__title', 'cat__genres']
List Features: ['remainder__cast', 'remainder__directors', 'remainder__writers']


## 3. Structured Data Pipeline (Numeric & Categorical)

### Numerical Pipeline (Scaling)

In [4]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

### Categorical Pipeline (One-Hot Encoding)

In [5]:
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

## 4. Custom Transformers (Lists & Text Embeddings)

###  List Feature Extractor

In [6]:
class ListCounter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self 

    def transform(self, X_series):
        counts = X_series.apply(lambda x: len(x) if isinstance(x, list) else 0)
        return counts.values.reshape(-1, 1)

print("Custom 'ListCounter' transformer created.")

Custom 'ListCounter' transformer created.


### Text Embedding Transformer

In [7]:
class SentenceEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.feature_names_out = None

    def fit(self, X, y=None):
        column_name = X.columns[0] 
        embedding_dim = self.model.get_sentence_embedding_dimension()
        self.feature_names_out = [f"{column_name}_emb_{i}" for i in range(embedding_dim)]
        return self

    def transform(self, X_df):
        sentences = X_df.iloc[:, 0].fillna("missing").tolist()
        embeddings = self.model.encode(sentences, show_progress_bar=False)
        return embeddings

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_out, dtype=object)

print("Custom 'SentenceEmbeddingTransformer' created.")

Custom 'SentenceEmbeddingTransformer' created.


## 5. Pipeline Integration

### Helper Function (Genre Tokenizer)

In [8]:
def comma_tokenizer(text_str):
    if text_str and isinstance(text_str, str):
        return [token.strip() for token in text_str.split(',')]
    return []

### Pipeline A: Classic TF-IDF

In [9]:
transformers_tfidf = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
    ('tfidf_title', TfidfVectorizer(max_features=1000, stop_words='english'), 'cat__title'),
    ('tfidf_genres', TfidfVectorizer(tokenizer=comma_tokenizer, max_features=50), 'cat__genres'),
    ('list_cast', ListCounter(), 'remainder__cast'),
    ('list_directors', ListCounter(), 'remainder__directors'),
    ('list_writers', ListCounter(), 'remainder__writers')
]

preprocessor_tfidf = ColumnTransformer(
    transformers=transformers_tfidf,
    remainder='drop', 
    sparse_threshold=0 
)

pipeline_tfidf = Pipeline(steps=[
    ('preprocessor', preprocessor_tfidf)
])

print("--- Pipeline A (TF-IDF) created. ---")

--- Pipeline A (TF-IDF) created. ---


### Pipeline B: Modern Embeddings

In [10]:
transformers_embeddings = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
    ('embedding_title', SentenceEmbeddingTransformer(), ['cat__title']), 
    ('tfidf_genres', TfidfVectorizer(tokenizer=comma_tokenizer, max_features=50), 'cat__genres'),
    ('list_cast', ListCounter(), 'remainder__cast'),
    ('list_directors', ListCounter(), 'remainder__directors'),
    ('list_writers', ListCounter(), 'remainder__writers')
]

preprocessor_embeddings = ColumnTransformer(
    transformers=transformers_embeddings,
    remainder='drop' 
)

pipeline_embeddings = Pipeline(steps=[
    ('preprocessor', preprocessor_embeddings)
])

print("--- Pipeline B (Embeddings) created. ---")

--- Pipeline B (Embeddings) created. ---


## 6. Testing & Validation

### Test Pipeline A (TF-IDF)

In [11]:
print("--- Validating Pipeline A (TF-IDF) on sample data... ---")
print(f"Input shape: {df_sample_pd.shape}")

X_processed_tfidf = pipeline_tfidf.fit_transform(df_sample_pd)

print(f"\nOutput shape: {X_processed_tfidf.shape}")
X_processed_tfidf.head()

--- Validating Pipeline A (TF-IDF) on sample data... ---
Input shape: (1000, 11)

Output shape: (1000, 1042)




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1032,1033,1034,1035,1036,1037,1038,1039,1040,1041
0,-0.160128,0.466652,1.223372,0.220856,-0.14229,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.160128,0.386885,0.600247,1.013061,0.013293,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.160128,0.320872,0.088395,-1.250383,-0.142474,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.160128,0.326373,0.088395,1.041354,-0.142153,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.160128,0.459775,0.199667,-0.146954,-0.141403,0.0,0.0,0.0,1.0,0.0,...,0.619687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Interpretation (TF-IDF)

* **Input:** (1000, 11)
* **Output:** (1000, 1042)

The pipeline works. Our 11 input features were transformed into 1042 features.
* `numeric`: 5 features
* `categorical`: 8 features
* `tfidf_title`: 1000 features
* `tfidf_genres`: 26 features (Note: We set `max_features=50`, but the 1000-row sample only contained 26 unique genres)
* `ListCounter` (x3): 3 features

This is a very wide, dense matrix.

### Test Pipeline B (Embeddings)

In [12]:
print("\n\n--- Validating Pipeline B (Embeddings) on sample data... ---")
print(f"Input shape: {df_sample_pd.shape}")

X_processed_embed = pipeline_embeddings.fit_transform(df_sample_pd)

print(f"\nOutput shape: {X_processed_embed.shape}")
X_processed_embed.head()



--- Validating Pipeline B (Embeddings) on sample data... ---
Input shape: (1000, 11)

Output shape: (1000, 426)




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,416,417,418,419,420,421,422,423,424,425
0,-0.160128,0.466652,1.223372,0.220856,-0.14229,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.160128,0.386885,0.600247,1.013061,0.013293,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.160128,0.320872,0.088395,-1.250383,-0.142474,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.160128,0.326373,0.088395,1.041354,-0.142153,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.160128,0.459775,0.199667,-0.146954,-0.141403,0.0,0.0,0.0,1.0,0.0,...,0.619687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Interpretation (Embeddings)

* **Input:** (1000, 11)
* **Output:** (1000, 426)

This pipeline also works. Our 11 input features were transformed into 426 features.
* `numeric`: 5 features
* `categorical`: 8 features
* `embedding_title`: 384 features (the MiniLM vector)
* `tfidf_genres`: 26 features (Note: We set `max_features=50`, but the 1000-row sample only contained 26 unique genres)
* `ListCounter` (x3): 3 features

This is a much denser and smaller feature matrix. This is generally preferred for modern models.

## 7. Save Pipeline

In [13]:
joblib.dump(pipeline_embeddings, 'imdb_embedding_pipeline.joblib') 

print("Pipeline saved to 'imdb_embedding_pipeline.joblib'")

# To load it later for inference:
# loaded_pipeline = joblib.load('imdb_embedding_pipeline.joblib')
# predictions = loaded_pipeline.transform(new_data)

Pipeline saved to 'imdb_embedding_pipeline.joblib'
