In [1]:
# general libs
import pandas as pd
import numpy as np
import ast
import os
import cloudpickle
import pickle

# NLP
import gensim.downloader as api
from sentence_transformers import SentenceTransformer
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report, f1_score, hamming_loss
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier

# XGBoost, LightGBM
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMRegressor

# warnings
import warnings
warnings.filterwarnings('ignore')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# display all columns
pd.set_option('display.max_columns', None)

In [3]:
# load data
tmdb = pd.read_csv('../../Downloads/raw_tmdb.csv')
tmdb.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,237000000,https://www.avatar.com/movies/avatar,tt0499549,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,220000000,https://www.marvel.com/movies/the-avengers,tt0848228,en,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [4]:
# removed movies that are unreleased
tmdb = tmdb[tmdb['status'] == 'Released']
# remove movies that do not have a description
tmdb = tmdb[tmdb['overview'].notna()]
# change dates to pandas datetime format
tmdb['release_date'] = pd.to_datetime(tmdb['release_date'])
# original language is english
tmdb = tmdb[tmdb['original_language'] == 'en']
# drop columns
tmdb = tmdb.drop(columns = ['backdrop_path', 'homepage', 'original_title', 'poster_path', 'tagline', 'spoken_languages', 'original_language', 'status'])
# insert release_year
years = tmdb['release_date'].dt.year
tmdb.insert(4, 'release_year', years)
# insert profit
tmdb.insert(9, 'profit', tmdb['revenue'] - tmdb['budget'])
# remove 0 budget movies
tmdb = tmdb[tmdb['budget'] != 0]

In [5]:
# drop movies without title
tmdb = tmdb[tmdb['title'].notna()]
# drop movies without runtimes
tmdb = tmdb[tmdb['runtime'] != 0]
# drop movies without release date
tmdb = tmdb[tmdb['release_date'].notna()]
# convert release_year to int
tmdb['release_year'] = tmdb['release_year'].astype(int)
# rename popularity to tmdb_popularity
tmdb = tmdb.rename(columns={'popularity': 'tmdb_popularity'})

In [6]:
# reset index
tmdb = tmdb.reset_index(drop=True)

In [7]:
# get only non-zero revenues and export csv
tmdb_rev = tmdb[tmdb['revenue'] != 0]
tmdb_rev.to_csv('./data/tmdb_rev.csv')

In [8]:
# load dataset and clean missing values
df = pd.read_csv('./data/tmdb_rev.csv')
df = df.dropna(subset=['overview', 'budget', 'release_year', 'revenue', 'title'])
df['overview'] = df['overview'].fillna("")
df['budget'] = df['budget'].fillna(0)
df['release_year'] = df['release_year'].fillna(0)
df['revenue'] = df['revenue'].fillna(0)

# load pre-trained bert model to generate embeddings
bert = SentenceTransformer('all-MiniLM-L6-v2')
overview_embeddings = bert.encode(df['overview'].tolist(), show_progress_bar=True)

# prepare features and target
X = np.hstack([
    overview_embeddings,
    df[['budget', 'release_year']].values
])
y = df['revenue'].values

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train the lightgbm model
model = LGBMRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# custom accuracy metric
def calculate_accuracy(y_true, y_pred, tolerance=0.1):
    correct_predictions = np.abs(y_true - y_pred) <= (tolerance * y_true)
    accuracy = np.mean(correct_predictions)
    return accuracy

# evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# print results
print(f"mean absolute error (mae): {mae}")
print(f"r-squared (r²): {r2}")

# save model and embeddings
with open("models/revenue_model.pkl", "wb") as f:
    pickle.dump(model, f)
with open("models/bert_model.pkl", "wb") as f:
    pickle.dump(bert, f)

# save movie reference and embeddings
df[['title', 'overview']].to_csv("models/title_reference.csv", index=False)
np.save("models/overview_embeddings.npy", overview_embeddings)

# print completion message
print("✅ model training complete and saved.")

Batches:   0%|          | 0/286 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98283
[LightGBM] [Info] Number of data points in the train set: 7297, number of used features: 386
[LightGBM] [Info] Start training from score 69626749.761683
mean absolute error (mae): 55131345.73391748
r-squared (r²): 0.519923223320369
✅ model training complete and saved.


In [9]:
# load the dataset and filter out movies with zero budget
df = pd.read_csv('./data/tmdb_rev.csv')
df = df[df["budget"] != 0]

# remove rows where 'genres' column isn't a string
df = df[df['genres'].apply(lambda x: isinstance(x, str))]

# convert genre strings into lists of genres
list_genres = df["genres"].dropna().apply(lambda x: [g.strip() for g in x.split(",")])

# collect unique genres from the genre lists
unique_genres = set(genre for sublist in list_genres for genre in sublist)

# sort and select a few key genres
selected_genres = ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'Romance', 'Science Fiction', 'Thriller']
df['genre_list'] = df['genres'].apply(lambda x: [g.strip() for g in x.split(',') if g.strip() in selected_genres])

# download the sentiment analysis model
nltk.download('vader_lexicon')

# ---------- Load GloVe ----------
print("loading glove embeddings...")
glove_model = api.load("glove-wiki-gigaword-100")

# ---------- Custom Transformers ----------
class ListToWordsTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

class TextToWordList(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.fillna("").apply(lambda x: x.lower().split())

class PretrainedEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model, vector_size):
        self.model = model
        self.vector_size = vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        result = []
        for item in X:
            vectors = [self.model[word] for word in item if word in self.model]
            avg_vector = np.mean(vectors, axis=0) if vectors else np.zeros(self.vector_size)
            result.append(avg_vector)
        return np.array(result)

class SentimentExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.fillna("").apply(lambda x: self.analyzer.polarity_scores(x)['compound']).to_frame()

# ---------- Data Preparation ----------
# categorize revenue into groups like flop, average, hit, blockbuster
revenue_bins = [0, 1e7, 5e7, 1e8, np.inf]
bin_labels = [0, 1, 2, 3]
df['revenue_class'] = pd.cut(df['revenue'], bins=revenue_bins, labels=bin_labels)
df = df.dropna(subset=['revenue_class'])
y = df['revenue_class'].astype(int)

# select features for training
X = df[['runtime', 'adult', 'budget', 'overview', 'genre_list', 'production_companies', 'production_countries']]

# ---------- Train/Test Split ----------
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ---------- GloVe & Sentiment Pipelines ----------
# pipeline for transforming overview text to GloVe embeddings
overview_glove = Pipeline([
    ('text_to_words', TextToWordList()),
    ('glove_embed', PretrainedEmbeddingTransformer(glove_model, 100))
])

# pipeline for transforming genre lists to GloVe embeddings
genre_glove = Pipeline([
    ('list_to_words', ListToWordsTransformer()),
    ('glove_embed', PretrainedEmbeddingTransformer(glove_model, 100))
])

# pipeline for sentiment analysis of overview text
sentiment_pipe = Pipeline([
    ('sentiment', SentimentExtractor())
])

# ---------- Multi-hot Pipelines for company/country ----------
# custom transformer for multi-hot encoding lists (companies/countries)
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        parsed = X.apply(self._safe_parse)
        self.mlb.fit(parsed)
        return self

    def transform(self, X):
        parsed = X.apply(self._safe_parse)
        return self.mlb.transform(parsed)

    def _safe_parse(self, x):
        if isinstance(x, list):
            return x
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except (ValueError, SyntaxError):
                return [item.strip() for item in x.split(",") if item.strip()]
        return []

# pipelines for production companies and countries
company_pipe = Pipeline([
    ('mlb', MultiLabelBinarizerTransformer())
])

country_pipe = Pipeline([
    ('mlb', MultiLabelBinarizerTransformer())
])

# ---------- Preprocessing ColumnTransformer ----------
# column transformer to apply different preprocessing steps to each feature
preprocessor = ColumnTransformer(transformers=[
    ('runtime', StandardScaler(), ['runtime']),
    ('budget', StandardScaler(), ['budget']),
    ('adult', OneHotEncoder(drop='if_binary'), ['adult']),
    ('overview_glove', overview_glove, 'overview'),
    ('genre_glove', genre_glove, 'genre_list'),
    ('companies', company_pipe, 'production_companies'),
    ('countries', country_pipe, 'production_countries'),
    ('sentiment', sentiment_pipe, 'overview')
])

# ---------- Full Model Pipeline ----------
# full pipeline: preprocessing followed by XGBoost classifier
model = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    ))
])

# ---------- Train & Evaluate ----------
# train the model and evaluate its performance
print("training model...")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# print the model evaluation results
print("\naccuracy:", accuracy_score(y_test, y_pred))
print("\nclassification report:")
print(classification_report(y_test, y_pred, target_names=['Flop', 'Average', 'Hit', 'Blockbuster']))

# ---------- Save Model ----------
# save the trained model using cloudpickle
os.makedirs("models", exist_ok=True)

with open("models/final_model.pkl", "wb") as f:
    cloudpickle.dump(model, f)

print("✅ model saved using cloudpickle at models/final_model.pkl")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lasyayadlapati/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


loading glove embeddings...
training model...

accuracy: 0.6569506726457399

classification report:
              precision    recall  f1-score   support

        Flop       0.78      0.87      0.82       823
     Average       0.46      0.55      0.50       440
         Hit       0.22      0.03      0.05       190
 Blockbuster       0.65      0.63      0.64       331

    accuracy                           0.66      1784
   macro avg       0.53      0.52      0.50      1784
weighted avg       0.62      0.66      0.63      1784

✅ model saved using cloudpickle at models/final_model.pkl


In [10]:
# ---------- Load and Clean Data ----------
df = pd.read_csv('./data/tmdb_rev.csv')

# remove movies with zero budget
df = df[df["budget"] != 0]

# filter rows where genres column is not a string
df = df[df['genres'].apply(lambda x: isinstance(x, str))]

# parse genre strings into lists
selected_genres = ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'Romance', 'Science Fiction', 'Thriller']
df['genre_list'] = df['genres'].apply(lambda x: [g.strip() for g in x.split(',') if g.strip() in selected_genres])

# ---------- Create Multi-Label Genre Matrix ----------
mlb = MultiLabelBinarizer(classes=selected_genres)
Y_genres = mlb.fit_transform(df['genre_list'])

# ---------- Extract Sentiment Feature From Overview ----------
analyzer = SentimentIntensityAnalyzer()
df['sentiment'] = df['overview'].apply(lambda x: analyzer.polarity_scores(str(x))['compound'])

# ---------- Create TF-IDF of Movie Synopses ----------
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X_synopsis = tfidf.fit_transform(df['overview']).toarray()

# ---------- Stage 1: Genre Prediction ---------- 
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_synopsis, Y_genres, test_size=0.2, random_state=42)

# train a multi-output classifier for genre prediction
genre_model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
genre_model.fit(X_train_g, y_train_g)

# predict genres on the whole dataset (used in Stage 2)
genre_preds = genre_model.predict(X_synopsis)

# ---------- Combine Features for Revenue Prediction ----------
metadata_cols = ['budget', 'release_year']
metadata = df[metadata_cols].fillna(0)
scaler = StandardScaler()
X_metadata = scaler.fit_transform(metadata)

X_emotion = df[['sentiment']].values  # use sentiment feature for now
X_combined = np.hstack([genre_preds, X_emotion, X_synopsis, X_metadata])

# ---------- Stage 2: Revenue Prediction ---------- 
y_revenue = df['revenue'].values
y_log = np.log1p(y_revenue)  # apply log transformation to revenue

X_train_rev, X_test_rev, y_train_rev, y_test_rev = train_test_split(X_combined, y_log, test_size=0.2, random_state=42)

# train a model to predict revenue
revenue_model = XGBRegressor()
revenue_model.fit(X_train_rev, y_train_rev)

# make predictions and evaluate
log_preds = revenue_model.predict(X_test_rev)
revenue_preds = np.expm1(log_preds)
revenue_true = np.expm1(y_test_rev)

# evaluate revenue prediction accuracy
revenue_acc = revenue_model.score(X_test_rev, y_test_rev)
print("Revenue Accuracy: ", revenue_acc)
print("\n💰 Revenue Prediction Results:")
print("MAE:", mean_absolute_error(revenue_true, revenue_preds))

Revenue Accuracy:  0.832201670717976

💰 Revenue Prediction Results:
MAE: 51435013.073124126
