In [58]:
# Libraries

import pandas as pd
import numpy as np
import matplotlib
import cpi
pd.set_option('display.max_rows', 100)

In [59]:
# Inflation Dictionary

cpi_d = {}

for year in range(1913, 2024):
    cpi_d[year] = cpi.inflate(1, year, to = 2023)
    
cpi_d[1911] = cpi_d[1913] # The api only has data to 1913, so 1 movie released in 1911 is not included
cpi_d[2024] = 1/1.024 # Bring 2024 down to 2023

KeyboardInterrupt: 

In [None]:
# Data Cleaning

df = pd.read_parquet('..\Data\IMDB.parquet') 

df = df.loc[(df.status == 'Released') & # Only include Released Movies
            (df.adult == False) & # Filter out adult films
            (df.vote_count > 0) & # Only include films with viewer reviews
            (df.revenue > 0) & # Only include revenue generating movies
            (pd.notna(df.release_date))] # Only include movies with release dates (included to resolve a small number of error cases)

df['release_date'] = pd.to_datetime(df.release_date, format = '%Y-%m-%d') # Convert Release Date to a Date Time Object

df['release_year'] = df.release_date.dt.year # Pull year out of release date
df['release_month'] = df.release_date.dt.month # Pull month out of release date 

df['inflation_factor'] = df['release_year'].map(cpi_d) # Determine an inflation factor for each year

df['adjusted_revenue'] = df['revenue'] * df['inflation_factor'] # Adjust Revenue for inflation
df['adjusted_budget'] = df['budget'] * df['inflation_factor'] # Adjust Budget for inflation

df['original_english'] = df['original_language'] == 'en' # Feature for if a movie's original language is english

df.loc[pd.isna(df.Certificate), 'Certificate'] = 'None' # Mark Movies that do not have a Rating (ex. R, PG)

ratings_counts = df['Certificate'].value_counts()
rating_filter = lambda x: x if ratings_counts[x] > 100 else 'Other' # Only include commonly used Ratings

df['Certificate'] = df['Certificate'].map(rating_filter) # Apply Rating Filter

stars = lambda x: 'Many' if pd.notna(x.Star2) else ('One' if pd.notna(x.Star1) else 'None') # Mark Movies having 0, 1, or many star actors
df['stars'] = df.apply(stars, axis = 1) # Apply Star Function

df['listed_writer'] = pd.notna(df.Writer) # Feature tracking if the writer is listed
df['listed_photography'] = pd.notna(df.Director_of_Photography) # Feature tracking if the D.O.P is listed
df['listed_producers'] = pd.notna(df.Producers) # Feature tracking if the producer is listed
df['listed_composer'] = pd.notna(df.Music_Composer) # Feature tracking if the composer is listed

import nltk

from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

df['keyword_sentiment'] = [sia.polarity_scores(x)['compound'] for x in df.keywords] # Get Sentiment Analysis for Keywords

  df = pd.read_parquet('..\Data\IMDB.parquet')
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Max\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Thresholds
    # The minimum amount of values each token should have in the dataset to be included

lang_threshold = 500
com_threshold = 100
country_threshold = 100
keyword_threshold = 100

In [None]:
# Languages

df.loc[df.spoken_languages.isna(), 'spoken_languages'] = 'Missing' # Mark observations with no listed language
df.loc[df.spoken_languages == 'No Language', 'spoken_languages'] = 'None' # Mark silent films

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_lang = CountVectorizer( analyzer='word' ) # Vectorize by work

vector_lang = vectorizer_lang.fit_transform(df.spoken_languages)

dfLanguages = pd.DataFrame(vector_lang.toarray(), columns=vectorizer_lang.get_feature_names_out() )

dfLanguages = dfLanguages.loc[:,dfLanguages.sum() > lang_threshold] # Only include languages that appear more frequently than the threshold
dfLanguages['Other'] = dfLanguages.sum(axis = 1) < 1 # Mark other languages as other

dfLanguages = dfLanguages.add_prefix('language_')

In [33]:
# Production Companies

df.loc[df.production_companies.isna(), 'production_companies'] = 'Missing'

comma_split = lambda x: [company.strip() for company in x.split(',')]

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_com = CountVectorizer(tokenizer=comma_split)

vector_com = vectorizer_com.fit_transform(df.production_companies)

dfProdCom = pd.DataFrame(vector_com.toarray(), columns=vectorizer_com.get_feature_names_out() )

dfProdCom = dfProdCom.loc[:,dfProdCom.sum() > com_threshold]

dfProdCom['Other'] = dfProdCom.sum(axis = 1) < 1

dfProdCom = dfProdCom.add_prefix('prod_company_')



In [34]:
# Production Countries

df.loc[df.production_countries.isna(), 'production_countries'] = 'Missing'

comma_split = lambda x: [company.strip() for company in x.split(',')]

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_country = CountVectorizer(tokenizer=comma_split)

vector_country = vectorizer_country.fit_transform(df.production_countries)

dfCountry = pd.DataFrame(vector_country.toarray(), columns=vectorizer_country.get_feature_names_out() )

dfCountry = dfCountry.loc[:,dfCountry.sum() > country_threshold]

dfCountry['Other'] = dfCountry.sum(axis = 1) < 1

dfCountry = dfCountry.add_prefix('prod_country_')



In [35]:
# Keywords

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_key = CountVectorizer(token_pattern = r'\'([^\']*)\'')

vector_key = vectorizer_key.fit_transform(df.keywords)

dfkeyword = pd.DataFrame(vector_key.toarray(), columns=vectorizer_key.get_feature_names_out() )

dfkeyword = dfkeyword.loc[:,dfkeyword.sum() > keyword_threshold]
dfkeyword['No_Keywords'] = dfkeyword.sum(axis = 1) < 1

dfkeyword = dfkeyword.add_prefix('keyword_')

In [None]:
# Combine Columns

final_columns = ['vote_average', 'vote_count', 'release_year','release_month', 'adjusted_revenue', 'runtime', 
                 'adjusted_budget', 'original_english', 'popularity', 'Certificate', 'listed_writer', 
                 'listed_photography', 'listed_producers', 'listed_composer','overview_sentiment','keyword_sentiment']

final_df = pd.concat([df[final_columns].reset_index(drop=True), # Dataframe of final used columns
                      dfProdCom.reset_index(drop=True), 
                      dfCountry.reset_index(drop=True), 
                      dfkeyword.reset_index(drop=True),
                      dfLanguages.reset_index(drop = True)], axis = 1)

In [None]:
    # Pipeline Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
import sklearn.metrics
from sklearn.model_selection import GridSearchCV

    # Preprocessing

np.random.seed(4767)

X_train, X_test, y_train, y_test = train_test_split(final_df, final_df.adjusted_revenue, test_size = 0.30)

listCat = ['release_month','Certificate']
listSkew = ['vote_average','vote_count', 'adjusted_budget'] # List of skewed numeric variables
listNum = [col for col in final_df.columns if col not in (listCat + listSkew + ['adjusted_revenue'])]

pipeCat = Pipeline([
    ('selector', ColumnTransformer([('selector', 'passthrough', listCat)])),
    ('encoder', OneHotEncoder(dtype=int, drop="first", sparse_output= False))
])

pipeNum = Pipeline([
    ('selector', ColumnTransformer([('selector','passthrough', listNum)])),
    ('scaler', StandardScaler())
])

pipeSkew = Pipeline( [
    ('selector', ColumnTransformer([ ('selector', 'passthrough', listSkew ) ] )),
    ('spline',   PowerTransformer() ),
    ('scaler',   StandardScaler() )
])

preprocessor = FeatureUnion([
    ('cat', pipeCat),
    ('num', pipeNum),
    ('skew', pipeSkew)
])

In [55]:
# Model 1: Gradient Boosting Model

from sklearn.ensemble import HistGradientBoostingRegressor

pipeHGBR = Pipeline([
    ("preprocessor", preprocessor),
    ('model', HistGradientBoostingRegressor()) 
], verbose=True)

paramGridHGBR = {'model__loss': ['squared_error','absolute_error', 'gamma','poisson'], 
                 'model__learning_rate': [0.01,.1,1,10,100],
                 'model__max_iter': [50,100,150,200],
                 }

gridHGBR = GridSearchCV(pipeHGBR,
                        paramGridHGBR,
                        cv = 5,
                        n_jobs = -1,
                        verbose = 4,
                        scoring = 'r2')

gridHGBR.fit(X_train, y_train)

predTrainHGBR = gridHGBR.predict(X_train)
predTestHGBR = gridHGBR.predict(X_test)

train_HGBR_r2 = sklearn.metrics.r2_score(y_train, predTrainHGBR)
test_HGBR_r2 = sklearn.metrics.r2_score(y_test, predTestHGBR)

print("Best Parameters:", gridHGBR.best_params_)
print(f"Training: {train_HGBR_r2}\nTest: {test_HGBR_r2}")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


  9.06505205e-002  2.33152960e-001  3.31305241e-001  3.94184155e-001
  1.61406163e-002  1.61406163e-002  1.61406163e-002  1.61406163e-002
  3.51394606e-001  4.86313692e-001  5.42685297e-001  5.68476302e-001
  5.87526921e-001  5.90964697e-001  5.89917132e-001  5.84499058e-001
  5.20974626e-001  5.35863983e-001  5.46510855e-001  5.58207225e-001
  4.23632474e-002  4.23632474e-002  4.23632474e-002  4.23632474e-002
  5.56094427e-001  5.51496738e-001  5.50647447e-001  5.49807574e-001
  2.67758981e-001  2.53446687e-001  2.48505193e-001  2.48202841e-001
  5.38867903e-001  5.44435351e-001  5.42973267e-001  5.42505513e-001
  9.11305075e-002  9.11305075e-002  9.11305075e-002  9.11305075e-002
              nan              nan              nan              nan
 -8.40752050e+058 -8.40752050e+058 -8.40752050e+058 -8.40752050e+058
 -2.68224700e+094 -7.12442324e+189 -1.89234647e+285             -inf
 -1.05581037e+003 -1.05581037e+003 -1.05581037e+003 -1.05581037e+003
 -4.72495852e+161 -4.72495852e+161

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.4s
[Pipeline] ............. (step 2 of 2) Processing model, total=   1.4s
Best Parameters: {'model__learning_rate': 0.1, 'model__loss': 'squared_error', 'model__max_iter': 100}
Training: 0.7802154690769751
Test: 0.6814642136890755


In [None]:
# Model 2: Support Vector Regression

from sklearn.svm import SVR

pipeSVR = Pipeline([
    ("preprocessor", preprocessor),
    ('model', SVR()) 
], verbose=True)

paramGridSVR = {'model__kernel': ['linear','poly','rbf'], 
                'model__C': [.1,1,10],
                'model__epsilon': [.01,.1,1],
                 }

gridSVR = GridSearchCV(pipeSVR,
                        paramGridSVR,
                        cv = 5,
                        n_jobs = -1,
                        verbose = 4,
                        scoring = 'r2')

gridSVR.fit(X_train, y_train)

predTrainSVR = gridSVR.predict(X_train)
predTestSVR = gridSVR.predict(X_test)

train_SVR_r2 = sklearn.metrics.r2_score(y_train, predTrainSVR)
test_SVR_r2 = sklearn.metrics.r2_score(y_test, predTestSVR)

print("Best Parameters:", gridSVR.best_params_)
print(f"Training: {train_SVR_r2}\nTest: {test_SVR_r2}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.4s
[Pipeline] ............. (step 2 of 2) Processing model, total=  20.9s
Best Parameters: {'model__C': 10, 'model__epsilon': 1, 'model__kernel': 'linear'}
Training: -0.08885226146197178
Test: -0.10232736156776512


In [None]:
# Model 3: KNeighbors Regression

from sklearn.neighbors import KNeighborsRegressor

pipeKN = Pipeline([
    ("preprocessor", preprocessor),
    ('model', KNeighborsRegressor()) 
], verbose=True)

paramGridKN = {'model__n_neighbors': [3,6,9], 
               'model__weights': ['uniform','distance'],
               'model__algorithm': ['auto','ball_tree','kd_tree','brute'],
              }

gridKN = GridSearchCV(pipeKN,
                        paramGridKN,
                        cv = 5,
                        n_jobs = -1,
                        verbose = 4,
                        scoring = 'r2')

gridKN.fit(X_train, y_train)

predTrainKN = gridKN.predict(X_train)
predTestKN = gridKN.predict(X_test)

train_KN_r2 = sklearn.metrics.r2_score(y_train, predTrainKN)
test_KN_r2 = sklearn.metrics.r2_score(y_test, predTestKN)

print("Best Parameters:", gridKN.best_params_)
print(f"Training: {train_KN_r2}\nTest: {test_KN_r2}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.3s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s
Best Parameters: {'model__algorithm': 'auto', 'model__n_neighbors': 9, 'model__weights': 'distance'}
Training: 0.9999999999999655
Test: 0.17628344302502008


In [None]:
# Model 4: Neural Network

from sklearn.neural_network import MLPRegressor

pipeMLP = Pipeline([
    ("preprocessor", preprocessor),
    ('model', MLPRegressor()) 
], verbose=True)

paramGridMLP = {'model__hidden_layer_sizes': [(2,)],
               'model__learning_rate': ['constant','invscaling','adaptive'],
               'model__max_iter':[200] # Brought as high as 1000, still did not converge
              }

gridMLP = GridSearchCV(pipeMLP,
                        paramGridMLP,
                        cv = 5,
                        n_jobs = -1,
                        verbose = 4,
                        scoring = 'r2')

gridMLP.fit(X_train, y_train)

predTrainMLP = gridKN.predict(X_train)
predTestMLP = gridKN.predict(X_test)

train_MLP_r2 = sklearn.metrics.r2_score(y_train, predTrainMLP)
test_MLP_r2 = sklearn.metrics.r2_score(y_test, predTestMLP)

print("Best Parameters:", gridMLP.best_params_)
print(f"Training: {train_MLP_r2}\nTest: {test_MLP_r2}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits


KeyboardInterrupt: 

In [None]:
# Model 5: Linear Regression

from sklearn.linear_model import LinearRegression

pipeLR = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
], verbose=True)

pipeLR.fit(X_train, y_train)

predTrainLR = pipeLR.predict(X_train)
predTestLR = pipeLR.predict(X_test)

train_LR_r2 = sklearn.metrics.r2_score(y_train, predTrainLR)
test_LR_r2 = sklearn.metrics.r2_score(y_test, predTestLR)

print(f"Training: {train_LR_r2}\nTest: {test_LR_r2}")

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.4s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.2s
Training: 0.3406094010450834
Test: 0.36376913129788147
