In [None]:
# !pip install statsmodels

In [None]:
import numpy as np
import pandas as pd
import json
import functools
import math

import statsmodels.api as sm

import matplotlib.pyplot as plt 
%matplotlib inline

import random as rand

from collections import Counter

import warnings
warnings.filterwarnings("ignore")
np.set_printoptions(suppress=True)

Then we load the data. We also print each column's index and name together with the Python type of its first value, using the `type` function.

In [None]:
np.random.seed(26)
df = pd.read_csv('tmdb_5000_movies.csv')  # Load in the csv file
print(df.shape)
# Keep only the movies whose budget and revenue are both non-zero.
df = df.loc[(df['budget'] != 0) & (df['revenue'] != 0)]
print(df.shape)

# List every column: its position, its name and the Python type of the value in the first row.
feature_names = df.columns
for i, column_name in enumerate(feature_names):
    print(str(i), "\t", str(column_name), "\t\t\t", str(type(df.iloc[0, i])))

# 1. Dealing with NaN/empty data

Our goal is to predict the success of a movie (represented by its revenue) given a list of features.

In [None]:
# delete id column (which is not relevant to our task)
df = df.drop(columns=["id"])
print(df.shape)

feature_names = df.columns

# homepage and tagline are allowed to be missing; every other column is required.
optional_text_features = [name for name in ("homepage", "tagline") if name in feature_names]
required_features = [name for name in feature_names if name not in optional_text_features]

# The cleaned columns are assigned back instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series and
# silently does nothing when pandas Copy-on-Write is active.
df = df.copy()
# delete rows which contain an empty or missing required field
df[required_features] = df[required_features].replace('', np.nan)
df = df.dropna(subset=required_features)
# a missing homepage/tagline becomes the empty string
df[optional_text_features] = df[optional_text_features].fillna("")

print(df.shape)

# 2. Split Train/Test

Optional extension: k-fold cross-validation, etc.

In [None]:
# Shuffle all rows so that the positional train/test split below is random.
df = df.sample(frac=1)

train_proportion = 0.8
n = df.shape[0]
print('Size of dataset: ', str(n))

# The first t rows of the shuffled frame form the training set, the remainder the test set.
t = int(n * train_proportion)

def regression_to_classification(i):
    """Return the base-10 order of magnitude of ``i``, truncated toward zero.

    The sign of ``i`` is ignored. ``i == 0`` raises ``ValueError`` because
    ``math.log10(0)`` is undefined.
    """
    magnitude = math.log10(abs(i))
    return int(magnitude)

# Regression target: the raw revenue.
target = df['revenue']
# Classification target: order of magnitude of the revenue, kept as a one-column DataFrame.
# Each column is mapped with Series.map because DataFrame.applymap is deprecated
# since pandas 2.1; the element-wise result is the same.
labels = df.loc[:, df.columns.isin(['revenue'])].apply(
    lambda column: column.map(regression_to_classification)
)
# Everything except the revenue is a candidate feature.
data = df.loc[:, ~df.columns.isin(['revenue'])]

# All six slices use .iloc so the split is explicitly positional: the index of
# the shuffled frame holds the original row labels, not positions.
# features of the examples in the training / test set
train_x = data.iloc[:t]
test_x = data.iloc[t:]
# regression targets of the examples in the training / test set
train_y = target.iloc[:t]
test_y = target.iloc[t:]
# class labels of the examples in the training / test set
train_label = labels.iloc[:t]
test_label = labels.iloc[t:]

# let's take a look
print('Training dataset: ', train_x)
print('Training y: ',train_y)
print('Training label: ',train_label)

# 3. Boolean and Real Value Data

homepage: whether a movie has a home page

In [None]:
# homepage -> boolean feature: does the movie have a home page?
def string_to_bool(string):
    """Return False for the empty string and True for any other value."""
    return True if string != "" else False

# change tagline to int feature: length of the tagline (if no tagline, len = 0)
def string_to_int(string):
    """Return the number of whitespace-separated words in ``string``.

    Values without a ``split`` method (NaN, None, numbers) count as 0 words.
    """
    try:
        return len(string.split())
    except AttributeError:
        # Only the "not a string" case is treated as an empty tagline. The
        # previous bare ``except`` also swallowed KeyboardInterrupt and real bugs.
        return 0

In [None]:
# Inspect the type of the first training row's vote_count. `.iloc[0]` is positional;
# a plain `[0]` is a label lookup and raises KeyError whenever the row labelled 0
# was dropped or shuffled into the test set.
type(train_x['vote_count'].iloc[0])

In [None]:
label_str_to_bool = ['homepage']
label_str_to_int = ['tagline']
label_str_to_real = ['budget', 'popularity', 'runtime', 'vote_average', 'vote_count']

# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map).
# Mapping each selected column with Series.map gives the same element-wise
# result and works on older and newer pandas alike.

# homepage -> has-homepage flag, one column
train_vals_homepage = np.asarray(train_x.loc[:, train_x.columns.isin(label_str_to_bool)].apply(lambda column: column.map(string_to_bool)))
test_vals_homepage = np.asarray(test_x.loc[:, test_x.columns.isin(label_str_to_bool)].apply(lambda column: column.map(string_to_bool)))

# tagline -> number of words, one column
train_vals_tagline = np.asarray(train_x.loc[:, train_x.columns.isin(label_str_to_int)].apply(lambda column: column.map(string_to_int)))
test_vals_tagline = np.asarray(test_x.loc[:, test_x.columns.isin(label_str_to_int)].apply(lambda column: column.map(string_to_int)))

# numeric columns are used as they are
train_vals_real = np.asarray(train_x.loc[:, train_x.columns.isin(label_str_to_real)])
test_vals_real = np.asarray(test_x.loc[:, test_x.columns.isin(label_str_to_real)])

# 4. Onehot Data

Features: original language. (Production company, production country and spoken language are encoded as many-hot features in the next section.)

In [None]:
def string_to_list(string):
    """Parse a JSON array of ``{"id": ..., "name": ...}`` objects and return the names.

    Entries are de-duplicated by ``id`` (the last name seen for an id wins) and
    returned as a ``dict_values`` view in first-seen id order.
    """
    names_by_id = {}
    for entry in json.loads(string):
        names_by_id[entry['id']] = entry['name']
    return names_by_id.values()

# Columns that hold exactly one category per movie.
label_str_onehot = ['original_language']

# For each one-hot column, the distinct categories seen in the training split
# (missing values are represented by the string 'NaN').
label_onehot_set = [train_x[label].fillna('NaN').unique() for label in label_str_onehot]
print(label_onehot_set[0])

def onehot(column=None, col=None):
    """One-hot encode every value of ``column`` against the categories in ``col``.

    Args:
        column: iterable of hashable values to encode, one per row.
        col: iterable of hashable categories; duplicates are collapsed and the
            first-seen order defines the output positions.

    Returns:
        A list with one row per value in ``column``. Each row is a list of 0/1
        ints with a single 1 at the value's category position, or all zeros if
        the value is not a known category.
    """
    # Build the category -> position table once. The previous version rebuilt
    # it for every row, which also exhausted a one-shot iterable after row one.
    position_of = {category: position for position, category in enumerate(dict.fromkeys(col))}
    width = len(position_of)

    result = []
    for data in column:
        row = [0] * width
        position = position_of.get(data)
        if position is not None:
            row[position] = 1
        result.append(row)
    return result

# Raw (still un-encoded) categorical columns of each split, kept in frame column order.
train_vals_onehot = train_x.loc[:, [name in label_str_onehot for name in train_x.columns]]
test_vals_onehot = test_x.loc[:, [name in label_str_onehot for name in test_x.columns]]

def process_onehot():
    """One-hot encode every column listed in ``label_str_onehot`` for both splits.

    Reads the module-level ``train_vals_onehot`` / ``test_vals_onehot`` frames,
    ``label_str_onehot`` and ``label_onehot_set``.

    Returns:
        ``(train_matrix, test_matrix)``: float arrays with one row per example
        and the encoded blocks of all columns placed side by side.
    """
    train_blocks = []
    test_blocks = []
    for i in range(len(label_str_onehot)):
        # Index -i visits position 0 first and then walks backwards from the end.
        # Each new block is placed in front of the earlier ones.
        label = label_str_onehot[-i]
        categories = label_onehot_set[-i]
        train_blocks.insert(0, onehot(train_vals_onehot[label], categories))
        test_blocks.insert(0, onehot(test_vals_onehot[label], categories))

    # A trailing column of ones fixes the row count and the float dtype even when
    # there is nothing to encode; it is stripped again before returning.
    train_blocks.append(np.ones((len(train_vals_onehot), 1)))
    test_blocks.append(np.ones((len(test_vals_onehot), 1)))
    trains = np.concatenate(train_blocks, 1)
    tests = np.concatenate(test_blocks, 1)
    return trains[:, :-1], tests[:, :-1]

# Replace the raw categorical frames with their encoded matrices and show the first training row.
onehot_matrices = process_onehot()
train_vals_onehot, test_vals_onehot = onehot_matrices[0], onehot_matrices[1]
print(train_vals_onehot[0])

# 5. Manyhot Data

In [None]:
def string_to_list(string):
    """Return the ``name`` of every object in a JSON array, de-duplicated by ``id``.

    The result is a ``dict_values`` view in first-seen id order; when an id
    repeats, the last name wins.
    NOTE: a function with this name and behaviour is also defined earlier in the notebook.
    """
    records = json.loads(string)
    return dict((record['id'], record['name']) for record in records).values()

def string_to_list_prod_countries(string):
    """Return the distinct ``iso_3166_1`` codes of a JSON array, in first-seen order.

    The result is a ``dict_values`` view.
    """
    codes = {}
    for entry in json.loads(string):
        code = entry['iso_3166_1']
        codes[code] = code
    return codes.values()

def string_to_list_spoken_lang(string):
    """Return the distinct ``iso_639_1`` codes of a JSON array, in first-seen order.

    The result is a ``dict_values`` view.
    """
    codes = [entry['iso_639_1'] for entry in json.loads(string)]
    return dict(zip(codes, codes)).values()

def manyhot(column=None, col=None):
    """Many-hot encode each entry of ``column`` against the categories in ``col``.

    Args:
        column: iterable of entries, each itself an iterable of categories.
        col: sized iterable of categories; iteration order defines the positions.

    Returns:
        ``np.ndarray`` with one row per entry, holding 1 at the position of every
        known category in that entry and 0 elsewhere. Unknown categories are ignored.
    """
    width = len(col)
    position_of = {}
    for position, category in enumerate(col):
        position_of[category] = position

    rows = []
    for entry in column:
        row = [0] * width
        for position in (position_of[item] for item in entry if item in position_of):
            row[position] = 1
        rows.append(row)
    return np.asarray(rows)

label_str_to_dict = ['genres', 'keywords', 'production_companies']
# Parse the JSON text of each cell into its collection of names. Each column is
# mapped with Series.map because DataFrame.applymap is deprecated since pandas 2.1.
train_vals_genres_keywords = train_x.loc[:, train_x.columns.isin(label_str_to_dict)].apply(lambda column: column.map(string_to_list))
test_vals_genres_keywords = test_x.loc[:, test_x.columns.isin(label_str_to_dict)].apply(lambda column: column.map(string_to_list))

# All genres seen in the training split, sorted so the feature columns come out
# in the same order on every run. Iterating a plain set of strings depends on
# Python's per-process hash randomisation.
all_genres = sorted(functools.reduce(lambda x, y: x.union(y), train_vals_genres_keywords['genres'], set()))
train_vals_genres = manyhot(column=train_vals_genres_keywords['genres'], col=all_genres)
test_vals_genres = manyhot(column=test_vals_genres_keywords['genres'], col=all_genres)

In [None]:
# Collect every production company mentioned in the training split into one flat list
prod_company_list = []
for companies in train_vals_genres_keywords['production_companies']:
    prod_company_list.extend(companies)
prod_company_freq = Counter(prod_company_list)
# Companies that appear three times or fewer are dropped from the vocabulary
prod_company_to_remove = [company for company in prod_company_freq if prod_company_freq[company] <= 3]
for company in prod_company_to_remove:
    prod_company_freq.pop(company)
# The remaining companies, in first-seen order, define the many-hot columns
all_prod_companies = prod_company_freq.keys()
train_vals_prod_company = manyhot(column=train_vals_genres_keywords['production_companies'], col=all_prod_companies)
test_vals_prod_company = manyhot(column=test_vals_genres_keywords['production_companies'], col=all_prod_companies)

In [None]:
# Collect every keyword mentioned in the training split into one flat list
flat_keywords_list = []
for movie_keywords in train_vals_genres_keywords['keywords']:
    flat_keywords_list.extend(movie_keywords)
keywords_freq = Counter(flat_keywords_list)
# Keywords that appear three times or fewer are dropped from the vocabulary
keys_to_remove = [key for key in keywords_freq if keywords_freq[key] <= 3]
for key in keys_to_remove:
    keywords_freq.pop(key)
# The remaining keywords, in first-seen order, define the many-hot columns
all_keywords = keywords_freq.keys()
train_vals_keywords = manyhot(column=train_vals_genres_keywords['keywords'], col=all_keywords)
test_vals_keywords = manyhot(column=test_vals_genres_keywords['keywords'], col=all_keywords)

In [None]:
# feature of spoken language
# Each column is mapped with Series.map because DataFrame.applymap is deprecated since pandas 2.1.
train_spoken_lang_keywords = train_x.loc[:, train_x.columns.isin(['spoken_languages'])].apply(lambda column: column.map(string_to_list_spoken_lang))
test_spoken_lang_keywords = test_x.loc[:, test_x.columns.isin(['spoken_languages'])].apply(lambda column: column.map(string_to_list_spoken_lang))
# All spoken-language codes seen in the training split, sorted so the feature
# columns come out in the same order on every run (set order of strings varies
# between Python processes).
all_lang = sorted(functools.reduce(lambda x, y: x.union(y), train_spoken_lang_keywords['spoken_languages'], set()))
train_vals_lang = manyhot(column=train_spoken_lang_keywords['spoken_languages'], col=all_lang)
test_vals_lang = manyhot(column=test_spoken_lang_keywords['spoken_languages'], col=all_lang)

In [None]:
# feature of production_countries
# Each column is mapped with Series.map because DataFrame.applymap is deprecated since pandas 2.1.
train_prod_country_keywords = train_x.loc[:, train_x.columns.isin(['production_countries'])].apply(lambda column: column.map(string_to_list_prod_countries))
test_prod_country_keywords = test_x.loc[:, test_x.columns.isin(['production_countries'])].apply(lambda column: column.map(string_to_list_prod_countries))
# All production-country codes seen in the training split, sorted so the feature
# columns come out in the same order on every run (set order of strings varies
# between Python processes).
all_prod_country = sorted(functools.reduce(lambda x, y: x.union(y), train_prod_country_keywords['production_countries'], set()))
train_vals_prod_country = manyhot(column=train_prod_country_keywords['production_countries'], col=all_prod_country)
test_vals_prod_country = manyhot(column=test_prod_country_keywords['production_countries'], col=all_prod_country)

In [None]:
# label_str_onehot = [
#     'production_companies', 
#     'production_countries', 
#     'spoken_languages'
# ]
# print(train_x['spoken_languages'][0])
# print(train_vals_prod_company.shape)
# print(train_vals_lang[5])
# print(train_vals_prod_country[0])

In [None]:
# Constant column of ones so the linear model can learn an intercept.
train_offset = np.ones((len(train_vals_homepage), 1))
test_offset = np.ones((len(test_vals_homepage), 1))

# Stack every feature block side by side; both splits use the same block order.
# NOTE(review): train_vals_keywords / test_vals_keywords are computed above but are
# not part of this matrix - confirm whether that is intentional.
train_blocks = (
    train_vals_homepage,
    train_vals_tagline,
    train_vals_onehot,
    train_vals_real,
    train_vals_prod_company,
    train_vals_prod_country,
    train_vals_lang,
    train_vals_genres,
    train_offset,
)
test_blocks = (
    test_vals_homepage,
    test_vals_tagline,
    test_vals_onehot,
    test_vals_real,
    test_vals_prod_company,
    test_vals_prod_country,
    test_vals_lang,
    test_vals_genres,
    test_offset,
)
train_vals = np.concatenate(train_blocks, axis=1)
test_vals = np.concatenate(test_blocks, axis=1)

In [None]:
# Mean squared error between the true values and the predictions
def MSE(y, pred):
    """Return the mean of the squared element-wise differences ``y - pred``."""
    residuals = y - pred
    return np.mean(np.square(residuals))

In [None]:
# This function runs OLS and retries the fit a bounded number of times if the SVD fails to converge
def run_OLS(train_y, test_y, train_vals, test_vals, max_attempts=5):
    """Fit ordinary least squares on the training data and score both splits.

    Args:
        train_y: training targets, one per row of ``train_vals``.
        test_y: test targets, one per row of ``test_vals``.
        train_vals: 2-D training design matrix.
        test_vals: 2-D test design matrix with the same columns as ``train_vals``.
        max_attempts: how many times the fit is tried when numpy reports a
            linear-algebra failure.

    Returns:
        ``(train_MSE, test_MSE, test_pred)``, where ``test_pred`` is the column
        vector ``test_vals @ w`` of test predictions.

    Raises:
        RuntimeError: if every attempt fails with ``np.linalg.LinAlgError``.
    """
    ols_model = sm.regression.linear_model.OLS(train_y, train_vals)

    # Only the linear-algebra failure is retried, and only a limited number of
    # times. The previous `while True` with a bare `except` spun forever on any
    # deterministic error and also swallowed KeyboardInterrupt.
    last_error = None
    for _ in range(max_attempts):
        try:
            results = ols_model.fit()
            break
        except np.linalg.LinAlgError as err:
            last_error = err
    else:
        raise RuntimeError(
            "OLS fit did not succeed after " + str(max_attempts) + " attempts"
        ) from last_error

    # Fitted coefficients as a column vector so matmul yields one prediction per row.
    w = np.array(results.params).reshape([len(results.params), 1])

    train_pred = np.matmul(train_vals, w)
    test_pred = np.matmul(test_vals, w)

    train_MSE = MSE(train_y, train_pred.flatten())
    test_MSE = MSE(test_y, test_pred.flatten())

    return train_MSE, test_MSE, test_pred

In [None]:
# Fit the linear baseline and report its error on both splits.
ols_scores = run_OLS(train_y, test_y, train_vals, test_vals)
train_MSE, test_MSE, test_pred = ols_scores

print("Train MSE\t", str(train_MSE))
print("Test MSE\t", str(test_MSE))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Create the nonlinear models: one classifier for the revenue order of magnitude
# and one regressor for the raw revenue, each with two hidden layers of 20 units.
model_classifier = MLPClassifier(hidden_layer_sizes=(20, 20))
model_regressor = MLPRegressor(hidden_layer_sizes=(20, 20))

# Train the models on the train set.
# The class labels are flattened to a 1-D vector: they are stored as a one-column
# frame, and scikit-learn expects shape (n_samples,) for a single-output target.
model_classifier.fit(train_vals, np.ravel(train_label))
model_regressor.fit(train_vals, train_y)

In [None]:
# Classifier accuracy on the training split
train_pred = model_classifier.predict(train_vals)
accuracy_train = accuracy_score(train_label, train_pred)

# Classifier accuracy on the held-out test split
test_pred = model_classifier.predict(test_vals)
accuracy_test = accuracy_score(test_label, test_pred)

# Report both scores
print("Train accuracy:", accuracy_train)
print("Test accuracy:", accuracy_test)

In [None]:
# Predict the revenue for both splits
train_pred = model_regressor.predict(train_vals)
test_pred = model_regressor.predict(test_vals)

# mean_squared_log_error rejects negative inputs, and an unconstrained MLP
# regressor can output negative revenues. Predictions are therefore floored at
# zero for the MSLE only; R^2 still uses the raw predictions.
train_pred_nonneg = np.clip(train_pred, 0, None)
test_pred_nonneg = np.clip(test_pred, 0, None)

# Evaluate the model: R^2 then MSLE, for the train split and then the test split
print(metrics.r2_score(train_y, train_pred))
print(metrics.mean_squared_log_error(train_y, train_pred_nonneg))
print(metrics.r2_score(test_y, test_pred))
print(metrics.mean_squared_log_error(test_y, test_pred_nonneg))