In [None]:
# !pip install statsmodels

In [None]:
import numpy as np
import pandas as pd
import json
import functools

import statsmodels.api as sm

import matplotlib.pyplot as plt 
%matplotlib inline

import random as rand

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

Then we load the data. We will also print a list of all columns, each together with the Python type of its first-row value (obtained with the built-in `type` function).

In [None]:
# Fix the global NumPy seed, then read the movie table from the CSV file.
np.random.seed(26)
df = pd.read_csv('tmdb_5000_movies.csv')

# For every column print its position, its name and the Python type of the
# value stored in the first row.
feature_names = df.columns
for i, column_name in enumerate(feature_names):
    first_value_type = type(df.iloc[0, i])
    print(str(i), "\t", str(column_name), "\t\t\t", str(first_value_type))

print(df.shape)

# 1. Dealing with Nan/empty data

Our goal is to predict the success of a movie (represented by its revenue) provided a list of features.

In [None]:
# delete id column (which is not relevant to our task)
df = df.drop(["id"], axis=1)
print(df.shape)

feature_names = df.columns

# homepage and tagline are allowed to be empty; every other column is required.
optional_text_features = [name for name in ("homepage", "tagline") if name in feature_names]
required_features = [name for name in feature_names if name not in optional_text_features]

# delete rows in which any required field is NaN or an empty string.
# The row mask is built in one pass and applied with a plain assignment:
# `df[col].replace(..., inplace=True)` works on a temporary column object and
# is not guaranteed to write its result back into `df`.
required_values = df[required_features]
has_missing_field = (required_values.isna() | required_values.eq('')).any(axis=1)
df = df.loc[~has_missing_field].copy()

# a missing homepage/tagline is stored as "" instead of NaN
for feature in optional_text_features:
    df[feature] = df[feature].fillna("")

print(df.shape)

# 2. Split Train/Test

Optional: use k-fold cross-validation (or a similar scheme) instead of a single train/test split.

In [None]:
# Shuffle the rows so that the train/test split is random. Passing an explicit
# random_state makes the shuffle reproducible on its own, instead of depending
# on whatever state the global NumPy RNG is in when this cell runs.
df = df.sample(frac=1, random_state=26)

train_proportion = 0.8
n = len(df)
print('Size of dataset: ', str(n))

# Put the first t observations in the DataFrame df into the training set, and the rest into the test set
t = int(train_proportion * n)

# 'revenue' is the label; all remaining columns are features
target = df['revenue']
data = df.loc[:, ~df.columns.isin(['revenue'])]

# Features and labels are both sliced by position (.iloc), so row k of
# train_x / test_x always belongs to entry k of train_y / test_y.
# the following variable records the features of examples in the training set
train_x = data.iloc[:t]
# the following variable records the features of examples in the test set
test_x = data.iloc[t:]
# the following variable records the labels of examples in the training set
train_y = target.iloc[:t]
# the following variable records the labels of examples in the test set
test_y = target.iloc[t:]

# let's take a look
print('Training dataset: ', train_x)
print('Training y: ', train_y)

# 3. Boolean/Int Data

homepage: whether a movie has a home page (boolean). tagline: the number of words in the tagline (integer; 0 if there is no tagline).

In [None]:
# homepage -> boolean feature: does the movie have a home page at all?
def string_to_bool(string):
    """Return False for the empty string and True for anything else."""
    return bool(string != "")

# change tagline to int feature: number of words in the tagline (if no tagline, count = 0)
def string_to_int(string):
    """Count the whitespace-separated words in `string`.

    Parameters
    ----------
    string : str or any
        The tagline text. Values that cannot be split like a string
        (e.g. None or NaN) are treated as "no tagline".

    Returns
    -------
    int
        Number of words, or 0 for an empty string or a non-string value.
    """
    try:
        return len(string.split())
    except (AttributeError, TypeError):
        # Only "this is not a string" is treated as an empty tagline; anything
        # else (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        return 0


# Columns turned into a boolean flag / into a word count, respectively.
label_str_to_bool = ["homepage"]
label_str_to_int = ["tagline"]

# Select the listed columns from the full frame and convert them cell by cell.
is_bool_column = df.columns.isin(label_str_to_bool)
df_homepage = df.loc[:, is_bool_column].applymap(string_to_bool)

is_int_column = df.columns.isin(label_str_to_int)
df_tagline = df.loc[:, is_int_column].applymap(string_to_int)

print(df_tagline)

In [None]:
# "Has a homepage" flag for the training and test rows, as NumPy arrays.
train_homepage_columns = train_x.columns.isin(label_str_to_bool)
train_vals_homepage = np.asarray(train_x.loc[:, train_homepage_columns].applymap(string_to_bool))
test_homepage_columns = test_x.columns.isin(label_str_to_bool)
test_vals_homepage = np.asarray(test_x.loc[:, test_homepage_columns].applymap(string_to_bool))

# Tagline word count for the training and test rows, as NumPy arrays.
train_tagline_columns = train_x.columns.isin(label_str_to_int)
train_vals_tagline = np.asarray(train_x.loc[:, train_tagline_columns].applymap(string_to_int))
test_tagline_columns = test_x.columns.isin(label_str_to_int)
test_vals_tagline = np.asarray(test_x.loc[:, test_tagline_columns].applymap(string_to_int))

# 4. Manyhot Data

In [None]:
def string_to_list(string):
    """Parse a JSON array of {"id": ..., "name": ...} objects into a list of names.

    Entries that share an id are collapsed into one: the id keeps the position
    of its first occurrence and the name of its last occurrence.

    Parameters
    ----------
    string : str
        JSON text encoding a list of objects, each with an 'id' and a 'name' key.

    Returns
    -------
    list
        The names, one per distinct id. A real list is returned (rather than a
        dict view) so the result can be indexed, compared and pickled.
    """
    data = json.loads(string)
    name_by_id = {d['id']: d['name'] for d in data}
    return list(name_by_id.values())

def manyhot(column=None, col=None):
    """Encode each entry of `column` as a 0/1 vector over the categories in `col`.

    Parameters
    ----------
    column : iterable of iterables
        One collection of category values per example.
    col : iterable
        The categories that define the output columns, in iteration order.
        Values in an entry that are not in `col` are ignored.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (number of entries, number of categories).
        Element [r, c] is 1 when entry r contains category c, else 0.
        The array is always two-dimensional, also when `column` is empty.
    """
    categories = list(col)
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}
    entries = list(column)
    # Preallocate so that an empty `column` still gives shape (0, n_categories)
    # and can be concatenated with other feature blocks along axis 1.
    manyhot_vectors = np.zeros((len(entries), len(categories)), dtype=int)
    for row, entry in enumerate(entries):
        for ele in entry:
            idx = cat_to_idx.get(ele)
            if idx is not None:
                manyhot_vectors[row, idx] = 1
    return manyhot_vectors

# Columns whose cells hold a JSON list of {"id", "name"} objects
label_str_to_dict = ['genres', 'keywords']
train_vals_genres_keywords = train_x.loc[:, train_x.columns.isin(label_str_to_dict)].applymap(string_to_list)
test_vals_genres_keywords = test_x.loc[:, test_x.columns.isin(label_str_to_dict)].applymap(string_to_list)

# all genres that occur in the training set, sorted so that the many-hot
# columns come out in the same order on every run (the iteration order of a
# set of strings can change from one interpreter session to the next)
all_genres = sorted(functools.reduce(lambda x, y: x.union(y), train_vals_genres_keywords['genres'], set()))
train_vals_genres = manyhot(column=train_vals_genres_keywords['genres'], col=all_genres)
test_vals_genres = manyhot(column=test_vals_genres_keywords['genres'], col=all_genres)

In [None]:
# Collect every keyword occurrence of the training movies in one flat list
flat_keywords_list = []
for movie_keywords in train_vals_genres_keywords['keywords']:
    flat_keywords_list.extend(movie_keywords)
keywords_freq = Counter(flat_keywords_list)
# Keywords that occur 3 times or fewer are dropped from the counter
keys_to_remove = [key for key in keywords_freq if keywords_freq[key] <= 3]
for key in keys_to_remove:
    keywords_freq.pop(key)
# The remaining keywords define the many-hot columns for both splits
all_keywords = keywords_freq.keys()
train_vals_keywords = manyhot(column=train_vals_genres_keywords['keywords'], col=all_keywords)
test_vals_keywords = manyhot(column=test_vals_genres_keywords['keywords'], col=all_keywords)

In [None]:
# Constant column of ones (the offset/intercept term), one row per example
train_offset = np.ones((len(train_vals_homepage), 1))
test_offset = np.ones((len(test_vals_homepage), 1))

# Stack all feature groups side by side: homepage, tagline, genres, keywords, offset
train_feature_groups = (
    train_vals_homepage,
    train_vals_tagline,
    train_vals_genres,
    train_vals_keywords,
    train_offset,
)
test_feature_groups = (
    test_vals_homepage,
    test_vals_tagline,
    test_vals_genres,
    test_vals_keywords,
    test_offset,
)
train_vals = np.concatenate(train_feature_groups, axis=1)
test_vals = np.concatenate(test_feature_groups, axis=1)

In [None]:
# This function computes the mean squared error
def MSE(y, pred):
    """Return the mean of the squared differences between `y` and `pred`.

    Parameters
    ----------
    y : array-like
        True values (list, NumPy array or pandas Series).
    pred : array-like
        Predicted values, broadcastable against `y`.

    Returns
    -------
    float
        mean((y - pred) ** 2)
    """
    # Work on plain float arrays: this accepts lists, compares element by
    # position (no pandas index alignment) and avoids integer overflow when
    # large integer values are squared.
    y = np.asarray(y, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.mean(np.square(y - pred))

# This function plots the main diagonal; for a "predicted vs true" plot with perfect predictions, all data lies on this line
def plotDiagonal(xmin, xmax):
    """Draw the line y = x in black from (xmin, xmin) to (xmax, xmax).

    The line is drawn from its two end points only. Sampling it every 0.01
    units would need (xmax - xmin) / 0.01 points, which exhausts memory when
    the axis spans large values.
    """
    plt.plot([xmin, xmax], [xmin, xmax], c='black')

# Scatter-plot helper: draws x against y, labels both axes and sets the limits
def plotdata(x=None,y=None,xname=None,yname=None,margin=0.05,plotDiag=True,zeromin=False):
    """Scatter x vs y with labelled axes and padded axis limits, then show it.

    x, y         -- the values to plot
    xname, yname -- axis labels
    margin       -- padding added around the data, as a fraction of its range
    plotDiag     -- also draw the main diagonal across the padded x range
    zeromin      -- start both axes at 0.0 instead of below the smallest value
    """
    plt.scatter(x, y, label='data')
    plt.xlabel(xname)
    plt.ylabel(yname)

    x_low, x_high = min(x), max(x)
    y_low, y_high = min(y), max(y)
    x_pad = margin * (x_high - x_low)
    y_pad = margin * (y_high - y_low)

    if plotDiag:
        plotDiagonal(x_low - x_pad, x_high + x_pad)

    # Lower limits are either pinned to zero or padded below the data minimum
    x_floor = 0.0 if zeromin else x_low - x_pad
    y_floor = 0.0 if zeromin else y_low - y_pad
    plt.xlim(x_floor, x_high + x_pad)
    plt.ylim(y_floor, y_high + y_pad)
    plt.show()

# This function plots the predicted labels vs the actual labels (We only plot the first max_points points to avoid slow plots)
def plot_pred_true(test_pred=None, test_y=None, max_points = 1000):
    """Plot predicted against true labels for the first `max_points` examples.

    test_pred  -- predicted labels
    test_y     -- true labels, in the same order as `test_pred`
    max_points -- maximum number of leading points to draw
    """
    # Slice from the start: beginning at index 1 would silently drop the first point.
    plotdata(test_pred[:max_points], test_y[:max_points],'Predicted', 'True', zeromin=True)

In [None]:
# This function runs OLS; if the fit fails with a linear-algebra error (e.g. SVD did not converge) it is retried a bounded number of times
def run_OLS(train_y, test_y, train_vals, test_vals, max_fit_attempts=5):
    """Fit ordinary least squares on the training data and evaluate both splits.

    Parameters
    ----------
    train_y, test_y : array-like
        Labels of the training / test examples.
    train_vals, test_vals : 2-D array
        Feature matrices of the training / test examples.
    max_fit_attempts : int, optional
        How many times the fit is attempted before giving up.

    Returns
    -------
    (train_MSE, test_MSE, test_pred)
        Mean squared error on each split, and the test predictions as a
        column vector of shape (number of test examples, 1).

    Raises
    ------
    RuntimeError
        If every fit attempt fails with numpy.linalg.LinAlgError.
    """
    ols_model = sm.regression.linear_model.OLS(train_y, train_vals)

    # Retry only on linear-algebra failures and only a limited number of times.
    # An unbounded loop around a bare `except` never ends when the failure is
    # deterministic and also swallows KeyboardInterrupt.
    last_error = None
    for _ in range(max_fit_attempts):
        try:
            results = ols_model.fit()
            break
        except np.linalg.LinAlgError as err:
            last_error = err
    else:
        raise RuntimeError(
            "OLS fit failed after " + str(max_fit_attempts) + " attempt(s)"
        ) from last_error

    # Fitted coefficients as a column vector
    w = np.asarray(results.params).reshape(-1, 1)

    train_pred = np.matmul(train_vals, w)
    test_pred = np.matmul(test_vals, w)

    train_MSE = MSE(train_y, train_pred.flatten())
    test_MSE = MSE(test_y, test_pred.flatten())

    return train_MSE, test_MSE, test_pred

In [None]:
# Fit OLS on the assembled features and report the error on both splits
ols_outcome = run_OLS(train_y, test_y, train_vals, test_vals)
train_MSE, test_MSE, test_pred = ols_outcome

print("Train MSE\t", train_MSE)
print("Test MSE\t", test_MSE)

# Predicted vs. true labels on the test set
plot_pred_true(test_pred.flatten(), test_y)

In [90]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score

# The target (revenue) is a continuous quantity, so this is a regression task.
# A classifier would treat every distinct revenue value as its own class and
# exact-match accuracy would say nothing about how close the predictions are.

# Create a nonlinear regression model (fixed seed so the result is reproducible)
model = MLPRegressor(hidden_layer_sizes=(100, 50), random_state=26)

# Standardise the target with training-set statistics before fitting.
# NOTE(review): this assumes revenue is on a much larger scale than the
# features, which would make the raw target hard to fit -- confirm.
y_mean = train_y.mean()
y_std = train_y.std()

# Train the model on the train set
model.fit(train_vals, (train_y - y_mean) / y_std)

# Predict the labels of the test set, mapped back to the original scale
y_pred = model.predict(test_vals) * y_std + y_mean

# Evaluate with the mean squared error, the same metric as the OLS baseline
mlp_test_MSE = MSE(test_y, y_pred)

# Print the test error
print("MLP test MSE\t", str(mlp_test_MSE))

(960, 1788)