In [None]:
# !pip install statsmodels

In [None]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

import matplotlib.pyplot as plt 
%matplotlib inline

import random as rand

import warnings
warnings.filterwarnings("ignore")

Then we load the data. We also print, for each column, its index and name together with the Python type of the value in its first row (obtained with the built-in `type` function).

In [None]:
np.random.seed(26)  # seed NumPy's global random generator
df = pd.read_csv('tmdb_5000_movies.csv')  # load the raw movie table from the csv file

# One line per column: position, name, and the Python type of the value
# stored in the first row of that column.
feature_names = df.columns
for position, column_name in enumerate(feature_names):
    first_value_type = type(df.iloc[0, position])
    print(position, "\t", column_name, "\t\t\t", first_value_type)

print(df.shape)

# 1. Dealing with NaN/empty data

Our goal is to predict the success of a movie (represented by its revenue) given a list of its features.

In [None]:
# Delete the id column (which is not relevant to our task).
# errors="ignore" keeps the cell re-runnable: on a second run there is no
# "id" column left and the drop is simply a no-op instead of a KeyError.
df = df.drop(columns=["id"], errors="ignore")
print(df.shape)

feature_names = df.columns

# homepage and tagline are allowed to be missing: their NaN cells become "".
# In every other column an empty string counts as missing, and a row with a
# missing value in any of those columns is removed.
optional_features = ("homepage", "tagline")
required_features = [name for name in feature_names if name not in optional_features]

# Results are assigned back to df rather than using
# `df[col].replace(..., inplace=True)`: that form edits a temporary Series
# and, with pandas Copy-on-Write, the change never reaches df.
for feature in feature_names:
    if feature in optional_features:
        df[feature] = df[feature].fillna("")
    else:
        df[feature] = df[feature].replace('', np.nan)

df = df.dropna(subset=required_features)

print(df.shape)

# 2. Split Train/Test

Optional: k-fold cross-validation, etc.

In [None]:
# Shuffle all rows (frac=1 samples the whole frame) so that the positional
# train/test split below is random.
df = df.sample(frac=1)

train_proportion = 0.8
n = len(df)
print('Size of dataset: ', n)

# Boundary position: rows [0, t) go to the training set, rows [t, n) to the test set.
t = int(train_proportion * n)

# revenue is the label; every other column is a feature.
target = df['revenue']
data = df.loc[:, df.columns != 'revenue']

# Features of the training / test examples.
train_x, test_x = data.iloc[:t], data.iloc[t:]
# Labels of the training / test examples.
train_y, test_y = target.iloc[:t], target.iloc[t:]

# let's take a look
print('Training dataset: ', train_x)
print('Training y: ', train_y)

# 3. Boolean Data

`homepage`: whether a movie has a home page (boolean). `tagline`: number of words in the tagline (integer; 0 if there is no tagline).

In [None]:
# change homepage to boolean feature: whether a movie has a home page
def string_to_bool(string):
    """Return True when ``string`` is a non-empty str, otherwise False.

    Non-string input counts as "no value" and yields False. This covers the
    float NaN that pandas uses for a missing cell, and None; a plain
    ``string != ""`` comparison would report those as True. string_to_int
    treats the same input the same way, mapping it to 0.
    """
    return isinstance(string, str) and bool(string != "")

# change tagline to int feature: number of words in the tagline (if no tagline, 0)
def string_to_int(string):
    """Return the number of whitespace-separated words in ``string``.

    Input that cannot be split (for example float NaN or None, which have no
    usable ``split`` method) is treated as an absent tagline and yields 0.
    """
    try:
        return len(string.split())
    except (AttributeError, TypeError):
        # Only the failures expected from non-string input are handled here;
        # a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return 0


# Columns to encode, grouped by the encoder that is applied to them.
label_str_to_bool = ["homepage"]
label_str_to_int = ["tagline"]

# Encode element-wise with Series.map, one column at a time.
# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map);
# apply + Series.map gives the same result on both older and newer pandas.
df_homepage = df.loc[:, df.columns.isin(label_str_to_bool)].apply(
    lambda column: column.map(string_to_bool)
)
df_tagline = df.loc[:, df.columns.isin(label_str_to_int)].apply(
    lambda column: column.map(string_to_int)
)

print(df_tagline)

In [None]:
def _encode_columns(frame, labels, encoder):
    """Apply ``encoder`` to every cell of the ``labels`` columns of ``frame``.

    Columns named in ``labels`` that are not present in ``frame`` are skipped.
    The encoded columns are returned as a NumPy array via ``np.asarray``.

    Series.map is used per column instead of DataFrame.applymap, which is
    deprecated since pandas 2.1 (renamed DataFrame.map).
    """
    selected = frame.loc[:, frame.columns.isin(labels)]
    encoded = selected.apply(lambda column: column.map(encoder))
    return np.asarray(encoded)


# homepage -> boolean flag, for the training and the test examples
train_vals_homepage = _encode_columns(train_x, label_str_to_bool, string_to_bool)
test_vals_homepage = _encode_columns(test_x, label_str_to_bool, string_to_bool)

# tagline -> word count, for the training and the test examples
train_vals_tagline = _encode_columns(train_x, label_str_to_int, string_to_int)
test_vals_tagline = _encode_columns(test_x, label_str_to_int, string_to_int)

In [None]:
# Show the encoded homepage values of the training rows, then the encoded
# tagline values of the test rows.
for encoded_values in (train_vals_homepage, test_vals_tagline):
    print(encoded_values)