In [25]:
!pip install statsmodels



In [26]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

import matplotlib.pyplot as plt 
%matplotlib inline

import random as rand

import warnings
warnings.filterwarnings("ignore")

Then we load the data. We also print each column's position and name together with the Python type of its first value, using the built-in `type` function.

In [27]:
# Seed NumPy's global random generator before anything else in this cell.
np.random.seed(26)

# Read the movie table from the CSV file.
df = pd.read_csv('tmdb_5000_movies.csv')

# One line per column: position, column name, and the Python type of the
# value stored in the first row of that column.
feature_names = df.columns
for position, column_name in enumerate(feature_names):
    first_value_type = type(df.iloc[0, position])
    print(str(position), "\t", str(column_name), "\t\t\t", str(first_value_type))

# (number of rows, number of columns)
print(df.shape)

0 	 budget 			 <class 'numpy.int64'>
1 	 genres 			 <class 'str'>
2 	 homepage 			 <class 'str'>
3 	 id 			 <class 'numpy.int64'>
4 	 keywords 			 <class 'str'>
5 	 original_language 			 <class 'str'>
6 	 original_title 			 <class 'str'>
7 	 overview 			 <class 'str'>
8 	 popularity 			 <class 'numpy.float64'>
9 	 production_companies 			 <class 'str'>
10 	 production_countries 			 <class 'str'>
11 	 release_date 			 <class 'str'>
12 	 revenue 			 <class 'numpy.int64'>
13 	 runtime 			 <class 'numpy.float64'>
14 	 spoken_languages 			 <class 'str'>
15 	 status 			 <class 'str'>
16 	 tagline 			 <class 'str'>
17 	 title 			 <class 'str'>
18 	 vote_average 			 <class 'numpy.float64'>
19 	 vote_count 			 <class 'numpy.int64'>
(4803, 20)


# 1. Dealing with NaN/empty data

Our goal is to predict the success of a movie (represented by its revenue) provided a list of features.

In [28]:
# Drop the id column (an identifier that is not relevant to our task).
# errors="ignore" keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=["id"], errors="ignore")
print(df.shape)

feature_names = df.columns

# Delete every row that has a missing or empty value in any column other than
# "homepage" (a movie is allowed to have no home page).
# Empty strings are turned into NaN on a separate frame rather than through
# `df[col].replace(..., inplace=True)`: that chained in-place call operates on
# a temporary object and is not guaranteed to write back into `df`.
required_columns = [name for name in feature_names if name != "homepage"]
required_values = df[required_columns].replace('', np.nan)
df = df[required_values.notna().all(axis=1)]

print(df.shape)

(4803, 19)
(3959, 19)


# 2. Split Train/Test

optional: k-fold etc.

In [29]:
# Shuffle the rows so that the train/test split is random. No random_state is
# passed here, so reproducibility relies on the np.random.seed call made in
# the data-loading cell.
df = df.sample(frac = 1)

train_proportion = 0.8
n = len(df)
print('Size of dataset: ', str(n))

# The first t shuffled rows form the training set; the remaining rows form
# the test set.
t = int(train_proportion * n)

# Labels (revenue) and features (every other column).
target = df['revenue']
data = df.drop(columns=['revenue'])

# Split by position with .iloc for both features and labels. After shuffling,
# the integer index labels are out of order, so positional slicing is made
# explicit instead of relying on how plain [] slicing is interpreted.
# the following variable records the features of examples in the training set
train_x = data.iloc[:t]
# the following variable records the features of examples in the test set
test_x = data.iloc[t:]
# the following variable records the labels of examples in the training set
train_y = target.iloc[:t]
# the following variable records the labels of examples in the test set
test_y = target.iloc[t:]

# Every row must land in exactly one of the two sets, with matching labels.
assert len(train_x) == len(train_y) and len(test_x) == len(test_y)
assert len(train_x) + len(test_x) == n

# let's take a look
print('Training dataset: ', train_x)
print('Training y: ',train_y)

Size of dataset:  3959
Training dataset:           budget                                             genres  \
2874   12000000  [{"id": 9648, "name": "Mystery"}, {"id": 53, "...   
2654    7000000  [{"id": 53, "name": "Thriller"}, {"id": 878, "...   
1043   45000000  [{"id": 35, "name": "Comedy"}, {"id": 80, "nam...   
1439   35000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3211    7000000  [{"id": 18, "name": "Drama"}, {"id": 10402, "n...   
...         ...                                                ...   
1428   35000000  [{"id": 28, "name": "Action"}, {"id": 35, "nam...   
85    170000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
2029   22000000  [{"id": 14, "name": "Fantasy"}, {"id": 28, "na...   
1110   50000000  [{"id": 14, "name": "Fantasy"}, {"id": 28, "na...   
1650   30000000  [{"id": 28, "name": "Action"}, {"id": 878, "na...   

                                               homepage  \
2874                                                NaN   

# 3. Boolean Data

homepage: whether a movie has a home page

In [30]:
# change homepage to boolean feature: whether a movie has a home page
def string_to_bool(string):
    """Return True when `string` holds a non-empty value, False otherwise.

    Missing values count as absent: None and NaN both yield False. This
    matters because NaN compares unequal to "", so a bare `string != ""`
    check would report True for a missing value.

    Parameters:
        string: the cell value to test (a str, None, or a float NaN).

    Returns:
        bool: False for "", None, or NaN; True for anything else.
    """
    if string is None:
        return False
    # NaN is the only float that is not equal to itself.
    if isinstance(string, float) and string != string:
        return False
    if (string != ""):
        return True
    return False

# change tagline to int feature: length of the tagline (if no tagline, len = 0)
def string_to_int(string):
    """Return the number of whitespace-separated words in `string`.

    A missing tagline counts as zero words: "", None, and NaN all yield 0
    instead of raising on the `.split()` call.

    Parameters:
        string: the cell value to measure (a str, None, or a float NaN).

    Returns:
        int: the word count, or 0 when there is no tagline.
    """
    if string is None:
        return 0
    # NaN is the only float that is not equal to itself.
    if isinstance(string, float) and string != string:
        return 0
    if (string == ""):
        return 0
    return len(string.split())


# Columns converted to a boolean "value present" flag.
label_str_to_bool = ["homepage"]
homepage_mask = df.columns.isin(label_str_to_bool)
homepage_columns = df.loc[:, homepage_mask]
df_homepage = homepage_columns.applymap(string_to_bool)

# Columns converted to a word count.
label_str_to_int = ["tagline"]
tagline_mask = df.columns.isin(label_str_to_int)
tagline_columns = df.loc[:, tagline_mask]
df_tagline = tagline_columns.applymap(string_to_int)

# print(df_tagline)

In [34]:
def _encode_columns(frame, columns, encoder):
    """Apply `encoder` to every cell of the listed columns of `frame`.

    Only the columns of `frame` whose names appear in `columns` are kept;
    the encoded result is returned as a NumPy array.
    """
    selected = frame.loc[:, frame.columns.isin(columns)]
    return np.asarray(selected.applymap(encoder))


# homepage -> boolean flag, for the training and test features
train_vals_homepage = _encode_columns(train_x, label_str_to_bool, string_to_bool)
test_vals_homepage = _encode_columns(test_x, label_str_to_bool, string_to_bool)

# tagline -> word count, for the training and test features
train_vals_tagline = _encode_columns(train_x, label_str_to_int, string_to_int)
test_vals_tagline = _encode_columns(test_x, label_str_to_int, string_to_int)

In [37]:
# Show the shape of the encoded tagline arrays: training set first, then test set.
for encoded_tagline in (train_vals_tagline, test_vals_tagline):
    print(encoded_tagline.shape)

(3167, 1)
(792, 1)
