# IMDB Movie Regression

In [22]:
# Setting the ast_node_interactivity to "all" will print
# each statement, not only the last one of a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# dataframes and arrays
import pandas as pd
import numpy as np

# plotting libs
import seaborn as sns
from matplotlib import pyplot as plt

# machine learning
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder, Imputer
from sklearn.base import TransformerMixin

In [63]:
# load data
df = pd.read_csv("movie_metadata.csv")

In [64]:
# some important stats
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


# Create Transformer for Pipe

In [65]:
# base class where the others will inherit from
from sklearn.base import TransformerMixin

In [66]:
# Inherit from so you do not need a fit function
class NoFitMixin:
    def fit(self, df, y=None):
        return self

In [67]:
# class which takes a df and a function and returns the modified df
class DFTransform(TransformerMixin, NoFitMixin):
    def __init__(self, func, copy=False):
        self.func = func
        self.copy = copy

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        return self.func(df)

In [68]:
# transforms the genre string to a list: action|drama|... -> [action,drama,...]
class StringToArray(TransformerMixin, NoFitMixin):
    ''' Takes a Pandas Dataframe and two column names as input and transforms the input column '''
    def __init__(self, seperator):
 # e.g. pass in a column name to extract  
        self.seperator = seperator

    def transform(self, df):
        df = df.apply(lambda x: x.split(self.seperator))
        return df

In [135]:
class ArrayExplodePivot(TransformerMixin, NoFitMixin):
    '''Explodes the Array and adds column for each genre with binary field values
    (similar to a LabelBinarizer)'''
    

    def transform(self, X):
        df = self.__explode__(X)
        df_pivot = self.__pivot__(df)
        df_pivot.index.names = ["index"]
        return df_pivot
        #return self.__join__(X, df_pivot)
    
    def __explode__(self, X):
        rows = []
        ind = 0
        for row in X:
            for element in row:
                rows.append([ind,element])
            ind+=1
        return pd.DataFrame(data=rows, columns=["old_index", "genres"])
    
    def __pivot__(self, X):
        X["val"] = int(1)
        return X.pivot(X.index, 'genres')["val"].fillna(int(0))

In [211]:
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
from sklearn.externals.joblib import Parallel, delayed

class DFFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        # non-optimized default implementation; override when a better
        # method is possible
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, name, weight, X)
            for name, trans, weight in self._iter())
        return pd.concat(Xs, axis=1, join='inner')

# Pipeline Test

In [136]:
genrePipeline = Pipeline([
    ('selectGenres', DFTransform(lambda x: x['genres'], True))
    ,('stringToArray', StringToArray('|'))
    ,('explodeAndLabel', ArrayExplodePivot())
])

In [222]:
numericalPipeline = Pipeline([
    ('selectNumerical', DFTransform(lambda x: x.select_dtypes(exclude=['object']))),
    ('impure', DFTransform(lambda x: x.fillna(x.mean()))),
    ('normalize', DFTransform(lambda x: x.div(x.max())))
])

In [223]:
# plotkeywords can be treated similar to genres
categoricalPipeline = Pipeline([
    ('dropColumns', DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1))),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x)))
])

In [248]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestRegressor

#completePipeline =
estimators = [('linear_pca', PCA()), ('forest', RandomForestRegressor(n_jobs=4))]
features = DFFeatureUnion([('1', genrePipeline), ('2',numericalPipeline), ('3',categoricalPipeline)])
finalPipe = Pipeline([('linear_pca', PCA()),('forest', RandomForestRegressor(n_jobs=4))])

#finalPipe = Pipeline([('featureSelection', union), ('rf',RandomForestRegressor(n_jobs=4) )])

In [251]:
from sklearn.model_selection import train_test_split
X, y = df, df['imdb_score']
X_feat = features.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split( X_feat, df['imdb_score'], test_size=0.2, random_state=42)

In [252]:
finalPipe.fit(X_train, y_train)

Pipeline(steps=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('forest', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impuri...timators=10, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False))])

In [253]:
from sklearn.metrics import accuracy_score

In [256]:
y_pred = finalPipe.predict(X_test)

In [258]:
pd.DataFrame(y_pred).head()

Unnamed: 0,0
0,7.08
1,7.05
2,6.5
3,6.86
4,6.03


In [260]:
finalPipe.score(X_test, y_test)

0.82165504466169803