# IMDB Movie Regression

In [351]:
# Setting the ast_node_interactivity to "all" will print
# each statement, not only the last one of a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# dataframes and arrays
import pandas as pd
import numpy as np

# plotting libs
import seaborn as sns
from matplotlib import pyplot as plt

# machine learning
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder, Imputer
from sklearn.base import TransformerMixin

In [352]:
# load data
df = pd.read_csv("movie_metadata.csv")

In [353]:
# some important stats
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


# Create Custom Transformer for Pipeline

In [354]:
# base class where the others will inherit from
from sklearn.base import TransformerMixin

In [355]:
# Inherit from so you do not need a fit function
class NoFitMixin:
    def fit(self, df, y=None):
        return self

In [406]:
# class which takes a df and a function and returns the modified df
class DFTransform(TransformerMixin, NoFitMixin):
    def __init__(self, func, copy=False):
        self.func = func
        self.copy = copy

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        return self.func(df)

In [440]:
# fixes mismatch in number of categorical variables in training and test dataset
class AddMissingCategorialColumns(TransformerMixin):
    def __init__(self, copy=False):
        self.copy = copy
        self.columns = 0
        
    def transform(self, df):
        df_ = df if not self.copy else df.copy()
  #      self.__add_missing_dummy_columns__(df_)
  #      self.__rem_additional_dummy_columns__(df_)
        return df_
    
    def fit(self, df, y=None):
        #return df
        self.columns = df.columns.values.tolist()
    
    def __add_missing_dummy_columns__(self, df):
        missing_cols = set( self.columns ) - set( df.columns )
        for c in missing_cols:
            df[c] = 0
            
    def __rem_additional_dummy_columns__(self, df):
        add_cols = set( df.columns ) - set( self.columns )
        for c in add_cols:
            del df[c]

In [441]:
# transforms the genre string to a list: action|drama|... -> [action,drama,...]
class StringToArray(TransformerMixin, NoFitMixin):
    ''' Takes a Pandas Dataframe and two column names as input and transforms the input column '''
    def __init__(self, seperator, copy=True):
        self.copy = copy
        self.seperator = seperator

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        df_ = df.apply(lambda x: x.split(self.seperator))
        return df_

In [442]:
# explodes and array with categorical variables and transforms them to columns including binaries
class ArrayExplodePivot(TransformerMixin, NoFitMixin):
    '''Explodes the Array and adds column for each genre with binary field values
    (similar to a LabelBinarizer)'''
    
    def __init__(self, copy=True):
        self.copy = copy

    def transform(self, df, copy=False):
        df_ = df if not self.copy else df.copy()
        df_ = self.__explode__(df_)
        test = self.__explode__(df_)
        test['val'] = 1
        return self.__pivot__(test).fillna(int(0))
    
    def __explode__(self, df):
        rows = []
        ind = 0
        for row in df:
            for element in row:
                rows.append([ind,element])
            ind+=1
        return pd.DataFrame(data=rows, columns=["old_index", "genres"])
    
    def __pivot__(self, df):
        return df.pivot(index='old_index', columns='genres', values='val')
        return self

In [410]:
# custom feature union pipeline
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
from sklearn.externals.joblib import Parallel, delayed

class DFFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        # non-optimized default implementation; override when a better
        # method is possible
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, name, weight, X)
            for name, trans, weight in self._iter())
        return pd.concat(Xs, axis=1, join='inner')

# Pipeline Test

In [411]:
genrePipeline = Pipeline([
    ('selectGenres', DFTransform(lambda x: x['genres'], True))
    ,('stringToArray', StringToArray('|'))
    ,('explodeAndLabel', ArrayExplodePivot()),
     ('addMissingCols', AddMissingCategorialColumns())
])

In [412]:
numericalPipeline = Pipeline([
    ('selectNumerical', DFTransform(lambda x: x.select_dtypes(exclude=['object']))),
    ('impure', DFTransform(lambda x: x.fillna(x.mean()))),
    ('normalize', DFTransform(lambda x: x.div(x.max())))
])

In [413]:
# transforms entry type to categorical
def toCategorical(df):
    X = df # copy
    for col in X.columns.values.tolist():
        X[col] = X[col].astype('category', categories= list(set(X[col].values.tolist())))
    return X

# plotkeywords can be treated similar to genres
categoricalPipeline = Pipeline([
#    ('dropColumns', DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1))),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('transformToCategorical', DFTransform(lambda x: toCategorical(x))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x))),
    ('addMissingCols', AddMissingCategorialColumns())
])

In [376]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestRegressor

#completePipeline =
estimators = [('linear_pca', PCA()), ('forest', RandomForestRegressor(n_jobs=4))]
#features = FeatureUnion([('1', categoricalPipeline), ('2',numericalPipeline)])
features = FeatureUnion([('1', categoricalPipeline), ('2',numericalPipeline)])
#finalPipe = Pipeline([('linear_pca', PCA()),('forest', RandomForestRegressor(n_jobs=4))])

finalPipe = Pipeline([('featureSelection', features), ('rf',RandomForestRegressor(n_jobs=4) )])

In [377]:
from sklearn.model_selection import train_test_split
X, y = df, df['imdb_score']
#X_feat = features.transform(X)
X_train, X_test, y_train, y_test = train_test_split( X, df['imdb_score'], test_size=0.2, random_state=42)

In [385]:
#finalPipe.fit(X_train, y_train)
#genrePipeline.fit_transform(X_train).as_matrix().shape
#genrePipeline.fit_transform(X_train).head()
numericalPipeline.fit_transform(X_train).as_matrix().shape
#categoricalPipeline.fit_transform(X_train).as_matrix().shape
#categoricalPipeline.fit_transform(X_train).columns.values.tolist()
#pd.DataFrame(features.transform(X_train)).head()
#print("test dataset")
#genrePipeline.transform(X_test).as_matrix().shape
#genrePipeline.fit_transform(X_train).head()
#numericalPipeline.transform(X_test).as_matrix().shape
#categoricalPipeline.transform(X_test).as_matrix().shape
#print("missing col")
#missing_cols = set( train.columns ) - set( test.columns )
#finalPipe.fit(X_train, y_train)

#X_train

#finalPipe.predict(X_test)

categoricalPipeline.fit_transform(X_train)

(4034, 16)

Unnamed: 0,color_Color,color_ Black and White,director_name_Joseph Ruben,director_name_Jeff Crook,director_name_Peter Cousens,director_name_John Hillcoat,director_name_Irwin Winkler,director_name_Ice Cube,director_name_Siddiq Barmak,director_name_Gene Teigland,...,content_rating_Unrated,content_rating_X,content_rating_Passed,content_rating_GP,content_rating_TV-Y7,content_rating_Not Rated,content_rating_TV-MA,content_rating_TV-G,content_rating_M,content_rating_PG
463,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3688,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2118,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3128,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1361,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
450,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4932,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
from sklearn.metrics import accuracy_score

In [279]:
y_pred = finalPipe.predict(X_test)

In [280]:
pd.DataFrame(y_pred).head()

Unnamed: 0,0
0,7.12
1,7.01
2,6.64
3,6.85
4,6.06


In [281]:
finalPipe.score(X_test, y_test)

0.82563613293367122

In [1]:
from hpsklearn import HyperoptEstimator, svc

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [2]:
df2 = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                       'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                       'baz': [1, 2, 3, 4, 5, 6]})
df2

Unnamed: 0,bar,baz,foo
0,A,1,one
1,B,2,one
2,C,3,one
3,A,4,two
4,B,5,two
5,C,6,two


In [5]:
df2.pivot(columns='bar', values='baz')
df2.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
0,1.0,,
1,,2.0,
2,,,3.0
3,4.0,,
4,,5.0,
5,,,6.0


bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [71]:
arrayExplode = ArrayExplodePivot()
stringToArray = StringToArray('|')

In [72]:
df_new = stringToArray.fit_transform(df.genres)

In [78]:
set(arrayExplode.__explode__(df_new).genres.values.tolist())

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [83]:
arrayExplode.__explode__(df_new).genres.astype('category')

0             Action
1          Adventure
2            Fantasy
3             Sci-Fi
4             Action
5          Adventure
6            Fantasy
7             Action
8          Adventure
9           Thriller
10            Action
11          Thriller
12       Documentary
13            Action
14         Adventure
15            Sci-Fi
16            Action
17         Adventure
18           Romance
19         Adventure
20         Animation
21            Comedy
22            Family
23           Fantasy
24           Musical
25           Romance
26            Action
27         Adventure
28            Sci-Fi
29         Adventure
            ...     
14474         Horror
14475       Thriller
14476          Crime
14477          Drama
14478          Drama
14479         Sci-Fi
14480       Thriller
14481       Thriller
14482         Action
14483          Crime
14484          Drama
14485        Romance
14486       Thriller
14487          Crime
14488          Drama
14489         Comedy
14490        

In [292]:
train = pd.DataFrame({'A': ['a','a','b']})
test = pd.DataFrame({'A': ['a','a','c']})
test
train

Unnamed: 0,A
0,a
1,a
2,c


Unnamed: 0,A
0,a
1,a
2,b


In [424]:
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test)
test_dummies
train_dummies

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [425]:
addMissing = AddMissingCategorialColumns(copy=False)

In [426]:
addMissing.fit(train_dummies)
test = addMissing.transform(train_dummies)

In [427]:
train_dummies
test_dummies

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [428]:
addMissing.transform(test_dummies)

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [429]:
test_dummies

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [562]:
# fixes mismatch in number of categorical variables in training and test dataset
class AddMissingCategorialColumns(TransformerMixin):
    def __init__(self, copy=True):
        self.copy = copy
        self.columns = 0
        
    def transform(self, df):
        return self.__add_missing_dummy_columns__(df)
        #df_ = df if not self.copy else df.copy()
        #test = self.__add_missing_dummy_columns__(df_)
        #test2 = self.__rem_additional_dummy_columns__(test)
        #return test2
    
    def fit(self, df, y=None):
        self.columns = df.columns.values.tolist()
        #print(self.columns)
        return self
    
    def __add_missing_dummy_columns__(self, df):
        missing_cols = set( self.columns ) - set( df.columns )
        print(len(missing_cols))
        if len(missing_cols) != 0:
            for c in missing_cols:
                df[c] = 0
        return df
            
    def __rem_additional_dummy_columns__(self, df):
        add_cols = set( df.columns ) - set( self.columns )
        if len(add_cols) != 0:
            for c in add_cols:
                del df[c]
        return df

categoricalPipeline2 = Pipeline([
    ('dropColumns', DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1))),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('transformToCategorical', DFTransform(lambda x: toCategorical(x))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x))),
    ('addMissingCols', AddMissingCategorialColumns())
])



In [563]:
#categoricalPipeline2.transform(X_test)

In [564]:
categoricalPipeline2.fit_transform(X_train)

0


Unnamed: 0,color_Color,color_ Black and White,director_name_Joseph Ruben,director_name_Jeff Crook,director_name_Peter Cousens,director_name_John Hillcoat,director_name_Irwin Winkler,director_name_Ice Cube,director_name_Siddiq Barmak,director_name_Gene Teigland,...,content_rating_Unrated,content_rating_X,content_rating_Passed,content_rating_GP,content_rating_TV-Y7,content_rating_Not Rated,content_rating_TV-MA,content_rating_TV-G,content_rating_M,content_rating_PG
463,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3688,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2118,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3128,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1361,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
450,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4932,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [519]:
categoricalPipeline2.transform(X_train).head()

0


Unnamed: 0,color_Color,color_ Black and White,director_name_Joseph Ruben,director_name_Jeff Crook,director_name_Peter Cousens,director_name_John Hillcoat,director_name_Irwin Winkler,director_name_Ice Cube,director_name_Siddiq Barmak,director_name_Gene Teigland,...,content_rating_Unrated,content_rating_X,content_rating_Passed,content_rating_GP,content_rating_TV-Y7,content_rating_Not Rated,content_rating_TV-MA,content_rating_TV-G,content_rating_M,content_rating_PG
463,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3688,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2118,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [492]:
X_train.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
463,Color,Albert Hughes,325.0,118.0,117.0,10000.0,Mila Kunis,18000.0,94822707.0,Action|Adventure|Drama|Thriller,...,560.0,English,USA,R,80000000.0,2010.0,15000.0,6.9,2.35,20000
3688,Color,David Wain,65.0,96.0,136.0,139.0,Ken Marino,4000.0,766487.0,Comedy|Romance,...,117.0,English,USA,R,5250000.0,2007.0,543.0,5.0,1.85,915
1270,Color,Roger Michell,212.0,107.0,50.0,58.0,Patti D'Arbanville,293.0,30993544.0,Comedy|Drama|Romance,...,156.0,English,USA,PG-13,40000000.0,2010.0,117.0,6.5,2.35,0
2118,Color,Paul Verhoeven,231.0,145.0,719.0,104.0,Sebastian Koch,2000.0,4398392.0,Drama|Thriller|War,...,232.0,Dutch,Netherlands,R,21000000.0,2006.0,380.0,7.8,2.35,0
594,Color,Cameron Crowe,153.0,141.0,488.0,509.0,Ivana Milicevic,10000.0,100614858.0,Fantasy|Mystery|Romance|Sci-Fi|Thriller,...,1248.0,English,USA,R,68000000.0,2001.0,834.0,6.9,1.85,0


In [None]:
categoricalPipeline2