# IMDB Movie Regression

In [79]:
# Setting the ast_node_interactivity to "all" will print
# each statement, not only the last one of a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# dataframes and arrays
import pandas as pd
import numpy as np

# plotting libs
import seaborn as sns
from matplotlib import pyplot as plt

# machine learning
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder, Imputer
from sklearn.base import TransformerMixin, BaseEstimator

In [2]:
# load data
df = pd.read_csv("movie_metadata.csv")

In [4]:
# some important stats
df.info()
df.describe()
df.isnull().any()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


color                         True
director_name                 True
num_critic_for_reviews        True
duration                      True
director_facebook_likes       True
actor_3_facebook_likes        True
actor_2_name                  True
actor_1_facebook_likes        True
gross                         True
genres                       False
actor_1_name                  True
movie_title                  False
num_voted_users              False
cast_total_facebook_likes    False
actor_3_name                  True
facenumber_in_poster          True
plot_keywords                 True
movie_imdb_link              False
num_user_for_reviews          True
language                      True
country                       True
content_rating                True
budget                        True
title_year                    True
actor_2_facebook_likes        True
imdb_score                   False
aspect_ratio                  True
movie_facebook_likes         False
dtype: bool

# Create Custom Transformer for Pipeline

In [7]:
# base class where the others will inherit from
from sklearn.base import TransformerMixin

In [8]:
# Inherit from so you do not need a fit function
class NoFitMixin:
    def fit(self, df, y=None):
        return self

In [84]:
# class which takes a df and a function and returns the modified df
class DFTransform(TransformerMixin, BaseEstimator, NoFitMixin):
    def __init__(self, func, copy=False):
        self.func = func
        self.copy = copy

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        return self.func(df_)

In [119]:
# fixes mismatch in number of categorical variables in training and test dataset
class AddMissingCategorialColumns(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True):
        self.copy = copy
        self.columns = 0
        
    def transform(self, df):
        #return self.__add_missing_dummy_columns__(df)
        df_ = df if not self.copy else df.copy()
        self.__add_missing_dummy_columns__(df_)
        self.__rem_additional_dummy_columns__(df_)
        return df_
    
    def fit(self, df, y=None):
        self.columns = df.columns.values.tolist()
        return self
    
    def __add_missing_dummy_columns__(self, df):
        missing_cols = set( self.columns ) - set( df.columns )
        print(len(missing_cols))
        if len(missing_cols) != 0:
            for c in missing_cols:
                df[c] = 0
        return df
            
    def __rem_additional_dummy_columns__(self, df):
        add_cols = set( df.columns ) - set( self.columns )
        if len(add_cols) != 0:
            for c in add_cols:
                del df[c]
        return df

In [120]:
# transforms the genre string to a list: action|drama|... -> [action,drama,...]
class StringToArray(TransformerMixin, BaseEstimator, NoFitMixin):
    ''' Takes a Pandas Dataframe and two column names as input and transforms the input column '''
    def __init__(self, seperator, copy=True):
        self.copy = copy
        self.seperator = seperator

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        df_ = df.apply(lambda x: x.split(self.seperator))
        return df_

In [121]:
# explodes and array with categorical variables and transforms them to columns including binaries
class ArrayExplodePivot(TransformerMixin, BaseEstimator, NoFitMixin):
    '''Explodes the Array and adds column for each genre with binary field values
    (similar to a LabelBinarizer)'''
    
    def __init__(self, copy=True):
        self.copy = copy

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        df_ = self.__explode__(df_)
        #self.__explode__(df_)
        #return df_
        df_['val'] = 1
        return self.__pivot__(df_).fillna(int(0))
    
    def __explode__(self, df):
        rows = []
        ind = 0
        for row in df:
            for element in row:
                rows.append([ind,element])
            ind+=1
        return pd.DataFrame(data=rows, columns=["old_index", "genres"])
    
    def __pivot__(self, df):
        return df.pivot(index='old_index', columns='genres', values='val')
        return self
    

class CustomImpure(TransformerMixin, NoFitMixin, BaseEstimator):
    
    def __init__(self, replaceDict, copy=True):
        self.copy = copy
        self.replaceDict = replaceDict
    
    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        for key in self.replaceDict:
            df_.loc[df_[key].isnull(), key] = self.replaceDict[key]
            #if key == 'numerical' and self.replaceDict[key] == 'mean':
            #    col_nums = df_.select_dtypes(exclude=['object'])
            #    df_.loc[df_[col_nums].isnull(), col_nums] = self.replaceDict[key]
            #
            #df_.loc[df_[key].isnull(), key] = self.replaceDict[key]
        return df_

In [122]:
# custom feature union pipeline
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
from sklearn.externals.joblib import Parallel, delayed

class DFFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        # non-optimized default implementation; override when a better
        # method is possible
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, name, weight, X)
            for name, trans, weight in self._iter())
        return pd.concat(Xs, axis=1, join='inner')

# Pipeline Test

In [123]:
genrePipeline = Pipeline([
    ('selectGenres', DFTransform(lambda x: x['genres'], True)),
    ('stringToArray', StringToArray('|')),
    ('explodeAndLabel', ArrayExplodePivot()),
    ('addMissingCols', AddMissingCategorialColumns())
])

In [124]:
numericalPipeline = Pipeline([
    ('selectNumerical', DFTransform(lambda x: x.select_dtypes(exclude=['object']))),
    ('impure', DFTransform(lambda x: x.fillna(x.mean()))),
    ('normalize', DFTransform(lambda x: x.div(x.max())))
])

In [125]:
# transforms entry type to categorical
def toCategorical(df):
    X = df # copy
    for col in X.columns.values.tolist():
        X[col] = X[col].astype('category', categories= list(set(X[col].values.tolist())))
    return X

# transforms the genre string to a list: action|drama|... -> [action,drama,...]
class DropColumns(TransformerMixin, BaseEstimator, NoFitMixin):
    ''' Takes a Pandas Dataframe and two column names as input and transforms the input column '''
    def __init__(self, columns_list, copy=True):
        self.copy = copy
        self.columns_list = columns_list

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        temp_list = [] 
        for column in self.columns_list:
            if column in df_.columns:
                temp_list.append(column)
            else:
                print (column)
        return df_.drop(temp_list, axis = 1)
    
#if 'A' in df.columns:
#x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1)

# plotkeywords can be treated similar to genres
# drop columns need to be optional(if check)
replaceDict = {'director_name': 'unknown', 'actor_1_name': 'unknown', 'actor_2_name': 'unknown',
              'country': 'unknown'}

categoricalPipeline = Pipeline([
    ('dropGenre', DropColumns(['genres'])),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('customImpure', CustomImpure(replaceDict=replaceDict, copy=False)),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('transformToCategorical', DFTransform(lambda x: toCategorical(x))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x))),
    ('addMissingCols', AddMissingCategorialColumns())
])

In [136]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV

#clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs),n_jobs=-1)


#regressor = LinearRegression()
#regressor = GradientBoostingRegressor()
#regressor = svm.SVR(kernel='poly')
regressor = RandomForestRegressor()

from sklearn.model_selection import GridSearchCV

param_grid = [
  {'regressor__n_estimators': [10], 'regressor__criterion': ['mse']
   , 'regressor__max_features':['sqrt', 'log2'], 'regressor__n_jobs':[10] }
#  ,{'n_estimators': [10, 100, 1000], 'criterion': ['mse', 'mae']}
 ]

#regressor = GridSearchCV(RandomForestRegressor(), param_grid)

features = FeatureUnion([('1', categoricalPipeline), ('2',numericalPipeline), ('3', genrePipeline)])

finalPipe = Pipeline([('dropColumns', DropColumns(['movie_imdb_link', 'imdb_score', 'plot_keywords'])),
                        ('featureSelection', features),
                        ('regressor', regressor)  ])

#search = GridSearchCV(finalPipe, param_grid)

In [137]:
from sklearn.model_selection import train_test_split
X, y = df, df['imdb_score']
X_train, X_test, y_train, y_test = train_test_split( X, df['imdb_score'], test_size=0.2, random_state=42)

In [139]:
#finalPipe.fit(X_train, y_train)
search.fit(X_train, y_train)

0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('dropColumns', DropColumns(columns_list=['movie_imdb_link', 'imdb_score', 'plot_keywords'],
      copy=True)), ('featureSelection', FeatureUnion(n_jobs=1,
       transformer_list=[('1', Pipeline(memory=None,
     steps=[('dropGenre', DropColumns(columns_list=['genres'], copy=True)), ('select...timators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'regressor__n_estimators': [100], 'regressor__criterion': ['mse'], 'regressor__max_features': ['sqrt', 'log2'], 'regressor__n_jobs': [10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [189]:
featureSelection = search.best_estimator_.steps[1][1]
featureNumerical_1 = featureSelection.transformer_list[1][1]
featureGenres_0 = featureSelection.transformer_list[0][1]
featureCat_2 = featureSelection.transformer_list[2][1]


dropcol = DropColumns(['movie_imdb_link', 'imdb_score', 'plot_keywords'])

featureNumerical_1_list = featureNumerical_1.transform(dropcol.transform(X_train)).columns.values.tolist()
featureGenres_0_list = featureGenres_0.transform(dropcol.transform(X_train)).columns.values.tolist()
featureCat_2_list = featureCat_2.transform(dropcol.transform(X_train)).columns.values.tolist()

#featureNumerical_1_list
#featureGenres_0_list
#featureCat_2_list

allFeatures = featureGenres_0_list + featureNumerical_1_list + featureCat_2_list

#allFeatures

bestForest = search.best_estimator_.steps[2][1]
allFeaturesImportanceList = bestForest.feature_importances_.tolist()

len(allFeatures)
len(allFeaturesImportanceList)

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(allFeatures, allFeaturesImportanceList):
    feats[feature] = importance #add the name/value pair 
    
#feats

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)

0
0


13519

13519

<matplotlib.axes._subplots.AxesSubplot at 0x7f2145580898>

In [197]:
#plt.show()
importances.head()
#DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')[source]

importances.sort_values(by='Gini-importance', axis=0, ascending=False).head(20)

Unnamed: 0,Gini-importance
color_ Black and White,0.005029
color_Color,0.003451
director_name_Anthony Russo,8e-05
director_name_Quentin Tarantino,0.000477
director_name_Andrey Zvyagintsev,1.8e-05


Unnamed: 0,Gini-importance
num_voted_users,0.055178
duration,0.035883
num_user_for_reviews,0.033709
num_critic_for_reviews,0.027798
Drama,0.024492
movie_facebook_likes,0.023237
director_facebook_likes,0.022572
title_year,0.022322
gross,0.018533
cast_total_facebook_likes,0.018397


In [140]:
finalPipe.get_params().keys()
y_pred = search.predict(X_test)
#sorted(search.cv_results_.keys())

dict_keys(['memory', 'steps', 'dropColumns', 'featureSelection', 'regressor', 'dropColumns__columns_list', 'dropColumns__copy', 'featureSelection__n_jobs', 'featureSelection__transformer_list', 'featureSelection__transformer_weights', 'featureSelection__1', 'featureSelection__2', 'featureSelection__3', 'featureSelection__1__memory', 'featureSelection__1__steps', 'featureSelection__1__dropGenre', 'featureSelection__1__selectCategorical', 'featureSelection__1__customImpure', 'featureSelection__1__impure', 'featureSelection__1__transformToCategorical', 'featureSelection__1__labelBinarize', 'featureSelection__1__addMissingCols', 'featureSelection__1__dropGenre__columns_list', 'featureSelection__1__dropGenre__copy', 'featureSelection__1__selectCategorical__copy', 'featureSelection__1__selectCategorical__func', 'featureSelection__1__customImpure__copy', 'featureSelection__1__customImpure__replaceDict', 'featureSelection__1__impure__copy', 'featureSelection__1__impure__func', 'featureSelectio

11761
3


Returns the coefficient of determination R^2 of the prediction.
The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

In [142]:
from sklearn.metrics import accuracy_score
#finalPipe.score(X_test, y_test)
search.score(X_test, y_test)

11761
3


0.15732763024161356

In [143]:
X_test_copy = X_test.copy()
X_test_copy['pred'] = y_pred
X_test_copy['imdb_score'] = y_test

In [144]:
pd.DataFrame(y_pred).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 1 columns):
0    1009 non-null float64
dtypes: float64(1)
memory usage: 8.0 KB


In [145]:
X_test_copy[['movie_title', 'imdb_score', 'pred']].head(20)

Unnamed: 0,movie_title,imdb_score,pred
4943,An American in Hollywood,7.2,6.352
1919,Million Dollar Arm,7.0,7.026
1049,Kate & Leopold,6.4,6.664
4697,Naturally Native,6.5,6.47
3312,Two Evil Eyes,6.1,6.804
4387,You Can't Take It with You,8.0,7.034
240,Star Wars: Episode I - The Phantom Menace,6.5,7.344
4064,Pokémon 3: The Movie,5.6,6.216
2702,The Dead Zone,7.5,6.608
239,The Wolverine,6.7,7.15


In [None]:
#rrf = finalPipe.steps[-1][1]
#rrf.feature_importances_
#features = finalPipe.steps[0][1]
#features[0]
#-np.sort(-rrf.feature_importances_, axis=None)

In [57]:
clr.fit(X_train, y_train)

AttributeError: 'DropColumns' object has no attribute 'get_params'

# Tests to build the custom Transformers

In [112]:
genrePipeline = Pipeline([
    ('selectGenres', DFTransform(lambda x: x['genres'], True)),
    ('stringToArray', StringToArray('|')),
    ('explodeAndLabel', ArrayExplodePivot()),
#    ('addMissingCols', AddMissingCategorialColumns())
])

In [113]:
genrePipeline.fit_transform(X_train).head()

genres,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
old_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [2]:
df2 = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                       'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                       'baz': [1, 2, 3, 4, 5, 6]})
df2

Unnamed: 0,bar,baz,foo
0,A,1,one
1,B,2,one
2,C,3,one
3,A,4,two
4,B,5,two
5,C,6,two


In [5]:
df2.pivot(columns='bar', values='baz')
df2.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
0,1.0,,
1,,2.0,
2,,,3.0
3,4.0,,
4,,5.0,
5,,,6.0


bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [71]:
arrayExplode = ArrayExplodePivot()
stringToArray = StringToArray('|')

In [72]:
df_new = stringToArray.fit_transform(df.genres)

In [78]:
set(arrayExplode.__explode__(df_new).genres.values.tolist())

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [83]:
arrayExplode.__explode__(df_new).genres.astype('category')

0             Action
1          Adventure
2            Fantasy
3             Sci-Fi
4             Action
5          Adventure
6            Fantasy
7             Action
8          Adventure
9           Thriller
10            Action
11          Thriller
12       Documentary
13            Action
14         Adventure
15            Sci-Fi
16            Action
17         Adventure
18           Romance
19         Adventure
20         Animation
21            Comedy
22            Family
23           Fantasy
24           Musical
25           Romance
26            Action
27         Adventure
28            Sci-Fi
29         Adventure
            ...     
14474         Horror
14475       Thriller
14476          Crime
14477          Drama
14478          Drama
14479         Sci-Fi
14480       Thriller
14481       Thriller
14482         Action
14483          Crime
14484          Drama
14485        Romance
14486       Thriller
14487          Crime
14488          Drama
14489         Comedy
14490        

In [40]:
custom3 =  DFTransform(lambda x: x.fillna(x.mean()))
custom2 = DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))
train = pd.DataFrame({'A': ['a','a','a'], 'B': ['c','c',np.nan], 'C': [12, 5, np.nan]})
train
custom3.transform(train.C)
custom2.transform(train)
replace = CustomImpure({'A': 'Hans', 'B' : 'Franz'}, copy=False)
replace.fit_transform(train)
train

Unnamed: 0,A,B,C
0,a,c,12.0
1,a,c,5.0
2,a,,


0    12.0
1     5.0
2     8.5
Name: C, dtype: float64

Unnamed: 0,A,B,C
0,a,c,12.0
1,a,c,5.0
2,a,c,5.0


Unnamed: 0,A,B,C
0,a,c,12.0
1,a,c,5.0
2,a,Franz,


Unnamed: 0,A,B,C
0,a,c,12.0
1,a,c,5.0
2,a,Franz,


In [292]:
train = pd.DataFrame({'A': ['a','a','b']})
test = pd.DataFrame({'A': ['a','a','c']})
test
train

Unnamed: 0,A
0,a
1,a
2,c


Unnamed: 0,A
0,a
1,a
2,b


In [424]:
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test)
test_dummies
train_dummies

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [425]:
addMissing = AddMissingCategorialColumns(copy=False)

In [426]:
addMissing.fit(train_dummies)
test = addMissing.transform(train_dummies)

In [427]:
train_dummies
test_dummies

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [428]:
addMissing.transform(test_dummies)

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [429]:
test_dummies

Unnamed: 0,A_a,A_b
0,1,0
1,1,0
2,0,1


In [565]:


categoricalPipeline2 = Pipeline([
    ('dropColumns', DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1))),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('transformToCategorical', DFTransform(lambda x: toCategorical(x))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x))),
    ('addMissingCols', AddMissingCategorialColumns())
])



In [566]:
#categoricalPipeline2.transform(X_test)

In [567]:
categoricalPipeline2.fit_transform(X_train)

0


Unnamed: 0,color_Color,color_ Black and White,director_name_Joseph Ruben,director_name_Jeff Crook,director_name_Peter Cousens,director_name_John Hillcoat,director_name_Irwin Winkler,director_name_Ice Cube,director_name_Siddiq Barmak,director_name_Gene Teigland,...,content_rating_Unrated,content_rating_X,content_rating_Passed,content_rating_GP,content_rating_TV-Y7,content_rating_Not Rated,content_rating_TV-MA,content_rating_TV-G,content_rating_M,content_rating_PG
463,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3688,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2118,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3128,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1361,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
450,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4932,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [569]:
categoricalPipeline2.transform(X_test).head()

11760


Unnamed: 0,color_Color,color_ Black and White,director_name_Mario Van Peebles,director_name_David Raynr,director_name_Leslie Small,director_name_Blake Edwards,director_name_Miguel Arteta,director_name_Brian De Palma,director_name_Don Coscarelli,director_name_Vadim Perelman,...,actor_2_name_Bubba Smith,actor_3_name_Darren Kendrick,actor_2_name_John Doman,actor_3_name_Tom McCarthy,actor_2_name_Billy Boyd,actor_1_name_Heather Donahue,actor_2_name_Richard Anderson,movie_title_Hotel Rwanda,movie_title_Where the Heart Is,actor_1_name_Amber Stevens West
4943,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1919,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1049,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4697,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3312,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [492]:
X_train.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
463,Color,Albert Hughes,325.0,118.0,117.0,10000.0,Mila Kunis,18000.0,94822707.0,Action|Adventure|Drama|Thriller,...,560.0,English,USA,R,80000000.0,2010.0,15000.0,6.9,2.35,20000
3688,Color,David Wain,65.0,96.0,136.0,139.0,Ken Marino,4000.0,766487.0,Comedy|Romance,...,117.0,English,USA,R,5250000.0,2007.0,543.0,5.0,1.85,915
1270,Color,Roger Michell,212.0,107.0,50.0,58.0,Patti D'Arbanville,293.0,30993544.0,Comedy|Drama|Romance,...,156.0,English,USA,PG-13,40000000.0,2010.0,117.0,6.5,2.35,0
2118,Color,Paul Verhoeven,231.0,145.0,719.0,104.0,Sebastian Koch,2000.0,4398392.0,Drama|Thriller|War,...,232.0,Dutch,Netherlands,R,21000000.0,2006.0,380.0,7.8,2.35,0
594,Color,Cameron Crowe,153.0,141.0,488.0,509.0,Ivana Milicevic,10000.0,100614858.0,Fantasy|Mystery|Romance|Sci-Fi|Thriller,...,1248.0,English,USA,R,68000000.0,2001.0,834.0,6.9,1.85,0


In [316]:



def dropColumns(df, column_list):
    test = df.copy()
    for column in column_list:
        if column in test.columns:
            test.drop(['genres'], axis=1)
        else:
            print(column)
            continue
    return test

dropShit = DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 
                                                         'plot_keywords'], axis = 1), copy=False)

dropSupershit =  DFTransform(lambda x: dropColumns(x,['genres','movie_imdb_link', 
                                                       'imdb_score', 'plot_keywords']))

dropAwesome = DropColumns(['genres','movie_imdb_link','plot_keywords'], copy=False)

In [322]:
X_train.genres.head()

463             Action|Adventure|Drama|Thriller
3688                             Comedy|Romance
1270                       Comedy|Drama|Romance
2118                         Drama|Thriller|War
594     Fantasy|Mystery|Romance|Sci-Fi|Thriller
Name: genres, dtype: object

In [323]:
test = dropShit.transform(X_train)

In [324]:
test.genres.head()

AttributeError: 'DataFrame' object has no attribute 'genres'

In [325]:
test2 = dropAwesome.transform(X_train)

In [327]:
test2.genres

AttributeError: 'DataFrame' object has no attribute 'genres'