# IMDB Movie Regression

In [1]:
# Setting the ast_node_interactivity to "all" will print
# each statement, not only the last one of a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# dataframes and arrays
import pandas as pd
import numpy as np

# plotting libs
import seaborn as sns
from matplotlib import pyplot as plt

# machine learning
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder, Imputer
from sklearn.base import TransformerMixin, BaseEstimator

In [2]:
# load data
df = pd.read_csv("movie_metadata.csv")

In [3]:
# some important stats
df.info()
df.describe()
df.isnull().any()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


color                         True
director_name                 True
num_critic_for_reviews        True
duration                      True
director_facebook_likes       True
actor_3_facebook_likes        True
actor_2_name                  True
actor_1_facebook_likes        True
gross                         True
genres                       False
actor_1_name                  True
movie_title                  False
num_voted_users              False
cast_total_facebook_likes    False
actor_3_name                  True
facenumber_in_poster          True
plot_keywords                 True
movie_imdb_link              False
num_user_for_reviews          True
language                      True
country                       True
content_rating                True
budget                        True
title_year                    True
actor_2_facebook_likes        True
imdb_score                   False
aspect_ratio                  True
movie_facebook_likes         False
dtype: bool

# Create Custom Transformer for Pipeline

In [4]:
# base class where the others will inherit from
from sklearn.base import TransformerMixin

In [5]:
# Inherit from so you do not need a fit function
class NoFitMixin:
    def fit(self, df, y=None):
        return self

In [6]:
# class which takes a df and a function and returns the modified df
class DFTransform(TransformerMixin, BaseEstimator, NoFitMixin):
    def __init__(self, func, copy=False):
        self.func = func
        self.copy = copy

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        return self.func(df_)

In [7]:
# fixes mismatch in number of categorical variables in training and test dataset
class AddMissingCategorialColumns(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True):
        self.copy = copy
        self.columns = 0
        
    def transform(self, df):
        #return self.__add_missing_dummy_columns__(df)
        df_ = df if not self.copy else df.copy()
        self.__add_missing_dummy_columns__(df_)
        self.__rem_additional_dummy_columns__(df_)
        return df_
    
    def fit(self, df, y=None):
        self.columns = df.columns.values.tolist()
        return self
    
    def __add_missing_dummy_columns__(self, df):
        missing_cols = set( self.columns ) - set( df.columns )
        print(len(missing_cols))
        if len(missing_cols) != 0:
            for c in missing_cols:
                df[c] = 0
        return df
            
    def __rem_additional_dummy_columns__(self, df):
        add_cols = set( df.columns ) - set( self.columns )
        if len(add_cols) != 0:
            for c in add_cols:
                del df[c]
        return df

In [8]:
# transforms the genre string to a list: action|drama|... -> [action,drama,...]
class StringToArray(TransformerMixin, BaseEstimator, NoFitMixin):
    ''' Takes a Pandas Dataframe and two column names as input and transforms the input column '''
    def __init__(self, seperator, copy=True):
        self.copy = copy
        self.seperator = seperator

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        df_ = df.apply(lambda x: x.split(self.seperator))
        return df_

In [9]:
# explodes and array with categorical variables and transforms them to columns including binaries
class ArrayExplodePivot(TransformerMixin, BaseEstimator, NoFitMixin):
    '''Explodes the Array and adds column for each genre with binary field values
    (similar to a LabelBinarizer)'''
    
    def __init__(self, copy=True):
        self.copy = copy

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        df_ = self.__explode__(df_)
        #self.__explode__(df_)
        #return df_
        df_['val'] = 1
        return self.__pivot__(df_).fillna(int(0))
    
    def __explode__(self, df):
        rows = []
        ind = 0
        for row in df:
            for element in row:
                rows.append([ind,element])
            ind+=1
        return pd.DataFrame(data=rows, columns=["old_index", "genres"])
    
    def __pivot__(self, df):
        return df.pivot(index='old_index', columns='genres', values='val')
        return self
    

class CustomImpure(TransformerMixin, NoFitMixin, BaseEstimator):
    
    def __init__(self, replaceDict, copy=True):
        self.copy = copy
        self.replaceDict = replaceDict
    
    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        for key in self.replaceDict:
            df_.loc[df_[key].isnull(), key] = self.replaceDict[key]
            #if key == 'numerical' and self.replaceDict[key] == 'mean':
            #    col_nums = df_.select_dtypes(exclude=['object'])
            #    df_.loc[df_[col_nums].isnull(), col_nums] = self.replaceDict[key]
            #
            #df_.loc[df_[key].isnull(), key] = self.replaceDict[key]
        return df_

In [10]:
# custom feature union pipeline
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
from sklearn.externals.joblib import Parallel, delayed

class DFFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        # non-optimized default implementation; override when a better
        # method is possible
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, name, weight, X)
            for name, trans, weight in self._iter())
        return pd.concat(Xs, axis=1, join='inner')

# Pipeline Test

In [11]:
genrePipeline = Pipeline([
    ('selectGenres', DFTransform(lambda x: x['genres'], True)),
    ('stringToArray', StringToArray('|')),
    ('explodeAndLabel', ArrayExplodePivot()),
    ('addMissingCols', AddMissingCategorialColumns())
])

In [12]:
numericalPipeline = Pipeline([
    ('selectNumerical', DFTransform(lambda x: x.select_dtypes(exclude=['object']))),
    ('impure', DFTransform(lambda x: x.fillna(x.mean()))),
    ('normalize', DFTransform(lambda x: x.div(x.max())))
])

In [13]:
# transforms entry type to categorical
def toCategorical(df):
    X = df # copy
    for col in X.columns.values.tolist():
        X[col] = X[col].astype('category', categories= list(set(X[col].values.tolist())))
    return X

# transforms the genre string to a list: action|drama|... -> [action,drama,...]
class DropColumns(TransformerMixin, BaseEstimator, NoFitMixin):
    ''' Takes a Pandas Dataframe and two column names as input and transforms the input column '''
    def __init__(self, columns_list, copy=True):
        self.copy = copy
        self.columns_list = columns_list

    def transform(self, df):
        df_ = df if not self.copy else df.copy()
        temp_list = [] 
        for column in self.columns_list:
            if column in df_.columns:
                temp_list.append(column)
            else:
                print (column)
        return df_.drop(temp_list, axis = 1)
    
#if 'A' in df.columns:
#x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1)

# plotkeywords can be treated similar to genres
# drop columns need to be optional(if check)
replaceDict = {'director_name': 'unknown', 'actor_1_name': 'unknown', 'actor_2_name': 'unknown',
              'country': 'unknown'}

categoricalPipeline = Pipeline([
    ('dropGenre', DropColumns(['genres'])),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('customImpure', CustomImpure(replaceDict=replaceDict, copy=False)),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('transformToCategorical', DFTransform(lambda x: toCategorical(x))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x))),
    ('addMissingCols', AddMissingCategorialColumns())
])

In [37]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV

#clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs),n_jobs=-1)


#regressor = LinearRegression()
#regressor = GradientBoostingRegressor()
#regressor = svm.SVR(kernel='poly')
regressor = RandomForestRegressor(n_jobs=10)

from sklearn.model_selection import GridSearchCV

N_estimators = [120, 300, 500, 800 , 1200]
Max_depth = [5,8,15,25,30, None]
Min_sample_split = [1,2,5,10, 15, 100]
Min_sample_lead = [1,2,5,10]

param_grid = [
  {'regressor__n_estimators': [120, 300], 'regressor__criterion': ['mse', 'mae']
   , 'regressor__max_features':['auto', 'sqrt', 'log2'], 'regressor__max_depth': [8,30]
  ,  'regressor__min_samples_leaf': [2,10], 'regressor__min_samples_split': [2,10]}
#  ,{'n_estimators': [10, 100, 1000], 'criterion': ['mse', 'mae']}
 ]

#regressor = GridSearchCV(RandomForestRegressor(), param_grid)

features = FeatureUnion([('1', categoricalPipeline), ('2',numericalPipeline), ('3', genrePipeline)])

finalPipe = Pipeline([('dropColumns', DropColumns(['movie_imdb_link', 'imdb_score', 'plot_keywords'])),
                        ('featureSelection', features),
                        ('regressor', regressor)  ])

search = GridSearchCV(finalPipe, param_grid)

In [38]:
from sklearn.model_selection import train_test_split
X, y = df, df['imdb_score']
X_train, X_test, y_train, y_test = train_test_split( X, df['imdb_score'], test_size=0.2, random_state=42)

In [39]:
#finalPipe.fit(X_train, y_train)
search.fit(X_train, y_train)

0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2
0
0
0
0
7984
2
0
0
0
0
7941
0
0
0
0
0
7997
2

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('dropColumns', DropColumns(columns_list=['movie_imdb_link', 'imdb_score', 'plot_keywords'],
      copy=True)), ('featureSelection', FeatureUnion(n_jobs=1,
       transformer_list=[('1', Pipeline(memory=None,
     steps=[('dropGenre', DropColumns(columns_list=['genres'], copy=True)), ('select...imators=10, n_jobs=10,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'regressor__n_estimators': [120, 300], 'regressor__criterion': ['mse', 'mae'], 'regressor__max_features': ['auto', 'sqrt', 'log2'], 'regressor__max_depth': [8, 30], 'regressor__min_samples_leaf': [2, 10], 'regressor__min_samples_split': [2, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [40]:
featureSelection = search.best_estimator_.steps[1][1]
featureNumerical_1 = featureSelection.transformer_list[1][1]
featureGenres_0 = featureSelection.transformer_list[0][1]
featureCat_2 = featureSelection.transformer_list[2][1]


dropcol = DropColumns(['movie_imdb_link', 'imdb_score', 'plot_keywords'])

featureNumerical_1_list = featureNumerical_1.transform(dropcol.transform(X_train)).columns.values.tolist()
featureGenres_0_list = featureGenres_0.transform(dropcol.transform(X_train)).columns.values.tolist()
featureCat_2_list = featureCat_2.transform(dropcol.transform(X_train)).columns.values.tolist()

#featureNumerical_1_list
#featureGenres_0_list
#featureCat_2_list

allFeatures = featureGenres_0_list + featureNumerical_1_list + featureCat_2_list

#allFeatures

bestForest = search.best_estimator_.steps[2][1]
allFeaturesImportanceList = bestForest.feature_importances_.tolist()

len(allFeatures)
len(allFeaturesImportanceList)

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(allFeatures, allFeaturesImportanceList):
    feats[feature] = importance #add the name/value pair 
    
#feats

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)

0
0


13519

13519

<matplotlib.axes._subplots.AxesSubplot at 0x7fe99da65da0>

In [59]:
#plt.show()
importances.head()
#DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')[source]

importances.sort_values(by='Gini-importance', axis=0, ascending=False).head(40)

Unnamed: 0,Gini-importance
color_ Black and White,9.3e-05
color_Color,2.3e-05
director_name_Victor Nunez,0.0
director_name_Leslie H. Martinson,0.0
director_name_John Cameron Mitchell,0.0


Unnamed: 0,Gini-importance
num_voted_users,0.324927
Drama,0.136863
title_year,0.071573
duration,0.0567
budget,0.056693
Documentary,0.044804
num_user_for_reviews,0.036299
num_critic_for_reviews,0.032702
gross,0.03201
director_facebook_likes,0.023772


In [43]:
import pickle
from sklearn.externals import joblib

In [51]:
search.best_estimator_.get_params
pd.DataFrame(search.cv_results_).head()
search.best_params_

<bound method Pipeline.get_params of Pipeline(memory=None,
     steps=[('dropColumns', DropColumns(columns_list=['movie_imdb_link', 'imdb_score', 'plot_keywords'],
      copy=True)), ('featureSelection', FeatureUnion(n_jobs=1,
       transformer_list=[('1', Pipeline(memory=None,
     steps=[('dropGenre', DropColumns(columns_list=['genres'], copy=True)), ('select...mators=120, n_jobs=10,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])>

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_regressor__criterion,param_regressor__max_depth,param_regressor__max_features,param_regressor__min_samples_leaf,param_regressor__min_samples_split,param_regressor__n_estimators,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,4.951747,20.490077,0.352259,0.680862,mse,8,auto,2,2,120,...,0.263153,0.680903,0.370427,0.683777,0.423249,0.677906,0.086499,1.666643,0.066606,0.002397
1,11.842787,20.503628,0.353751,0.681524,mse,8,auto,2,2,300,...,0.27767,0.68155,0.358818,0.686524,0.424818,0.676499,0.076094,1.672095,0.060176,0.004093
2,4.881347,20.488467,0.355777,0.668488,mse,8,auto,2,10,120,...,0.280393,0.668802,0.367664,0.671714,0.41932,0.664947,0.04275,1.654513,0.057333,0.002771
3,11.664443,20.502347,0.345463,0.66809,mse,8,auto,2,10,300,...,0.274545,0.667272,0.345345,0.67135,0.416552,0.66565,0.097673,1.664462,0.05797,0.002398
4,4.77696,20.478247,0.356612,0.63548,mse,8,auto,10,2,120,...,0.284438,0.635666,0.387153,0.63554,0.398276,0.635233,0.046535,1.673643,0.051246,0.000182


{'regressor__criterion': 'mse',
 'regressor__max_depth': 30,
 'regressor__max_features': 'auto',
 'regressor__min_samples_leaf': 10,
 'regressor__min_samples_split': 2,
 'regressor__n_estimators': 120}

In [60]:
#joblib.dump(search.best_estimator_, 'search.pkl')

In [52]:
#finalPipe.get_params().keys()
y_pred = search.predict(X_test)
#sorted(search.cv_results_.keys())

11761
3


Returns the coefficient of determination R^2 of the prediction.
The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

In [53]:
from sklearn.metrics import accuracy_score
#finalPipe.score(X_test, y_test)
#search.score(X_test, y_test)

In [54]:
X_test_copy = X_test.copy()
X_test_copy['pred'] = y_pred
X_test_copy['imdb_score'] = y_test

In [55]:
pd.DataFrame(y_pred).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 1 columns):
0    1009 non-null float64
dtypes: float64(1)
memory usage: 8.0 KB


In [58]:
X_test_copy[['movie_title', 'imdb_score', 'pred']].head(40)

Unnamed: 0,movie_title,imdb_score,pred
4943,An American in Hollywood,7.2,6.385324
1919,Million Dollar Arm,7.0,7.283078
1049,Kate & Leopold,6.4,6.21354
4697,Naturally Native,6.5,6.190637
3312,Two Evil Eyes,6.1,5.987227
4387,You Can't Take It with You,8.0,7.322336
240,Star Wars: Episode I - The Phantom Menace,6.5,7.712324
4064,Pokémon 3: The Movie,5.6,5.906921
2702,The Dead Zone,7.5,6.508113
239,The Wolverine,6.7,7.043641


In [None]:
#rrf = finalPipe.steps[-1][1]
#rrf.feature_importances_
#features = finalPipe.steps[0][1]
#features[0]
#-np.sort(-rrf.feature_importances_, axis=None)

In [None]:
clr.fit(X_train, y_train)

# Tests to build the custom Transformers

In [None]:
genrePipeline = Pipeline([
    ('selectGenres', DFTransform(lambda x: x['genres'], True)),
    ('stringToArray', StringToArray('|')),
    ('explodeAndLabel', ArrayExplodePivot()),
#    ('addMissingCols', AddMissingCategorialColumns())
])

In [None]:
genrePipeline.fit_transform(X_train).head()

In [None]:
df2 = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                       'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                       'baz': [1, 2, 3, 4, 5, 6]})
df2

In [None]:
df2.pivot(columns='bar', values='baz')
df2.pivot(index='foo', columns='bar', values='baz')

In [None]:
arrayExplode = ArrayExplodePivot()
stringToArray = StringToArray('|')

In [None]:
df_new = stringToArray.fit_transform(df.genres)

In [None]:
set(arrayExplode.__explode__(df_new).genres.values.tolist())

In [None]:
arrayExplode.__explode__(df_new).genres.astype('category')

In [None]:
custom3 =  DFTransform(lambda x: x.fillna(x.mean()))
custom2 = DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))
train = pd.DataFrame({'A': ['a','a','a'], 'B': ['c','c',np.nan], 'C': [12, 5, np.nan]})
train
custom3.transform(train.C)
custom2.transform(train)
replace = CustomImpure({'A': 'Hans', 'B' : 'Franz'}, copy=False)
replace.fit_transform(train)
train

In [None]:
train = pd.DataFrame({'A': ['a','a','b']})
test = pd.DataFrame({'A': ['a','a','c']})
test
train

In [None]:
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test)
test_dummies
train_dummies

In [None]:
addMissing = AddMissingCategorialColumns(copy=False)

In [None]:
addMissing.fit(train_dummies)
test = addMissing.transform(train_dummies)

In [None]:
train_dummies
test_dummies

In [None]:
addMissing.transform(test_dummies)

In [None]:
test_dummies

In [None]:


categoricalPipeline2 = Pipeline([
    ('dropColumns', DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 'imdb_score', 'plot_keywords'],1))),
    ('selectCategorical', DFTransform(lambda x: x.select_dtypes(include=['object']))),
    ('impure', DFTransform(lambda x: x.apply(lambda x:x.fillna(x.value_counts().index[0])))),
    ('transformToCategorical', DFTransform(lambda x: toCategorical(x))),
    ('labelBinarize', DFTransform(lambda x: pd.get_dummies(x))),
    ('addMissingCols', AddMissingCategorialColumns())
])



In [None]:
#categoricalPipeline2.transform(X_test)

In [None]:
categoricalPipeline2.fit_transform(X_train)

In [None]:
categoricalPipeline2.transform(X_test).head()

In [None]:
X_train.head()

In [None]:



def dropColumns(df, column_list):
    test = df.copy()
    for column in column_list:
        if column in test.columns:
            test.drop(['genres'], axis=1)
        else:
            print(column)
            continue
    return test

dropShit = DFTransform(lambda x: x.drop(['genres','movie_imdb_link', 
                                                         'plot_keywords'], axis = 1), copy=False)

dropSupershit =  DFTransform(lambda x: dropColumns(x,['genres','movie_imdb_link', 
                                                       'imdb_score', 'plot_keywords']))

dropAwesome = DropColumns(['genres','movie_imdb_link','plot_keywords'], copy=False)

In [None]:
X_train.genres.head()

In [None]:
test = dropShit.transform(X_train)

In [None]:
test.genres.head()

In [None]:
test2 = dropAwesome.transform(X_train)

In [None]:
test2.genres