In [47]:
import pandas as pd
import  numpy as np
import ast
import json
# Raise pandas' display limits (rows, columns, line width) so that wide frames
# are rendered without truncation in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# Load the train and test CSVs of the Kaggle TMDB box-office data set.
# NOTE(review): both paths are absolute and point into one user's home
# directory, so this cell only runs on that machine as written.
train = pd.read_csv('C:/Users/Idan/Documents/GitHub/machine-learning/Kaggle_TMDB_Box_Office_Prediction/Data/train.csv')
test = pd.read_csv('C:/Users/Idan/Documents/GitHub/machine-learning/Kaggle_TMDB_Box_Office_Prediction/Data/test.csv')

# List of column names; presumably the columns that hold stringified lists of
# dicts -- TODO confirm (not referenced by the code visible in this section).
dict_columns = ['belongs_to_collection', 'genres', 'production_companies', 'cast']
# Shared binarizer: the extract_* helpers call mlb.fit_transform(...) and then
# read mlb.classes_ to name the indicator columns they create.
mlb = MultiLabelBinarizer()

In [48]:
def convert_string_to_list(strVal):
    """Parse a string holding a Python literal into the object it describes.

    Parameters
    ----------
    strVal : object
        Raw cell value. A string such as "[{'id': 1, 'name': 'x'}]" is
        evaluated with ``ast.literal_eval``. Anything that is not a string
        (e.g. NaN or None for a missing cell), as well as an empty or
        whitespace-only string, is treated as "no items".

    Returns
    -------
    list
        The evaluated literal for a non-blank string, otherwise ``[]``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # isinstance (rather than an exact type check) keeps str subclasses from
    # being silently discarded; blank strings carry no items to parse.
    if not isinstance(strVal, str) or not strVal.strip():
        return []
    return ast.literal_eval(strVal)
    
def format_dict_column_and_extract_names(strVal, col="name"):
    """Return the ``col`` entry of every item parsed from ``strVal``.

    ``strVal`` is handed to ``convert_string_to_list``; each resulting item is
    indexed with ``col`` (default ``"name"``) and the values are collected, in
    order, into a new list. An item lacking ``col`` raises whatever the item's
    ``__getitem__`` raises (``KeyError`` for a dict).
    """
    return [entry[col] for entry in convert_string_to_list(strVal)]

def extract_genres(df):
    """Replace the 'genres' column with one indicator column per genre.

    Each 'genres' cell is converted to a list of names with
    ``format_dict_column_and_extract_names``; the module-level ``mlb`` is then
    fitted on those lists and its output is joined back as columns named
    ``'genre_<class>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'genres' column. It is NOT modified, so the cell
        that calls this function can be re-run on the same input.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` without 'genres', plus the indicator columns,
        aligned on ``df.index``.

    Notes
    -----
    ``mlb`` is re-fitted on every call, so the set of output columns depends
    on the values present in ``df``.
    """
    genre_names = df['genres'].apply(format_dict_column_and_extract_names)
    # fit_transform runs before mlb.classes_ is read on the next line, so the
    # column labels always match the matrix that was just produced.
    indicator_matrix = mlb.fit_transform(genre_names)
    encoded = pd.DataFrame(indicator_matrix,
                           columns=['genre_' + name for name in mlb.classes_],
                           index=df.index)
    # drop() returns a copy, leaving the caller's frame untouched.
    return df.drop(columns=['genres']).join(encoded)

def extract_common_prod_companies(df, min_count=30):
    """Replace 'production_companies' with indicators for frequent companies.

    Each cell is converted to a list of company names with
    ``format_dict_column_and_extract_names``. Names occurring strictly more
    than ``min_count`` times across the whole column are kept; the module-level
    ``mlb`` is fitted on the filtered lists and its output is joined back as
    columns named ``'prod_company_<class>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'production_companies' column. It is NOT modified.
    min_count : int, optional
        Occurrence threshold; a company is kept only when its count is
        greater than this value. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` without 'production_companies', plus the
        indicator columns, aligned on ``df.index``.
    """
    company_lists = df['production_companies'].apply(format_dict_column_and_extract_names)

    # Count every occurrence across all rows. dtype=object keeps an empty
    # input well-defined; value_counts() ignores missing names.
    all_names = [name for names in company_lists for name in names]
    counts = pd.Series(all_names, dtype=object).value_counts()
    frequent = set(counts[counts > min_count].index)

    kept = company_lists.apply(lambda names: [name for name in names if name in frequent])

    # fit_transform runs before mlb.classes_ is read, so labels match the matrix.
    indicator_matrix = mlb.fit_transform(kept)
    encoded = pd.DataFrame(indicator_matrix,
                           columns=['prod_company_' + name for name in mlb.classes_],
                           index=df.index)
    # drop() returns a copy, leaving the caller's frame untouched.
    return df.drop(columns=['production_companies']).join(encoded)

def add_cast_length_column(df):
    """Add a 'cast_len' column and remove 'cast', modifying ``df`` in place.

    'cast_len' is the number of names that
    ``format_dict_column_and_extract_names`` extracts from each 'cast' cell.
    The same (mutated) frame object is returned.
    """
    cast_sizes = df['cast'].apply(format_dict_column_and_extract_names).apply(len)
    df['cast_len'] = cast_sizes
    df.drop(columns=['cast'], inplace=True)

    return df

def extract_date_values(df, cols=()):
    """Convert date columns to datetimes and add month/day/weekday columns.

    For every column name in ``cols`` the column is parsed with
    ``pd.to_datetime`` and three columns are added next to it:
    ``<col>_month``, ``<col>_day_month`` (day of the month) and
    ``<col>_day_week`` (``Series.dt.weekday``, Monday == 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment. It is modified in place.
    cols : str or iterable of str, optional
        Column name(s) to process. A single string is treated as one column
        name rather than being iterated character by character; ``None`` or
        an empty iterable means no columns. Defaults to ``()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Normalise the argument: the immutable default avoids a shared mutable
    # list, and a bare string must not be walked one character at a time.
    if cols is None:
        cols = ()
    elif isinstance(cols, str):
        cols = (cols,)

    for col in cols:
        parsed = pd.to_datetime(df[col])
        df[col] = parsed
        df[col + '_month'] = parsed.dt.month
        df[col + '_day_month'] = parsed.dt.day
        df[col + '_day_week'] = parsed.dt.weekday

    return df


# Feature pipeline: each step receives the frame returned by the previous one.
df = (
    train
    .pipe(extract_genres)
    .pipe(extract_common_prod_companies)
    .pipe(add_cast_length_column)
    .pipe(extract_date_values, ['release_date'])
)

# 1 when the row has a 'belongs_to_collection' value, 0 when it is missing;
# the raw column is dropped once the flag exists.
df['belongs_to_collection_bool'] = np.where(df['belongs_to_collection'].notna(), 1, 0)
df.drop(columns=['belongs_to_collection'], inplace=True)

df.head()


Unnamed: 0,id,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,crew,revenue,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Foreign,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,prod_company_Canal+,prod_company_Columbia Pictures,prod_company_Columbia Pictures Corporation,prod_company_Metro-Goldwyn-Mayer (MGM),prod_company_Miramax Films,prod_company_New Line Cinema,prod_company_Paramount Pictures,prod_company_Regency Enterprises,prod_company_Relativity Media,prod_company_Touchstone Pictures,prod_company_TriStar Pictures,prod_company_Twentieth Century Fox Film Corporation,prod_company_United Artists,prod_company_Universal Pictures,prod_company_Village Roadshow Pictures,prod_company_Walt Disney Pictures,prod_company_Warner Bros.,cast_len,release_date_month,release_date_day_month,release_date_day_week,belongs_to_collection_bool
0,1,14000000,,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'iso_3166_1': 'US', 'name': 'United States o...",2015-02-20,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,24,2,20,4,1
1,2,40000000,,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'iso_3166_1': 'US', 'name': 'United States o...",2004-08-06,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,20,8,6,4,1
2,3,3300000,http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'iso_3166_1': 'US', 'name': 'United States o...",2014-10-10,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,10,10,4,0
3,4,1200000,http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,"[{'iso_3166_1': 'IN', 'name': 'India'}]",2012-03-09,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,3,9,4,0
4,5,0,,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2009-02-05,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,5,3,0
