In [54]:
# Setup cell: imports, pandas display options, training-data load, and the
# shared objects used by the feature-engineering helpers in the next cell.
import pandas as pd
import  numpy as np
import ast
import json
import datetime
# NOTE(review): star imports hide where names come from and may rebind names
# imported above. `add_datepart` (called in the next cell) is presumably
# provided by one of these two modules -- confirm and import it explicitly.
from fastai.imports import *
from fastai.tabular import *
# Widen pandas' display limits so wide frames are not truncated when rendered.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from sklearn.preprocessing import MultiLabelBinarizer
# NOTE(review): `ast` is already imported above; this second import is redundant.
import ast
# Render floats with two decimal places in DataFrame output.
pd.options.display.float_format = '{:.2f}'.format

# Load the training set, parsing `release_date` into datetimes.
# NOTE(review): absolute, machine-specific path -- this cell will not run on
# another machine without editing it.
train = pd.read_csv('C:/Users/Idan/Documents/GitHub/machine-learning/Kaggle_TMDB_Box_Office_Prediction/Data/train.csv', parse_dates=['release_date'])
#test = pd.read_csv('C:/Users/idan.benaun/Documents/GitHub/Kaggle_TMDB_Box_Office_Prediction/Data/test.csv')

# Wherever the parsed year is later than 2017, move the date back 100 years.
# Presumably this corrects two-digit years that the parser placed in the wrong
# century -- confirm against the raw CSV.
train['release_date'] = train['release_date'].mask(train['release_date'].dt.year > 2017, 
                                                   train['release_date'] - pd.offsets.DateOffset(years=100))

# Not referenced anywhere else in the visible cells.
dict_columns = ['belongs_to_collection', 'genres', 'production_companies', 'cast']
# Shared binarizer: the helpers in the next cell call `fit_transform` on this
# single instance, so it is re-fitted on every call.
mlb = MultiLabelBinarizer()

In [55]:
def convert_string_to_list(strVal):
    """Turn a stringified Python literal into the object it describes.

    Args:
        strVal: Cell value to parse. Anything whose type is not exactly
            ``str`` (for example ``None`` or a float NaN) is treated as
            "no data".

    Returns:
        The result of ``ast.literal_eval(strVal)`` for a ``str`` input,
        otherwise an empty list.

    Raises:
        Whatever ``ast.literal_eval`` raises when the text is not a valid
        Python literal.
    """
    # Exact type test on purpose: subclasses of str are treated as non-strings.
    if type(strVal) is str:
        return ast.literal_eval(strVal)
    return []
    
def format_dict_column_and_extract_names(strVal, col="name"):
    """Parse a stringified list of dicts and collect one field from each item.

    Args:
        strVal: Raw cell value; parsed with ``convert_string_to_list``, so
            non-string values produce an empty result.
        col: Key to read from every parsed item.

    Returns:
        A list with ``item[col]`` for each parsed item, in original order.

    Raises:
        KeyError: If a parsed item has no ``col`` key.
    """
    return [entry[col] for entry in convert_string_to_list(strVal)]

def extract_genres(df):
    """Replace the ``genres`` column with one indicator column per genre.

    The raw ``genres`` values are parsed into lists of names and binarized
    with the module-level ``mlb``; each resulting column is named
    ``genre_<name>``.

    Unlike the previous implementation, the input frame is left untouched
    (it used to have ``genres`` overwritten and then popped, so calling the
    function twice on the same frame raised ``KeyError``).

    Args:
        df: DataFrame containing a ``genres`` column.

    Returns:
        A new DataFrame: ``df`` without ``genres``, followed by the
        ``genre_*`` indicator columns, on the same index.
    """
    genre_lists = df['genres'].apply(format_dict_column_and_extract_names)

    # fit_transform re-fits the shared binarizer; `mlb.classes_` is read right
    # after it so the column labels match this fit.
    encoded = pd.DataFrame(
        mlb.fit_transform(genre_lists),
        columns=['genre_' + name for name in mlb.classes_],
        index=df.index,
    )

    # drop() without inplace returns a new frame, so the caller's data survives.
    return df.drop(['genres'], axis=1).join(encoded)

def extract_common(df, col='production_companies', limit=30):
    """One-hot encode the frequent names found in a list-of-dicts column.

    Each value of ``df[col]`` is parsed into a list of names. Names occurring
    strictly more than ``limit`` times across the whole column are kept and
    binarized with the module-level ``mlb`` into ``<col>_<name>`` indicator
    columns; rarer names are discarded.

    Changes from the previous implementation:
      * the input frame is no longer modified (it used to have ``col``
        overwritten and popped, making a second call raise ``KeyError``);
      * occurrences are counted with ``collections.Counter`` instead of
        expanding every list into a wide temporary frame via
        ``apply(pd.Series).stack()``.

    Args:
        df: DataFrame containing ``col``.
        col: Name of the column holding stringified lists of dicts.
        limit: A name is kept only if its total count is greater than this.

    Returns:
        A new DataFrame: ``df`` without ``col``, followed by the indicator
        columns, on the same index.
    """
    from collections import Counter

    name_lists = df[col].apply(format_dict_column_and_extract_names)

    # None entries are skipped, mirroring stack(), which dropped missing values.
    counts = Counter(
        name for names in name_lists for name in names if name is not None
    )
    frequent = {name for name, seen in counts.items() if seen > limit}

    filtered = name_lists.apply(
        lambda names: [name for name in names if name in frequent]
    )

    # fit_transform re-fits the shared binarizer; `mlb.classes_` is read right
    # after it so the column labels match this fit.
    encoded = pd.DataFrame(
        mlb.fit_transform(filtered),
        columns=[col + '_' + name for name in mlb.classes_],
        index=df.index,
    )

    # drop() without inplace returns a new frame, so the caller's data survives.
    return df.drop([col], axis=1).join(encoded)

def add_x_length_column(df, col='cast'):
    """Replace a list-of-dicts column with the number of items it holds.

    Each value of ``df[col]`` is parsed with
    ``format_dict_column_and_extract_names`` (which reads the ``name`` key of
    every item) and the length of the resulting list is stored in
    ``<col>_len``. Values that are not strings count as 0 items.

    Unlike the previous implementation, the input frame is not modified; the
    result is returned as a new DataFrame.

    Args:
        df: DataFrame containing ``col``.
        col: Name of the column holding stringified lists of dicts.

    Returns:
        A new DataFrame: ``df`` without ``col``, with ``<col>_len`` appended
        as the last column.
    """
    lengths = df[col].apply(format_dict_column_and_extract_names).apply(len)

    # drop() and assign() both return new frames, leaving the caller's df intact.
    return df.drop([col], axis=1).assign(**{col + '_len': lengths})

def last_year_avg_revenue(df, agg_col='release_Year', calc_col='revenue', leg=1):
    """Attach the mean of ``calc_col`` from an earlier period to every row.

    Rows are grouped by ``agg_col``, ``calc_col`` is averaged per group, and
    the per-group means are shifted by ``leg`` positions before being merged
    back onto ``df``.

    The previous implementation hard-coded ``'revenue'`` in the rename and
    ``'release_Year'`` in the merge, so the ``agg_col`` and ``calc_col``
    parameters only worked with their default values. Both are honoured now.

    Note: the shift is positional over the sorted distinct values of
    ``agg_col``. If a period is absent from the data, "previous" means the
    previous period that is present, not ``agg_col - leg``.

    Args:
        df: Input DataFrame containing ``agg_col`` and ``calc_col``.
        agg_col: Column defining the periods to group by.
        calc_col: Column whose per-period mean is computed.
        leg: Number of periods to lag by.

    Returns:
        A new DataFrame (left merge of ``df`` with the lagged means) with one
        extra column named ``'last_period_' + calc_col``, which is
        ``last_period_revenue`` for the defaults. Rows in the first ``leg``
        periods get NaN.
    """
    lagged_name = 'last_period_' + calc_col

    lagged_means = (
        df.groupby(agg_col)[calc_col]
        .mean()
        .shift(leg)
        .rename(lagged_name)
        .reset_index()
    )

    return pd.merge(df, lagged_means, on=[agg_col], how='left')

# Feature-engineering pipeline: each step takes the frame produced by the
# previous one. `add_datepart` has to run before `last_year_avg_revenue`,
# whose default grouping column is `release_Year` (presumably created by
# add_datepart from `release_date` -- confirm).
df = (
    extract_genres(train)
    .pipe(extract_common, col='production_companies', limit=30)
    .pipe(extract_common, col='production_countries', limit=20)
    .pipe(extract_common, col='spoken_languages', limit=10)
    .pipe(add_x_length_column, col='cast')
    .pipe(add_x_length_column, col='crew')
    .pipe(add_datepart, 'release_date')
    .pipe(last_year_avg_revenue)
)

# Collapse `belongs_to_collection` into a 0/1 flag (1 when a value is present).
# pop() removes the raw column in the same step.
df['belongs_to_collection_bool'] = np.where(df.pop('belongs_to_collection').notna(), 1, 0)


df.head(10)


Unnamed: 0,id,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,runtime,status,tagline,title,Keywords,revenue,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Foreign,genre_History,genre_Horror,genre_Music,genre_Mystery,genre_Romance,genre_Science Fiction,genre_TV Movie,genre_Thriller,genre_War,genre_Western,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_United Kingdom,production_countries_United States of 
America,spoken_languages_,spoken_languages_Dansk,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Latin,spoken_languages_Magyar,spoken_languages_Polski,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_Türkçe,spoken_languages_svenska,spoken_languages_Český,spoken_languages_ελληνικά,spoken_languages_עִבְרִית,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_தமிழ்,spoken_languages_ภาษาไทย,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,cast_len,crew_len,release_Year,release_Month,release_Week,release_Day,release_Dayofweek,release_Dayofyear,release_Is_month_end,release_Is_month_start,release_Is_quarter_end,release_Is_quarter_start,release_Is_year_end,release_Is_year_start,release_Elapsed,last_period_revenue,belongs_to_collection_bool
0,1,14000000,,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.58,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,93.0,Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...",12314651,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,72,2015,2,8,20,4,51,False,False,False,False,False,False,1424390400,76690081.33,1
1,2,40000000,,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.25,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,113.0,Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...",95149435,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,9,2004,8,32,6,4,219,False,False,False,False,False,False,1091750400,78921195.15,1
2,3,3300000,http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.3,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,105.0,Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...",13092000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,64,2014,10,41,10,4,283,False,False,False,False,False,False,1412899200,72400505.96,0
3,4,1200000,http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.17,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,122.0,Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...",16000000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,7,3,2012,3,10,9,4,69,False,False,False,False,False,False,1331251200,72719445.68,0
4,5,0,,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.15,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,118.0,Released,,Marine Boy,,3923970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,2,2009,2,6,5,3,36,False,False,False,False,False,False,1233792000,80945077.79,0
5,6,8000000,,tt0093743,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.74,/6IDqA1D2NBIVhzEEaMMRL28iBrq.jpg,83.0,Released,,Pinocchio and the Emperor of the Night,,3261638,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,11,1987,8,32,6,3,218,False,False,False,False,False,False,555206400,37631510.64,0
6,7,14000000,http://www.thepossessionmovie.com/,tt0431021,en,The Possession,A young girl buys an antique box at a yard sal...,7.29,/4QjzFuaZmB4btGnLwAgdp23BzIU.jpg,92.0,Released,Fear The Demon That Doesn't Fear God,The Possession,,85446075,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,77,2012,8,35,30,3,243,False,False,False,False,False,False,1346284800,72719445.68,0
7,8,0,,tt0391024,en,Control Room,A chronicle which provides a rare window into ...,1.95,/83BV8fGy5p9i8RwE9oj76tZ1Ikp.jpg,84.0,Released,Different channels. Different truths.,Control Room,"[{'id': 917, 'name': 'journalism'}, {'id': 163...",2586511,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,1,2004,1,3,15,3,15,False,False,False,False,False,False,1074124800,78921195.15,0
8,9,0,,tt0117110,en,Muppet Treasure Island,After telling the story of Flint's last journe...,6.9,/5A8gKzOrF9Z7tSUX6xd5dEx4NXf.jpg,100.0,Released,Set sail for Muppet mayhem!,Muppet Treasure Island,"[{'id': 2041, 'name': 'island'}, {'id': 4418, ...",34327391,1,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,8,1996,2,7,16,4,47,False,False,False,False,False,False,824428800,74985827.1,1
9,10,6000000,,tt0310281,en,A Mighty Wind,"In ""A Mighty Wind"", director Christopher Guest...",4.67,/xwD4EsPOGLuKP50Q1gwqAsyhAFB.jpg,91.0,Released,"Back together for the first time, again.",A Mighty Wind,"[{'id': 11800, 'name': 'mockumentary'}, {'id':...",18750246,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,11,2003,4,16,16,2,106,False,False,False,False,False,False,1050451200,87773835.64,0
