In [2]:
# Clone the entire repo.
!git clone -l -s git://github.com/Idan707/Kaggle_TMDB_Box_Office_Prediction.git cloned-repo
%cd cloned-repo
# !ls

fatal: destination path 'cloned-repo' already exists and is not an empty directory.
/content/cloned-repo


In [3]:
#pip install pandas==0.25
pip install catboost



# Import & Load

In [0]:
# imports
import pandas as pd
import  numpy as np
import ast
import json
import gensim
import datetime
import ast
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.preprocessing import MultiLabelBinarizer ,normalize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
import lightgbm as lgb
import catboost as cat
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GroupKFold
import xgboost as xgb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

train = pd.read_csv('/content/cloned-repo/Data/train.csv',
                    parse_dates=['release_date'])
test = pd.read_csv('/content/cloned-repo/Data/test.csv',
                    parse_dates=['release_date'])

# Date Fix
train['release_date'] = train['release_date'].mask(train['release_date'].dt.year > 2017, 
                                                   train['release_date'] - pd.offsets.DateOffset(years=100))
test['release_date'] = test['release_date'].mask(test['release_date'].dt.year > 2017, 
                                                   test['release_date'] - pd.offsets.DateOffset(years=100))

test['revenue'] = 0

mlb = MultiLabelBinarizer()

# Feature Engineering 

In [123]:
train.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue', 'train_test'], dtype='object')

In [10]:
def convert_string_to_list(strVal):
    if type(strVal) is not str:
        return  []
    else:
        return ast.literal_eval(strVal)
    
def format_dict_column_and_extract_names(strVal, col="name"):
    listOfItems = convert_string_to_list(strVal)

    return list(map(lambda x: x[col], listOfItems))

def add_x_length_column(df, col):
    df[col + '_size'] = df[col].apply(lambda x: len(x))
    
    return df

def extract_genres(df):
    df['genres'] = df['genres'].apply(format_dict_column_and_extract_names)
    df = add_x_length_column(df, col='genres')
    temp = mlb.fit_transform(df.pop('genres'))
    temp_df = pd.DataFrame(temp, columns=list(map(lambda x: 'genres'+'_'+x,mlb.classes_)), 
                              )#index=temp.index)
    print('extract_genres temp df shape: ', temp_df.shape)
    df = pd.merge(df, temp_df, how='inner', on=df.id)
    
    return df

def extract_common(df, col, limit, apply_dict_to_col=True):
    if apply_dict_to_col == True:
        df[col] = df[col].apply(format_dict_column_and_extract_names)
        
    companiesCount = df[col].apply(pd.Series).stack().value_counts().sort_values(axis=0, ascending=False)
    companiesToKeep = companiesCount[:limit].keys()
    
    add_x_length_column(df, col)
    df[col] = df[col].apply(lambda x: list(filter(lambda i: i in companiesToKeep, x)))
    
    temp = mlb.fit_transform(df.pop(col))
    temp_df = pd.DataFrame(temp, columns=list(map(lambda x: col+'_'+x,mlb.classes_)), 
                              )#index=temp.index)
    print('extract_common temp df shape: ', temp_df.shape)
    print('temp index: ',temp_df.index)
    df = pd.merge(df, temp_df, how='inner', left_index=True, right_index=True)

    return df

def last_year_metrics(df, agg_col='release_date_Year', calc_col=['revenue','imdb_id','popularity','budget'], leg=1):
    
    temp = df.groupby([agg_col]).agg(last_period_mean_revenue = (calc_col[0], 'mean'),
                                     last_period_median_revenue = (calc_col[0], 'median'),
                                     last_period_movies_count = (calc_col[1], 'count'),
                                     last_period_mean_popularity = (calc_col[2], 'mean'),
                                     last_period_median_popularity = (calc_col[2], 'median'),
                                     last_period_mean_budget = (calc_col[3], 'mean'),
                                     last_period_median_budget = (calc_col[3], 'median')).shift(leg).reset_index()

    return pd.merge(df, temp, on=['release_date_Year'], how='left')

def days_diff_from_last_movie(df, col='release_date'):#see if there is a need in buckts
    df = df.sort_values(by=[col])
    df['days_diff_from_last_movie'] = df[col].diff().fillna(0).dt.days
    
    return df

def ratios(df):
    df['budget_to_runtime'] = df['budget']/df['runtime']
    df['budget_to_cast_crew_size'] = df['budget']/(df['cast_size'] + df['crew_size'])
    df['budget_to_popularity'] = df['budget']/df['popularity']
    
    return df

def add_datepart(df,col ='release_date'):
    df[col] = pd.to_datetime(df[col])
    df[col +'_Year']  = df[col].dt.year
    df[col +'_Month'] = df[col].dt.month
    df[col +'_day_of_month'] = df[col].dt.day
    df[col +'_week_day'] = df[col].dt.dayofweek
    df[col +'_is_quarter_start'] = df[col].dt.is_quarter_start
    df[col +'_is_quarter_start'] = df[col +'_is_quarter_start'].fillna(False)
    
    return df 

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
            
    return result

def extract_common_overview_words(df, col):
    df[col] = df[col].fillna('').astype(str).map(preprocess)
    df[col] = df[col].map(lambda x: list(set(x)))
    df = extract_common(df, col, limit=15, apply_dict_to_col=False)
    
    return df
  
def extract_gender(strVal, dict_key, dict_value):
    listOfItems = convert_string_to_list(strVal)

    Item = (list(filter(lambda lst: lst[dict_key] == dict_value, listOfItems)) or [None])[0]
    if type(Item) is dict:
        return Item['gender']
    else:
        return None
      
def add_gender(df, col, dict_key_in , dict_value_in):
    df[dict_key_in + '_' + str(dict_value_in) + '_gender'] = df[col].apply(extract_gender, args=[dict_key_in, dict_value_in])
    df[dict_key_in + '_' + str(dict_value_in) + '_gender'] = df[dict_key_in + '_' + str(dict_value_in) + '_gender'].fillna(2).astype('int64')
    
    return df

def run_main(train, test):

  test['train_test'] = 'test'  
  train['train_test'] = 'train'

  # work on full df
  print('working on full df..')
  df = pd.concat([train, test],axis=0, sort=True)
  df = extract_genres(df)
  df = extract_common(df, col='production_companies', limit=30)
  df = extract_common(df, col='production_countries', limit=20)
  df = extract_common(df, col='spoken_languages', limit=15)
  df = extract_common(df, col='Keywords', limit=15)
  
  df = add_gender(df, col='crew', dict_key_in='job', dict_value_in='Producer')
  df = add_gender(df, col='crew', dict_key_in='job', dict_value_in='Director')
  df = add_gender(df, col='cast', dict_key_in='order', dict_value_in=0)
  df = add_gender(df, col='cast', dict_key_in='order', dict_value_in=1)
  df = extract_common(df, col='cast', limit=15)
  df = extract_common(df, col='crew', limit=15)

  # fill missing values 1
  print('fill missing values 1...')
  df['cast_size'] = np.where(df['cast_size'] == 0, df['cast_size'].mean(), df['cast_size'])
  df['crew_size'] = np.where(df['crew_size'] == 0, df['crew_size'].mean(), df['crew_size'])
  
  # add datepart
  print('add datepart...')
  df = add_datepart(df,col = 'release_date') 

  # split for data leak prevention
  print('split for data leak prevention and run main...')
  train = df[df['train_test'] == 'train']
  test = df[df['train_test'] == 'test']

  train = last_year_metrics(train)
  train = days_diff_from_last_movie(train)
  train['revenue'] = np.where(train['revenue'] == 0, train['last_period_median_revenue'], train['revenue'])
  train['revenue'] = np.where(train['revenue'] == 0, train["revenue"].mean(), train['revenue'])
  train = extract_common_overview_words(train, col='overview')

  test = last_year_metrics(test)
  test = days_diff_from_last_movie(test)
  test['revenue'] = np.where(test['revenue'] == 0, test['last_period_median_revenue'], test['revenue'])
  test['revenue'] = np.where(test['revenue'] == 0, test["revenue"].mean(), test['revenue'])
  test = extract_common_overview_words(test, col='overview')

  df = pd.concat([train, test],axis=0, sort=True)

  # fill missing values 2
  print('fill missing values 2...')
  df['budget'] = np.where(df['budget'] == 0, df['last_period_median_budget'], df['budget'])
  df['budget'] = np.where(df['budget'] == 0, df["budget"].mean(), df['budget'])
  df['production_companies_size'] = np.where(df['production_companies_size'] == 0, df["production_companies_size"].mean(), df['production_companies_size'])
  df['production_countries_size'] = np.where(df['production_countries_size'] == 0, df["production_countries_size"].mean(), df['production_countries_size'])
  df['Keywords_size'] = np.where(df['Keywords_size'] == 0, df["Keywords_size"].mean(), df['Keywords_size'])

  # run ratios
  print('run ratios...')
  df = ratios(df)

  # fill missing values 3
  print('fill missing values 3...')
  df = df.replace([np.inf, -np.inf], np.nan)
  df['budget_to_runtime'] = np.where(df['budget_to_runtime'] == np.nan, df["budget_to_runtime"].mean(), df['budget_to_runtime'])
  df['belongs_to_collection_bool'] = np.where(df['belongs_to_collection'].isna(), 0, 1)
  df.drop(['belongs_to_collection'], axis=1, inplace=True)

  # counting the numbers of words in the movie title 
  print('counting the numbers of words in the movie title..')
  df['original_title_words'] = df.original_title.str.split()
  df['original_title_words'] = df['original_title_words'].apply(lambda x: len(x))

  df = pd.get_dummies(df, prefix=['job_Producer_gender', 'job_Director_gender', 'order_0_gender', 'order_1_gender'], 
                          columns=['job_Producer_gender', 'job_Director_gender', 'order_0_gender', 'order_1_gender'])

  # removing highly correlated features
  print('removing highly correlated features...')
  df.drop(["crew_Bob Weinstein", "release_date_Year"], inplace=True, axis=1)

  # normalize features
  scaler = preprocessing.MinMaxScaler()
  df["budget_to_popularity"] = scaler.fit_transform(df[["budget_to_popularity"]])

  return df

df = run_main(train, test)
df.head()

working on full df..
extract_genres temp df shape:  (7398, 20)
extract_common temp df shape:  (7398, 30)
temp index:  RangeIndex(start=0, stop=7398, step=1)
extract_common temp df shape:  (7398, 20)
temp index:  RangeIndex(start=0, stop=7398, step=1)
extract_common temp df shape:  (7398, 15)
temp index:  RangeIndex(start=0, stop=7398, step=1)
extract_common temp df shape:  (7398, 15)
temp index:  RangeIndex(start=0, stop=7398, step=1)
extract_common temp df shape:  (7398, 15)
temp index:  RangeIndex(start=0, stop=7398, step=1)
extract_common temp df shape:  (7398, 15)
temp index:  RangeIndex(start=0, stop=7398, step=1)
fill missing values 1...
add datepart...
split for data leak prevention and run main...




extract_common temp df shape:  (3000, 15)
temp index:  RangeIndex(start=0, stop=3000, step=1)
extract_common temp df shape:  (4398, 15)
temp index:  RangeIndex(start=0, stop=4398, step=1)
fill missing values 2...
run ratios...
fill missing values 3...
counting the numbers of words in the movie title..
removing highly correlated features...


Unnamed: 0,Keywords_aftercreditsstinger,Keywords_based on novel,Keywords_biography,Keywords_duringcreditsstinger,Keywords_dystopia,Keywords_friendship,Keywords_independent film,Keywords_love,Keywords_murder,Keywords_police,Keywords_revenge,Keywords_sex,Keywords_size,Keywords_sport,Keywords_violence,Keywords_woman director,budget,cast_Bruce Willis,cast_Christopher Walken,cast_J.K. Simmons,cast_John Goodman,cast_Julianne Moore,cast_Liam Neeson,cast_Matt Damon,cast_Morgan Freeman,cast_Nicolas Cage,cast_Robert De Niro,cast_Robin Williams,cast_Samuel L. Jackson,cast_Steve Buscemi,cast_Sylvester Stallone,cast_Willem Dafoe,cast_size,crew_Avy Kaufman,crew_Deborah Aquila,crew_Francine Maisler,crew_Hans Zimmer,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_Jerry Goldsmith,crew_Kerry Barden,crew_Luc Besson,crew_Mary Vernieu,crew_Robert Rodriguez,crew_Steven Spielberg,crew_Tricia Wood,crew_size,days_diff_from_last_movie,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,genres_Foreign,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western,genres_size,homepage,id,imdb_id,key_0,last_period_mean_budget,last_period_mean_popularity,last_period_mean_revenue,last_period_median_budget,last_period_median_popularity,last_period_median_revenue,last_period_movies_count,original_language,original_title,overview_family,overview_father,overview_film,overview_finds,overview_friends,overview_help,overview_life,overview_love,overview_size,overview_story,overview_time,overview_wife,overview_woman,overview_world,overview_year,overview_years,overview_young,popularity,poster_path,production_companies_Amblin Entertainment,production_companies_BBC Films,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Dimension Films,production_companies_DreamWorks SKG,production_companies_Dune Entertainment,production_companies_Fox 2000 Pictures,production_companies_Fox Searchlight Pictures,production_companies_Hollywood Pictures,production_companies_Lionsgate,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Orion Pictures,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_StudioCanal,production_companies_Summit Entertainment,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_companies_Working Title Films,production_companies_size,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_Denmark,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of America,production_countries_size,release_date,release_date_Month,release_date_day_of_month,release_date_is_quarter_start,release_date_week_day,revenue,runtime,spoken_languages_,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Polski,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_size,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,status,tagline,title,train_test,budget_to_runtime,budget_to_cast_crew_size,budget_to_popularity,belongs_to_collection_bool,original_title_words,job_Producer_gender_0,job_Producer_gender_1,job_Producer_gender_2,job_Director_gender_0,job_Director_gender_1,job_Director_gender_2,order_0_gender_0,order_0_gender_1,order_0_gender_2,order_1_gender_0,order_1_gender_1,order_1_gender_2
1763,0,0,0,0,0,0,0,0,0,0,0,0,14.0,0,0,0,250000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,,1764,tt0012349,1764,,,,,,,,en,The Kid,0,0,0,0,0,0,0,0,28,0,0,0.0,0,1,,0,0,8.17,/drgMcyTsySQBnUPGaBThCHGdlWT.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1921-01-21,1.0,21.0,False,4.0,2500000.0,68.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,Released,6 reels of Joy.,The Kid,train,3676.47,5319.15,0.0,0,2,0,0,1,0,0,1,0,0,1,1,0,0
2992,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,1135654.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28.0,1152,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,5,,2993,tt0015400,2993,250000.0,8.17,2500000.0,250000.0,8.17,2500000.0,1.0,en,The Thief of Bagdad,0,0,1,0,0,0,0,0,9,0,0,0.0,0,0,,0,0,3.88,/a6IzXkwZRDimfn8HATzP6Pi6Ois.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1924-03-18,3.0,18.0,False,1.0,1213880.0,149.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,Released,"""Happiness Must Be Earned""",The Thief of Bagdad,train,7621.84,23176.61,0.0,0,4,1,0,0,0,0,1,1,0,0,0,0,1
1917,0,0,0,0,0,0,0,0,0,0,0,0,4.0,0,0,0,592.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17.0,526,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,2,,1918,tt0016104,1918,1135654.0,3.88,1213880.0,1135654.0,3.88,1213880.0,1.0,en,The Merry Widow,0,0,0,0,0,1,0,0,27,0,0,0.0,0,0,,1,1,0.29,/dCVkB0POblxtn3BegTNcwTPMKUP.jpg,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1925-08-26,8.0,26.0,False,2.0,1.0,137.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Released,,The Merry Widow,train,4.32,23.68,0.0,0,3,0,0,1,1,0,0,1,0,0,0,0,1
1361,0,0,0,0,0,0,0,0,0,0,0,0,13.0,0,1,0,1135654.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20.0,120,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,,1362,tt0015648,1362,1135654.0,3.88,1213880.0,1135654.0,3.88,1213880.0,1.0,ru,Броненосец «Потёмкин»,0,0,0,0,0,0,0,1,21,0,0,0.0,1,0,,0,0,12.91,/A5kk0FA4kS9sHLYzC6NI72OOhPc.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.0,1925-12-24,12.0,24.0,False,3.0,45100.0,75.0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,Released,"Revolution is the only lawful, equal, effectua...",Battleship Potemkin,train,15142.05,26410.56,0.0,0,2,0,0,1,0,0,1,0,0,1,1,0,0
2680,0,0,0,0,0,0,0,0,0,0,0,0,5.0,0,0,0,463455.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0,141,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,2681,tt0017423,2681,296.0,6.6,22550.5,296.0,6.6,22550.5,2.0,en,Sparrows,0,0,0,0,0,0,0,0,39,0,1,0.0,0,0,,0,0,0.45,/3ZPfhlwZ9HIYViUCgoZusCszohT.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1926-05-14,5.0,14.0,False,4.0,966878.0,84.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,Released,,Sparrows,train,5517.32,23172.75,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1


In [115]:
df.shape

(7398, 197)

In [0]:
#import pandas_profiling
#report = pandas_profiling.ProfileReport(df)
#report.to_file("your_report.html")

# Modeling

In [11]:
model_df = df.copy()
# model_df['id'] =  model_df.index
model_df = model_df.drop(["homepage", "imdb_id", "original_language", "original_title", "poster_path", "release_date", "status", "tagline", "title"], axis=1)

train = model_df[model_df['train_test'] == 'train']
test = model_df[model_df['train_test'] == 'test']

#train_y = train[['revenue']]
#test_y = test[['revenue']]
train = train.drop(['train_test'],axis=1)
test = test.drop(['train_test'] ,axis=1)

# scale_train = StandardScaler().fit(train)
# scale_df = scale_train.transform(train)
# train = pd.DataFrame(scale_df,columns=train.columns)#.merge(train_y, left_index=True, right_index=True)

# scale_test = StandardScaler().fit(test)
# scale_df = scale_test.transform(test)
# test = pd.DataFrame(scale_df,columns=test.columns)#.merge(test_y, left_index=True, right_index=True)


def score(data, y):
    validation_res = pd.DataFrame(
    {"id": data["id"].values,
     "transactionrevenue": data["revenue"].values,
     "predictedrevenue": np.expm1(y)}) 

    print(validation_res.head())

    validation_res = validation_res.groupby("id")["transactionrevenue", "predictedrevenue"].sum().reset_index()
    
    print(validation_res.head())
    return  np.sqrt(mean_squared_error((np.log1p(validation_res["transactionrevenue"].values)), #np.log1p
                                     (np.log1p(validation_res["predictedrevenue"].values)))) #np.log1p
    
class KFoldValidation():
    def __init__(self, data, n_splits=5):
        unique_vis = np.array(sorted(data['id'].astype(str).unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])
        
        self.fold_ids = []
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            self.fold_ids.append([
                    ids[data['id'].astype(str).isin(unique_vis[trn_vis])],
                    ids[data['id'].astype(str).isin(unique_vis[val_vis])]
                ])
            
    def validate(self, train, test, features, model, name="", prepare_stacking=False, 
                 fit_params={"early_stopping_rounds": 500, "verbose": 100, "eval_metric": "rmse"}):
        model.FI = pd.DataFrame(index=features)
        full_score = 0
        
        if prepare_stacking:
            test[name] = 0
            train[name] = np.NaN
        
        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = np.log1p(train["revenue"].iloc[trn])
            valid = train[features].iloc[val]
            y_valid = np.log1p(train["revenue"].iloc[val])
                       
            print("Fold ", fold_id, ":")
            model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)
            
            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict(valid)
            predictions[predictions < 0] = 0
            print("Fold ", fold_id, " error: ", mean_squared_error(y_valid, predictions)**0.5)
            
            fold_score = score(train.iloc[val], predictions)
            full_score += np.expm1(fold_score) / len(self.fold_ids)
            print('look at me:', len(self.fold_ids))
            print("Fold ", fold_id, " score: ", fold_score)
            
            if prepare_stacking:
                train[name].iloc[val] = np.expm1(predictions)
                
                test_predictions = model.predict(test[features])
                print(test_predictions)
                test_predictions[test_predictions < 0] = 0
                test[name] += np.expm1(test_predictions) / len(self.fold_ids)
                
        print("Final rmsle score: ", full_score)
        return full_score

test['revenue'].value_counts()

0.00    4396
Name: revenue, dtype: int64

## Models config

In [0]:
lgbmodel = lgb.LGBMRegressor(n_estimators=5000, 
                             objective='regression', 
                             metric='rmse',
                             max_depth = 25,
                             num_leaves=10, 
                             min_child_samples=100,
                             learning_rate=0.001,
                             boosting = 'gbdt',
                             min_data_in_leaf= 10,
                             feature_fraction = 0.2,
                             bagging_freq = 1,
                             bagging_fraction = 0.9,
                             importance_type='gain',
                             lambda_l1 = 0.3,
                             bagging_seed=42, 
                             subsample=.8, 
                             colsample_bytree=.9,
                             use_best_model=True)

xgbmodel = xgb.XGBRegressor(max_depth=10, 
                            learning_rate=0.01, 
                            n_estimators=5000, 
                            objective='reg:linear', 
                            gamma=1.45, 
                            seed=42, 
                            silent=True,
                            subsample=0.7, 
                            colsample_bytree=0.8, 
                            colsample_bylevel=0.50)

catmodel = cat.CatBoostRegressor(iterations=5000, 
                                 learning_rate=0.01, 
                                 depth=6,
                                 loss_function = "RMSE",
                                 boost_from_average = True,
                                 colsample_bylevel=0.8,
                                 bagging_temperature = 0.2,
                                 metric_period = None,
                                 random_seed=2345,
                                 l2_leaf_reg = 0.5)

In [0]:
Kfolder = KFoldValidation(train)

In [0]:
Kfolder.validate(train, test, train.columns.drop('revenue') , lgbmodel, name="lgbfinal", prepare_stacking=True) 

In [15]:
train.head()

Unnamed: 0,Keywords_aftercreditsstinger,Keywords_based on novel,Keywords_biography,Keywords_duringcreditsstinger,Keywords_dystopia,Keywords_friendship,Keywords_independent film,Keywords_love,Keywords_murder,Keywords_police,Keywords_revenge,Keywords_sex,Keywords_size,Keywords_sport,Keywords_violence,Keywords_woman director,budget,cast_Bruce Willis,cast_Christopher Walken,cast_J.K. Simmons,cast_John Goodman,cast_Julianne Moore,cast_Liam Neeson,cast_Matt Damon,cast_Morgan Freeman,cast_Nicolas Cage,cast_Robert De Niro,cast_Robin Williams,cast_Samuel L. Jackson,cast_Steve Buscemi,cast_Sylvester Stallone,cast_Willem Dafoe,cast_size,crew_Avy Kaufman,crew_Deborah Aquila,crew_Francine Maisler,crew_Hans Zimmer,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_Jerry Goldsmith,crew_Kerry Barden,crew_Luc Besson,crew_Mary Vernieu,crew_Robert Rodriguez,crew_Steven Spielberg,crew_Tricia Wood,crew_size,days_diff_from_last_movie,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,genres_Foreign,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western,genres_size,id,key_0,last_period_mean_budget,last_period_mean_popularity,last_period_mean_revenue,last_period_median_budget,last_period_median_popularity,last_period_median_revenue,last_period_movies_count,overview_family,overview_father,overview_film,overview_finds,overview_friends,overview_help,overview_life,overview_love,overview_size,overview_story,overview_time,overview_wife,overview_woman,overview_world,overview_year,overview_years,overview_young,popularity,production_companies_Amblin Entertainment,production_companies_BBC Films,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Dimension Films,production_companies_DreamWorks SKG,production_companies_Dune Entertainment,production_companies_Fox 2000 Pictures,production_companies_Fox Searchlight Pictures,production_companies_Hollywood Pictures,production_companies_Lionsgate,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Orion Pictures,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_StudioCanal,production_companies_Summit Entertainment,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_companies_Working Title Films,production_companies_size,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_Denmark,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of America,production_countries_size,release_date_Month,release_date_day_of_month,release_date_is_quarter_start,release_date_week_day,revenue,runtime,spoken_languages_,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Polski,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_size,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,budget_to_runtime,budget_to_cast_crew_size,budget_to_popularity,belongs_to_collection_bool,original_title_words,job_Producer_gender_0,job_Producer_gender_1,job_Producer_gender_2,job_Director_gender_0,job_Director_gender_1,job_Director_gender_2,order_0_gender_0,order_0_gender_1,order_0_gender_2,order_1_gender_0,order_1_gender_1,order_1_gender_2,lgbfinal
1763,0,0,0,0,0,0,0,0,0,0,0,0,14.0,0,0,0,250000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1764,1764,,,,,,,,0,0,0,0,0,0,0,0,28,0,0,0.0,0,1,,0,0,8.17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1.0,21.0,False,4.0,2500000.0,68.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3676.47,5319.15,0.0,0,2,0,0,1,0,0,1,0,0,1,1,0,0,6044964.03
2992,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,1135654.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28.0,1152,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,5,2993,2993,250000.0,8.17,2500000.0,250000.0,8.17,2500000.0,1.0,0,0,1,0,0,0,0,0,9,0,0,0.0,0,0,,0,0,3.88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,3.0,18.0,False,1.0,1213880.0,149.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,7621.84,23176.61,0.0,0,4,1,0,0,0,0,1,1,0,0,0,0,1,5802820.45
1917,0,0,0,0,0,0,0,0,0,0,0,0,4.0,0,0,0,592.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17.0,526,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,2,1918,1918,1135654.0,3.88,1213880.0,1135654.0,3.88,1213880.0,1.0,0,0,0,0,0,1,0,0,27,0,0,0.0,0,0,,1,1,0.29,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,8.0,26.0,False,2.0,1.0,137.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.32,23.68,0.0,0,3,0,0,1,1,0,0,1,0,0,0,0,1,1309716.58
1361,0,0,0,0,0,0,0,0,0,0,0,0,13.0,0,1,0,1135654.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20.0,120,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,1362,1362,1135654.0,3.88,1213880.0,1135654.0,3.88,1213880.0,1.0,0,0,0,0,0,0,0,1,21,0,0,0.0,1,0,,0,0,12.91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.0,12.0,24.0,False,3.0,45100.0,75.0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,15142.05,26410.56,0.0,0,2,0,0,1,0,0,1,0,0,1,1,0,0,5666815.2
2680,0,0,0,0,0,0,0,0,0,0,0,0,5.0,0,0,0,463455.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0,141,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2681,2681,296.0,6.6,22550.5,296.0,6.6,22550.5,2.0,0,0,0,0,0,0,0,0,39,0,1,0.0,0,0,,0,0,0.45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,5.0,14.0,False,4.0,966878.0,84.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,5517.32,23172.75,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,3281830.46


In [0]:
Kfolder.validate(train, test, train.columns.drop('revenue'), xgbmodel, name="xgbfinal", prepare_stacking=True)

In [28]:
Kfolder.validate(train, test, train.columns.drop('revenue') , catmodel, name="catfinal", prepare_stacking=True,
               fit_params={"use_best_model": True, "verbose": 100})

Fold  0 :
0:	learn: 3.0243532	test: 3.1393015	best: 3.1393015 (0)	total: 5.85ms	remaining: 29.2s
100:	learn: 2.2893893	test: 2.4853477	best: 2.4853477 (100)	total: 474ms	remaining: 23s
200:	learn: 2.0680825	test: 2.3469695	best: 2.3469695 (200)	total: 966ms	remaining: 23.1s
300:	learn: 1.9679173	test: 2.3136256	best: 2.3136256 (300)	total: 1.47s	remaining: 23s
400:	learn: 1.9016719	test: 2.3046641	best: 2.3042921 (399)	total: 1.97s	remaining: 22.6s
500:	learn: 1.8461659	test: 2.2999709	best: 2.2997971 (497)	total: 2.48s	remaining: 22.3s
600:	learn: 1.7979948	test: 2.2997460	best: 2.2994934 (591)	total: 2.97s	remaining: 21.8s
700:	learn: 1.7517704	test: 2.2990332	best: 2.2989876 (625)	total: 3.44s	remaining: 21.1s
800:	learn: 1.7056031	test: 2.3008913	best: 2.2989588 (705)	total: 3.94s	remaining: 20.6s
900:	learn: 1.6577296	test: 2.3045235	best: 2.2989588 (705)	total: 4.51s	remaining: 20.5s
1000:	learn: 1.6117726	test: 2.3073433	best: 2.2989588 (705)	total: 4.97s	remaining: 19.8s
1100:	

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


0:	learn: 3.1037692	test: 2.8136194	best: 2.8136194 (0)	total: 5.04ms	remaining: 25.2s
100:	learn: 2.3669886	test: 2.1446551	best: 2.1446551 (100)	total: 561ms	remaining: 27.2s
200:	learn: 2.1484033	test: 2.0013886	best: 2.0013886 (200)	total: 1.08s	remaining: 25.7s
300:	learn: 2.0509047	test: 1.9738483	best: 1.9738483 (300)	total: 1.58s	remaining: 24.6s
400:	learn: 1.9865393	test: 1.9638266	best: 1.9638266 (400)	total: 2.11s	remaining: 24.2s
500:	learn: 1.9349094	test: 1.9649157	best: 1.9636223 (416)	total: 2.63s	remaining: 23.6s
600:	learn: 1.8859028	test: 1.9650302	best: 1.9636223 (416)	total: 3.13s	remaining: 22.9s
700:	learn: 1.8425455	test: 1.9644414	best: 1.9634475 (685)	total: 3.63s	remaining: 22.3s
800:	learn: 1.7958808	test: 1.9646689	best: 1.9631238 (776)	total: 4.11s	remaining: 21.6s
900:	learn: 1.7469002	test: 1.9655084	best: 1.9631238 (776)	total: 4.63s	remaining: 21.1s
1000:	learn: 1.6972244	test: 1.9654776	best: 1.9631238 (776)	total: 5.13s	remaining: 20.5s
1100:	learn:

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


0:	learn: 3.0560533	test: 3.0141416	best: 3.0141416 (0)	total: 15.2ms	remaining: 1m 15s
100:	learn: 2.3277766	test: 2.3417276	best: 2.3417276 (100)	total: 498ms	remaining: 24.2s
200:	learn: 2.1131573	test: 2.1869258	best: 2.1869258 (200)	total: 997ms	remaining: 23.8s
300:	learn: 2.0154696	test: 2.1455712	best: 2.1455712 (300)	total: 1.48s	remaining: 23s
400:	learn: 1.9516721	test: 2.1354063	best: 2.1352947 (398)	total: 1.95s	remaining: 22.4s
500:	learn: 1.8997099	test: 2.1303090	best: 2.1300936 (487)	total: 2.41s	remaining: 21.7s
600:	learn: 1.8497482	test: 2.1284275	best: 2.1280282 (591)	total: 2.89s	remaining: 21.2s
700:	learn: 1.8048022	test: 2.1250188	best: 2.1250188 (700)	total: 3.36s	remaining: 20.6s
800:	learn: 1.7569443	test: 2.1267078	best: 2.1249301 (707)	total: 3.83s	remaining: 20.1s
900:	learn: 1.7057827	test: 2.1278486	best: 2.1249301 (707)	total: 4.37s	remaining: 19.9s
1000:	learn: 1.6556788	test: 2.1289317	best: 2.1249301 (707)	total: 4.84s	remaining: 19.3s
1100:	learn: 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


0:	learn: 3.0635225	test: 2.9840894	best: 2.9840894 (0)	total: 3.91ms	remaining: 19.6s
100:	learn: 2.3642738	test: 2.2155941	best: 2.2155941 (100)	total: 495ms	remaining: 24s
200:	learn: 2.1646765	test: 2.0328650	best: 2.0328650 (200)	total: 971ms	remaining: 23.2s
300:	learn: 2.0654714	test: 1.9781331	best: 1.9781331 (300)	total: 1.56s	remaining: 24.4s
400:	learn: 1.9941722	test: 1.9565649	best: 1.9565649 (400)	total: 2.09s	remaining: 23.9s
500:	learn: 1.9357570	test: 1.9481078	best: 1.9477931 (467)	total: 2.59s	remaining: 23.3s
600:	learn: 1.8837730	test: 1.9458889	best: 1.9453846 (560)	total: 3.08s	remaining: 22.5s
700:	learn: 1.8318007	test: 1.9423904	best: 1.9423904 (700)	total: 3.64s	remaining: 22.3s
800:	learn: 1.7821418	test: 1.9411958	best: 1.9409183 (764)	total: 4.11s	remaining: 21.6s
900:	learn: 1.7304960	test: 1.9418728	best: 1.9393696 (865)	total: 4.58s	remaining: 20.9s
1000:	learn: 1.6796761	test: 1.9442630	best: 1.9393696 (865)	total: 5.15s	remaining: 20.6s
1100:	learn: 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


0:	learn: 2.9886001	test: 3.2779173	best: 3.2779173 (0)	total: 5.13ms	remaining: 25.6s
100:	learn: 2.2392974	test: 2.9284652	best: 2.9262504 (94)	total: 494ms	remaining: 24s
200:	learn: 2.0331872	test: 2.8952125	best: 2.8938587 (199)	total: 981ms	remaining: 23.4s
300:	learn: 1.9430702	test: 2.8909018	best: 2.8902196 (299)	total: 1.45s	remaining: 22.7s
400:	learn: 1.8839128	test: 2.8752489	best: 2.8752489 (400)	total: 1.92s	remaining: 22s
500:	learn: 1.8351229	test: 2.8781798	best: 2.8717922 (444)	total: 2.43s	remaining: 21.9s
600:	learn: 1.7883508	test: 2.8782602	best: 2.8717922 (444)	total: 2.94s	remaining: 21.5s
700:	learn: 1.7445003	test: 2.8770911	best: 2.8717922 (444)	total: 3.4s	remaining: 20.9s
800:	learn: 1.6971227	test: 2.8763503	best: 2.8717922 (444)	total: 3.88s	remaining: 20.3s
900:	learn: 1.6478873	test: 2.8736116	best: 2.8688162 (870)	total: 4.34s	remaining: 19.7s
1000:	learn: 1.5988924	test: 2.8721508	best: 2.8688162 (870)	total: 4.83s	remaining: 19.3s
1100:	learn: 1.550

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


8.545596220467129

# Model Stacking

In [77]:
test.head(5)

Unnamed: 0,Keywords_aftercreditsstinger,Keywords_based on novel,Keywords_biography,Keywords_duringcreditsstinger,Keywords_dystopia,Keywords_friendship,Keywords_independent film,Keywords_love,Keywords_murder,Keywords_police,Keywords_revenge,Keywords_sex,Keywords_size,Keywords_sport,Keywords_violence,Keywords_woman director,budget,cast_Bruce Willis,cast_Christopher Walken,cast_J.K. Simmons,cast_John Goodman,cast_Julianne Moore,cast_Liam Neeson,cast_Matt Damon,cast_Morgan Freeman,cast_Nicolas Cage,cast_Robert De Niro,cast_Robin Williams,cast_Samuel L. Jackson,cast_Steve Buscemi,cast_Sylvester Stallone,cast_Willem Dafoe,cast_size,crew_Avy Kaufman,crew_Deborah Aquila,crew_Francine Maisler,crew_Hans Zimmer,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_Jerry Goldsmith,crew_Kerry Barden,crew_Luc Besson,crew_Mary Vernieu,crew_Robert Rodriguez,crew_Steven Spielberg,crew_Tricia Wood,crew_size,days_diff_from_last_movie,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,genres_Foreign,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western,genres_size,id,last_period_mean_budget,last_period_mean_popularity,last_period_mean_revenue,last_period_median_budget,last_period_median_popularity,last_period_median_revenue,last_period_movies_count,overview_family,overview_father,overview_film,overview_finds,overview_friends,overview_help,overview_life,overview_love,overview_size,overview_story,overview_time,overview_wife,overview_woman,overview_world,overview_year,overview_years,overview_young,popularity,production_companies_Amblin Entertainment,production_companies_BBC Films,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Dimension Films,production_companies_DreamWorks SKG,production_companies_Dune Entertainment,production_companies_Fox 2000 Pictures,production_companies_Fox Searchlight Pictures,production_companies_Hollywood Pictures,production_companies_Lionsgate,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Orion Pictures,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_StudioCanal,production_companies_Summit Entertainment,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_companies_Working Title Films,production_companies_size,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_Denmark,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of America,production_countries_size,release_date_Month,release_date_day_of_month,release_date_is_quarter_start,release_date_week_day,revenue,runtime,spoken_languages_,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Polski,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_size,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,budget_to_runtime,budget_to_cast_crew_size,budget_to_popularity,belongs_to_collection_bool,original_title_words,job_Producer_gender_0,job_Producer_gender_1,job_Producer_gender_2,job_Director_gender_0,job_Director_gender_1,job_Director_gender_2,order_0_gender_0,order_0_gender_1,order_0_gender_2,order_1_gender_0,order_1_gender_1,order_1_gender_2,lgbfinal
2307,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,250000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,5308,,,,,,,,0,0,0,0,0,0,0,0,9,0,0,,0,0,0.0,0,0,0.21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,8.0,1.0,False,3.0,,93.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2688.17,19230.77,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,2534557.2
3193,0,0,0,0,0,0,0,0,0,0,0,0,3.0,0,0,0,1100000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22.0,1259,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6194,250000.0,0.21,0.0,250000.0,0.21,0.0,1.0,0,0,0,0,0,0,0,0,56,0,0,,0,0,0.0,0,0,0.84,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1.0,11.0,False,2.0,0.0,117.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,9401.71,26190.48,0.0,0,2,0,0,1,1,0,0,0,0,1,1,0,0,3609282.37
2925,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,1100000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13.0,445,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,5926,1100000.0,0.84,0.0,1100000.0,0.84,0.0,1.0,0,0,0,0,0,0,0,0,11,0,0,,0,0,0.0,0,0,4.39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,4.0,1.0,True,6.0,0.0,70.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,15714.29,30555.56,0.0,0,2,0,0,1,1,0,0,0,0,1,0,1,0,3740540.77
3007,0,0,0,0,0,0,0,1,0,0,0,0,12.0,0,0,0,923000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20.0,816,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,6008,0.0,4.39,0.0,0.0,4.39,0.0,1.0,0,0,0,0,0,0,0,1,18,0,0,,0,0,0.0,0,0,6.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,6.0,25.0,False,3.0,0.0,95.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,9715.79,15383.33,0.0,0,3,0,0,1,0,0,1,0,0,1,0,1,0,7204908.27
4013,0,0,0,0,0,0,0,0,0,0,0,0,3.0,0,0,0,103000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10.0,83,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,3,7014,0.0,4.39,0.0,0.0,4.39,0.0,1.0,0,0,0,0,0,0,0,0,16,0,0,,0,0,0.0,0,0,1.35,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,9.0,16.0,False,2.0,0.0,86.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1197.67,2861.11,0.0,0,3,0,0,1,0,0,1,0,0,1,0,1,0,3572980.14


In [0]:
test['PredictedLogRevenue'] = 0.4 * test["lgbfinal"] + \
                               0.2 * test["xgbfinal"] + \
                               0.4 * test["catfinal"]



score(test, test.PredictedLogRevenue)

In [0]:
test.head(12)

In [0]:
submission = test[['id','PredictedLogRevenue']]
submission.to_csv('sub.csv')

# Charts 

In [0]:
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

plt.figure(figsize=(20,7))
lgb.plot_importance(lgbmodel,figsize=(10,40))
plt.yticks(fontsize=20)
plt.show()