In [0]:
# Clone the entire repo.
!git clone -l -s git://github.com/Idan707/Kaggle_TMDB_Box_Office_Prediction.git cloned-repo
%cd cloned-repo
# !ls

In [0]:
#pip install pandas==0.25
pip install catboost

# Import & Load

In [0]:
# imports
import pandas as pd
import  numpy as np
import ast
import json
import gensim
import datetime
import ast
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.preprocessing import MultiLabelBinarizer ,normalize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
import lightgbm as lgb
import catboost as cat
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GroupKFold
import xgboost as xgb

# Raise pandas' display limits (rows, columns, line width) and print floats with two decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

# Load both CSVs from the cloned repository (absolute path under /content);
# release_date is parsed into datetimes while reading.
train = pd.read_csv('/content/cloned-repo/Data/train.csv',
                    parse_dates=['release_date'])
test = pd.read_csv('/content/cloned-repo/Data/test.csv',
                    parse_dates=['release_date'])

# Date Fix
# Every release date whose year is later than 2017 is moved back by 100 years
# (presumably two-digit years in the CSV were parsed into the wrong century -- TODO confirm).
train['release_date'] = train['release_date'].mask(train['release_date'].dt.year > 2017, 
                                                   train['release_date'] - pd.offsets.DateOffset(years=100))
test['release_date'] = test['release_date'].mask(test['release_date'].dt.year > 2017, 
                                                   test['release_date'] - pd.offsets.DateOffset(years=100))

# Constant placeholder revenue for the test rows; run_main later overwrites zero revenues.
test['revenue'] = 0

# Shared binarizer used by extract_genres / extract_common, which refit it on each call.
mlb = MultiLabelBinarizer()

# Feature Engineering 

In [41]:
def convert_string_to_list(strVal):
    """Parse a stringified Python literal (e.g. a list of dicts) into the real object.

    Any value that is not exactly a ``str`` (for instance a NaN cell) yields an
    empty list instead.
    """
    if type(strVal) is str:
        return ast.literal_eval(strVal)
    return []
    
def format_dict_column_and_extract_names(strVal, col="name"):
    """Return the ``col`` field of every dict in a stringified list of dicts.

    ``strVal`` is parsed with ``convert_string_to_list``, so a non-string cell
    produces an empty list.
    """
    return [entry[col] for entry in convert_string_to_list(strVal)]

def add_x_length_column(df, col):
    """Add a ``<col>_size`` column holding ``len()`` of each value in ``df[col]``.

    The frame is modified in place and also returned.
    """
    size_col = col + '_size'
    df[size_col] = df[col].apply(len)

    return df

def extract_genres(df):
    """Replace the raw ``genres`` column with a size column and one indicator column per genre.

    The stringified genre dicts are reduced to lists of names, their count is
    stored in ``genres_size``, and the module-level ``mlb`` binarizer is refit
    to produce the ``genres_<name>`` columns.
    """
    df['genres'] = df['genres'].apply(format_dict_column_and_extract_names)
    df = add_x_length_column(df, col='genres')

    # pop() removes the list column; only its binarized form is kept.
    encoded = mlb.fit_transform(df.pop('genres'))
    genre_columns = ['genres' + '_' + label for label in mlb.classes_]
    indicator_df = pd.DataFrame(encoded, columns=genre_columns, index=df.index)

    return pd.concat([df, indicator_df], axis=1)

def extract_common(df, col, limit, apply_dict_to_col=True):
    """Binarize a list-valued column, keeping only its ``limit`` most frequent items.

    Parameters
    ----------
    df : DataFrame holding ``col``; it gains ``<col>_size`` in place.
    col : name of the column to encode. It is removed from the result.
    limit : number of most frequent items that get their own indicator column.
    apply_dict_to_col : when True the cells are first converted from
        stringified lists of dicts to lists of names.

    Returns
    -------
    A new DataFrame with ``<col>_size`` and one ``<col>_<item>`` column per kept item.
    """
    if apply_dict_to_col == True:
        df[col] = df[col].apply(format_dict_column_and_extract_names)

    # Frequency of every item over all rows, most frequent first.
    item_counts = df[col].apply(pd.Series).stack().value_counts()
    item_counts = item_counts.sort_values(axis=0, ascending=False)
    kept_items = item_counts[:limit].keys()

    # The size column reflects the full list, so it is added before filtering.
    add_x_length_column(df, col)
    df[col] = df[col].apply(lambda items: [item for item in items if item in kept_items])

    encoded = mlb.fit_transform(df.pop(col))
    indicator_df = pd.DataFrame(encoded,
                                columns=[col + '_' + label for label in mlb.classes_],
                                index=df.index)

    return pd.concat([df, indicator_df], axis=1)

def last_year_metrics(df, agg_col='release_date_Year', calc_col=['revenue','imdb_id','popularity','budget'], leg=1):
    """Attach lagged per-period aggregates to every row.

    Rows are grouped by ``agg_col``; for each group the mean/median revenue,
    the row count, and the mean/median popularity and budget are computed.
    The aggregated table is then shifted by ``leg`` rows, so each period
    receives the statistics of the period ``leg`` positions earlier in the
    sorted group order (not necessarily the previous calendar value when
    periods are missing). The earliest ``leg`` periods get NaN.

    Parameters
    ----------
    df : input DataFrame.
    agg_col : column to group and merge on.
    calc_col : four column names, in order: revenue, a column to count,
        popularity, budget. The list is only read, never modified.
    leg : number of periods to lag by.

    Returns
    -------
    ``df`` left-merged with the seven ``last_period_*`` columns.
    """
    per_period = df.groupby([agg_col]).agg(
        last_period_mean_revenue=(calc_col[0], 'mean'),
        last_period_median_revenue=(calc_col[0], 'median'),
        last_period_movies_count=(calc_col[1], 'count'),
        last_period_mean_popularity=(calc_col[2], 'mean'),
        last_period_median_popularity=(calc_col[2], 'median'),
        last_period_mean_budget=(calc_col[3], 'mean'),
        last_period_median_budget=(calc_col[3], 'median'))
    lagged = per_period.shift(leg).reset_index()

    # Merge on the grouping column itself so a non-default agg_col works too.
    return pd.merge(df, lagged, on=[agg_col], how='left')

def days_diff_from_last_movie(df, col='release_date'):
    """Add ``days_diff_from_last_movie``: whole days since the previous row's ``col`` date.

    A copy of ``df`` sorted by ``col`` is returned; the input frame is not
    modified. The first row (and any row whose difference is undefined) gets 0.

    TODO: see if there is a need for buckets.
    """
    df = df.sort_values(by=[col])
    # Fill the undefined gaps with a zero Timedelta: filling a timedelta column
    # with the integer 0 is rejected by pandas >= 1.0.
    df['days_diff_from_last_movie'] = df[col].diff().fillna(pd.Timedelta(0)).dt.days

    return df

def ratios(df):
    """Add three budget ratio columns to ``df`` in place and return it.

    The new columns divide ``budget`` by runtime, by the combined cast and
    crew size, and by popularity respectively.
    """
    budget = df['budget']
    denominators = (
        ('budget_to_runtime', df['runtime']),
        ('budget_to_cast_crew_size', df['cast_size'] + df['crew_size']),
        ('budget_to_popularity', df['popularity']),
    )
    for ratio_name, denominator in denominators:
        df[ratio_name] = budget / denominator

    return df

def add_datepart(df,col ='release_date'):
    """Convert ``df[col]`` to datetime and add calendar feature columns in place.

    Adds ``<col>_Year``, ``<col>_Month``, ``<col>_day_of_month``,
    ``<col>_week_day`` and ``<col>_is_quarter_start``; missing values in the
    last one are replaced with False. Returns the same frame.
    """
    df[col] = pd.to_datetime(df[col])
    dates = df[col].dt

    calendar_parts = (
        ('_Year', dates.year),
        ('_Month', dates.month),
        ('_day_of_month', dates.day),
        ('_week_day', dates.dayofweek),
        ('_is_quarter_start', dates.is_quarter_start),
    )
    for suffix, values in calendar_parts:
        df[col + suffix] = values

    quarter_col = col + '_is_quarter_start'
    df[quarter_col] = df[quarter_col].fillna(False)

    return df

def preprocess(text):
    """Tokenize ``text`` with gensim and keep the informative tokens.

    A token is kept when it is not a gensim stop word and is longer than
    three characters. Token order is preserved.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [token
            for token in gensim.utils.simple_preprocess(text)
            if token not in stop_words and len(token) > 3]

def extract_common_overview_words(df, col):
    """Turn a free-text column into indicator columns for its 15 most common words.

    Missing text becomes an empty string, each cell is tokenized with
    ``preprocess`` and de-duplicated, and the resulting word lists are
    encoded by ``extract_common`` (which also adds ``<col>_size``).
    """
    token_lists = df[col].fillna('').astype(str).map(preprocess)
    # Each word counts at most once per row.
    df[col] = token_lists.map(lambda tokens: list(set(tokens)))

    return extract_common(df, col, limit=15, apply_dict_to_col=False)
  
def extract_gender(strVal, dict_key, dict_value):
    """Return the ``gender`` of the first entry whose ``dict_key`` equals ``dict_value``.

    ``strVal`` is a stringified list of dicts (parsed with
    ``convert_string_to_list``). None is returned when nothing matches or
    when the first match is not a dict.
    """
    entries = convert_string_to_list(strVal)

    matches = [entry for entry in entries if entry[dict_key] == dict_value]
    if not matches:
        return None

    first_match = matches[0]
    return first_match['gender'] if type(first_match) is dict else None
      
def add_gender(df, col, dict_key_in , dict_value_in):
    """Add ``<dict_key_in>_<dict_value_in>_gender`` to ``df`` in place and return it.

    The value comes from ``extract_gender`` applied to every cell of
    ``df[col]``; rows without a result are filled with 2 and the column is
    stored as int64.
    """
    gender_col = dict_key_in + '_' + str(dict_value_in) + '_gender'

    df[gender_col] = df[col].apply(extract_gender, args=[dict_key_in, dict_value_in])
    df[gender_col] = df[gender_col].fillna(2).astype('int64')

    return df

def _replace_zeros(df, col, replacement):
    """Overwrite the zeros of ``df[col]`` with ``replacement`` (a scalar or a same-length column)."""
    df[col] = np.where(df[col] == 0, replacement, df[col])
    return df


def _add_split_features(part):
    """Add the features that are computed separately for the train rows and for the test rows."""
    part = last_year_metrics(part)
    part = days_diff_from_last_movie(part)
    # Zero revenue falls back to the previous period's median, then to the overall mean.
    part = _replace_zeros(part, 'revenue', part['last_period_median_revenue'])
    part = _replace_zeros(part, 'revenue', part['revenue'].mean())
    part = extract_common_overview_words(part, col='overview')
    return part


def run_main(train, test):
    """Run the whole feature-engineering pipeline and return one combined frame.

    Steps, in order:
      1. Tag both inputs with a ``train_test`` column (added to the caller's
         frames in place) and concatenate them.
      2. Encode genres, production companies/countries, spoken languages,
         keywords, cast and crew; extract producer/director/lead-cast gender.
      3. Replace zero cast/crew sizes with the column mean and add date parts.
      4. Split by ``train_test`` and, per split, add lagged yearly metrics,
         days since the previous release, revenue zero-fills and overview
         word indicators; then concatenate the splits again.
      5. Fill zero budgets and size columns, add budget ratios, replace
         infinities, add the collection flag and title word count, one-hot
         encode the gender columns, drop two columns and min-max scale
         ``budget_to_popularity``.

    Parameters
    ----------
    train, test : raw DataFrames with the columns used above; ``test`` must
        already contain a ``revenue`` column.

    Returns
    -------
    DataFrame containing both splits, distinguishable by ``train_test``.
    """
    # Tag the rows so the two sets can be separated again later.
    test['train_test'] = 'test'
    train['train_test'] = 'train'

    # work on full df
    print('working on full df..')
    df = pd.concat([train, test], axis=0, sort=True)
    df = extract_genres(df)
    for col, limit in (('production_companies', 30),
                       ('production_countries', 20),
                       ('spoken_languages', 15),
                       ('Keywords', 15)):
        df = extract_common(df, col=col, limit=limit)

    # Gender must be read first: extract_common removes the raw cast/crew columns.
    df = add_gender(df, col='crew', dict_key_in='job', dict_value_in='Producer')
    df = add_gender(df, col='crew', dict_key_in='job', dict_value_in='Director')
    df = add_gender(df, col='cast', dict_key_in='order', dict_value_in=0)
    df = add_gender(df, col='cast', dict_key_in='order', dict_value_in=1)
    df = extract_common(df, col='cast', limit=15)
    df = extract_common(df, col='crew', limit=15)

    # fill missing values 1
    print('fill missing values 1...')
    for col in ('cast_size', 'crew_size'):
        df = _replace_zeros(df, col, df[col].mean())

    # add datepart
    print('add datepart...')
    df = add_datepart(df, col='release_date')

    # split for data leak prevention
    print('split for data leak prevention and run main...')
    train = _add_split_features(df[df['train_test'] == 'train'])
    test = _add_split_features(df[df['train_test'] == 'test'])

    df = pd.concat([train, test], axis=0, sort=True)

    # fill missing values 2
    print('fill missing values 2...')
    df = _replace_zeros(df, 'budget', df['last_period_median_budget'])
    for col in ('budget', 'production_companies_size', 'production_countries_size', 'Keywords_size'):
        df = _replace_zeros(df, col, df[col].mean())

    # run ratios
    print('run ratios...')
    df = ratios(df)

    # fill missing values 3
    print('fill missing values 3...')
    df = df.replace([np.inf, -np.inf], np.nan)
    # isna() is required here: a `== np.nan` comparison is always False,
    # so it would never select the missing ratios.
    df['budget_to_runtime'] = np.where(df['budget_to_runtime'].isna(),
                                       df['budget_to_runtime'].mean(),
                                       df['budget_to_runtime'])
    df['belongs_to_collection_bool'] = np.where(df['belongs_to_collection'].isna(), 0, 1)
    df = df.drop(['belongs_to_collection'], axis=1)

    # counting the numbers of words in the movie title
    print('counting the numbers of words in the movie title..')
    df['original_title_words'] = df['original_title'].str.split().apply(len)

    gender_cols = ['job_Producer_gender', 'job_Director_gender', 'order_0_gender', 'order_1_gender']
    df = pd.get_dummies(df, prefix=gender_cols, columns=gender_cols)

    # removing highly correlated features
    print('removing highly correlated features...')
    df = df.drop(["crew_Bob Weinstein", "release_date_Year"], axis=1)

    # normalize features
    scaler = preprocessing.MinMaxScaler()
    df["budget_to_popularity"] = scaler.fit_transform(df[["budget_to_popularity"]])

    return df

# Build the combined, feature-engineered frame (train and test rows together) and preview it.
# Note: run_main adds a 'train_test' column to the `train` and `test` frames passed in.
df = run_main(train, test)
df.head()

working on full df..
fill missing values 1...
add datepart...
split for data leak prevention and run main...




fill missing values 2...
run ratios...
fill missing values 3...
counting the numbers of words in the movie title..
removing highly correlated features...


Unnamed: 0,Keywords_aftercreditsstinger,Keywords_based on novel,Keywords_biography,Keywords_duringcreditsstinger,Keywords_dystopia,Keywords_friendship,Keywords_independent film,Keywords_love,Keywords_murder,Keywords_police,Keywords_revenge,Keywords_sex,Keywords_size,Keywords_sport,Keywords_violence,Keywords_woman director,budget,cast_Bruce Willis,cast_Christopher Walken,cast_J.K. Simmons,cast_John Goodman,cast_Julianne Moore,cast_Liam Neeson,cast_Matt Damon,cast_Morgan Freeman,cast_Nicolas Cage,cast_Robert De Niro,cast_Robin Williams,cast_Samuel L. Jackson,cast_Steve Buscemi,cast_Sylvester Stallone,cast_Willem Dafoe,cast_size,crew_Avy Kaufman,crew_Deborah Aquila,crew_Francine Maisler,crew_Hans Zimmer,crew_Harvey Weinstein,crew_James Horner,crew_James Newton Howard,crew_Jerry Goldsmith,crew_Kerry Barden,crew_Luc Besson,crew_Mary Vernieu,crew_Robert Rodriguez,crew_Steven Spielberg,crew_Tricia Wood,crew_size,days_diff_from_last_movie,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,genres_Foreign,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western,genres_size,homepage,id,imdb_id,last_period_mean_budget,last_period_mean_popularity,last_period_mean_revenue,last_period_median_budget,last_period_median_popularity,last_period_median_revenue,last_period_movies_count,original_language,original_title,overview_family,overview_father,overview_film,overview_finds,overview_friends,overview_help,overview_life,overview_love,overview_size,overview_story,overview_time,overview_wife,overview_woman,overview_world,overview_year,overview_years,overview_young,popularity,poster_path,production_companies_Amblin Entertainment,production_companies_BBC Films,production_companies_Canal+,production_companies_Columbia Pictures,production_companies_Columbia Pictures Corporation,production_companies_Dimension 
Films,production_companies_DreamWorks SKG,production_companies_Dune Entertainment,production_companies_Fox 2000 Pictures,production_companies_Fox Searchlight Pictures,production_companies_Hollywood Pictures,production_companies_Lionsgate,production_companies_Metro-Goldwyn-Mayer (MGM),production_companies_Miramax Films,production_companies_New Line Cinema,production_companies_Orion Pictures,production_companies_Paramount Pictures,production_companies_Regency Enterprises,production_companies_Relativity Media,production_companies_StudioCanal,production_companies_Summit Entertainment,production_companies_Touchstone Pictures,production_companies_TriStar Pictures,production_companies_Twentieth Century Fox Film Corporation,production_companies_United Artists,production_companies_Universal Pictures,production_companies_Village Roadshow Pictures,production_companies_Walt Disney Pictures,production_companies_Warner Bros.,production_companies_Working Title Films,production_companies_size,production_countries_Australia,production_countries_Belgium,production_countries_Canada,production_countries_China,production_countries_Denmark,production_countries_France,production_countries_Germany,production_countries_Hong Kong,production_countries_India,production_countries_Ireland,production_countries_Italy,production_countries_Japan,production_countries_Mexico,production_countries_Netherlands,production_countries_Russia,production_countries_South Korea,production_countries_Spain,production_countries_Sweden,production_countries_United Kingdom,production_countries_United States of 
America,production_countries_size,release_date,release_date_Month,release_date_day_of_month,release_date_is_quarter_start,release_date_week_day,revenue,runtime,spoken_languages_,spoken_languages_Deutsch,spoken_languages_English,spoken_languages_Español,spoken_languages_Français,spoken_languages_Italiano,spoken_languages_Polski,spoken_languages_Português,spoken_languages_Pусский,spoken_languages_size,spoken_languages_العربية,spoken_languages_हिन्दी,spoken_languages_广州话 / 廣州話,spoken_languages_日本語,spoken_languages_普通话,spoken_languages_한국어/조선말,status,tagline,title,train_test,budget_to_runtime,budget_to_cast_crew_size,budget_to_popularity,belongs_to_collection_bool,original_title_words,job_Producer_gender_0,job_Producer_gender_1,job_Producer_gender_2,job_Director_gender_0,job_Director_gender_1,job_Director_gender_2,order_0_gender_0,order_0_gender_1,order_0_gender_2,order_1_gender_0,order_1_gender_1,order_1_gender_2
1763,0,0,0,0,0,0,0,0,0,0,0,0,14.0,0,0,0,250000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,,1764,tt0012349,,,,,,,,en,The Kid,0,0,0,1,0,0,0,0,28,0,0,0.0,0,0,,0,0,8.17,/drgMcyTsySQBnUPGaBThCHGdlWT.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1921-01-21,1.0,21.0,False,4.0,2500000.0,68.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,Released,6 reels of Joy.,The Kid,train,3676.47,5319.15,0.0,0,2,0,0,1,0,0,1,0,0,1,1,0,0
2992,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,1135654.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28.0,1152,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,5,,2993,tt0015400,250000.0,8.17,2500000.0,250000.0,8.17,2500000.0,1.0,en,The Thief of Bagdad,0,0,0,0,0,0,0,0,9,0,0,0.0,0,0,,0,0,3.88,/a6IzXkwZRDimfn8HATzP6Pi6Ois.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1924-03-18,3.0,18.0,False,1.0,1213880.0,149.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,Released,"""Happiness Must Be Earned""",The Thief of Bagdad,train,7621.84,23176.61,0.0,0,4,1,0,0,0,0,1,1,0,0,0,0,1
1917,0,0,0,0,0,0,0,0,0,0,0,0,4.0,0,0,0,592.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17.0,526,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,2,,1918,tt0016104,1135654.0,3.88,1213880.0,1135654.0,3.88,1213880.0,1.0,en,The Merry Widow,0,0,0,0,0,0,0,1,27,0,0,0.0,0,0,,0,0,0.29,/dCVkB0POblxtn3BegTNcwTPMKUP.jpg,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1925-08-26,8.0,26.0,False,2.0,1.0,137.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Released,,The Merry Widow,train,4.32,23.68,0.0,0,3,0,0,1,1,0,0,1,0,0,0,0,1
1361,0,0,0,0,0,0,0,0,0,0,0,0,13.0,0,1,0,1135654.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20.0,120,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,,1362,tt0015648,1135654.0,3.88,1213880.0,1135654.0,3.88,1213880.0,1.0,ru,Броненосец «Потёмкин»,0,0,1,0,0,0,0,0,21,0,0,0.0,0,0,,0,0,12.91,/A5kk0FA4kS9sHLYzC6NI72OOhPc.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.0,1925-12-24,12.0,24.0,False,3.0,45100.0,75.0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,Released,"Revolution is the only lawful, equal, effectua...",Battleship Potemkin,train,15142.05,26410.56,0.0,0,2,0,0,1,0,0,1,0,0,1,1,0,0
2680,0,0,0,0,0,0,0,0,0,0,0,0,5.0,0,0,0,463455.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.0,141,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,2681,tt0017423,296.0,6.6,22550.5,296.0,6.6,22550.5,2.0,en,Sparrows,0,0,0,0,0,0,0,0,39,0,0,0.0,0,0,,0,0,0.45,/3ZPfhlwZ9HIYViUCgoZusCszohT.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,1926-05-14,5.0,14.0,False,4.0,966878.0,84.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,Released,,Sparrows,train,5517.32,23172.75,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1


In [42]:
# (rows, columns) of the engineered frame.
df.shape

(7398, 197)

# Modeling

In [0]:
# Work on a copy so the engineered frame `df` is left untouched.
model_df = df.copy()
# model_df['id'] =  model_df.index
# Drop the columns that are not passed on to the models. 'id' is kept, so it is
# part of any feature list built from the remaining columns.
model_df = model_df.drop(["homepage", "imdb_id", "original_language", "original_title", "poster_path", "release_date", "status", "tagline", "title"], axis=1)

# Split back into the two sets using the 'train_test' tag.
# NOTE(review): this rebinds `train` and `test`, which previously held the raw CSV frames,
# so the feature-engineering cell cannot be re-run after this one without reloading the data.
train = model_df[model_df['train_test'] == 'train']
test = model_df[model_df['train_test'] == 'test']

#train_y = train[['revenue']]
#test_y = test[['revenue']]
# The tag column has served its purpose and is removed from both sets.
train = train.drop(['train_test'],axis=1)
test = test.drop(['train_test'] ,axis=1)

# scale_train = StandardScaler().fit(train)
# scale_df = scale_train.transform(train)
# train = pd.DataFrame(scale_df,columns=train.columns)#.merge(train_y, left_index=True, right_index=True)

# scale_test = StandardScaler().fit(test)
# scale_df = scale_test.transform(test)
# test = pd.DataFrame(scale_df,columns=test.columns)#.merge(test_y, left_index=True, right_index=True)


def score(data, y):
    """Return the RMSE between log1p(actual revenue) and log1p(predicted revenue) per id.

    Parameters
    ----------
    data : DataFrame with ``id`` and ``revenue`` columns.
    y : predictions on the log1p scale, one per row of ``data``; they are
        converted back with ``expm1`` before being summed per id.

    Returns
    -------
    float : root mean squared error of the log1p-transformed per-id sums.
    """
    validation_res = pd.DataFrame(
        {"id": data["id"].values,
         "transactionrevenue": data["revenue"].values,
         "predictedrevenue": np.expm1(y)})

    # Select the columns with a list: tuple selection on a GroupBy was
    # deprecated in pandas 1.0 and raises in pandas 2.0.
    validation_res = (validation_res
                      .groupby("id")[["transactionrevenue", "predictedrevenue"]]
                      .sum()
                      .reset_index())

    actual_log = np.log1p(validation_res["transactionrevenue"].values)
    predicted_log = np.log1p(validation_res["predictedrevenue"].values)
    return np.sqrt(mean_squared_error(actual_log, predicted_log))
    
class KFoldValidation():
    """K-fold cross-validation helper that splits rows by their ``id`` value.

    The folds are computed once in ``__init__`` and stored in ``fold_ids`` as
    ``[train_positions, validation_positions]`` pairs of positional row
    indices, so every model validated with the same instance sees the same
    splits.
    """

    def __init__(self, data, n_splits=5):
        """Build ``n_splits`` folds over the distinct ``id`` values of ``data``."""
        id_as_str = data['id'].astype(str)
        unique_vis = np.array(sorted(id_as_str.unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])

        self.fold_ids = []
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            # Translate the selected ids back into positional row indices.
            self.fold_ids.append([
                    ids[id_as_str.isin(unique_vis[trn_vis]).values],
                    ids[id_as_str.isin(unique_vis[val_vis]).values]
                ])

    def validate(self, train, test, features, model, name="", prepare_stacking=False,
                 fit_params=None):
        """Fit ``model`` on every fold and return the mean fold score.

        The target is ``log1p(train["revenue"])``; negative predictions are
        clipped to 0. Per-fold normalized feature importances are stored in
        ``model.FI``.

        Parameters
        ----------
        train, test : DataFrames containing ``features``; ``train`` must also
            contain ``revenue`` and ``id``.
        features : column names used as model inputs.
        model : estimator with ``fit(X, y, eval_set=..., **fit_params)``,
            ``predict`` and ``feature_importances_``.
        name : column that receives the predictions when ``prepare_stacking``.
        prepare_stacking : when True, out-of-fold predictions are written to
            ``train[name]`` and the fold-averaged test predictions to
            ``test[name]`` (both frames are modified in place).
        fit_params : extra keyword arguments for ``model.fit``. Defaults to
            early stopping after 500 rounds, verbose=100 and the rmse metric.

        Returns
        -------
        float : average of the per-fold ``score`` values.
        """
        if fit_params is None:
            fit_params = {"early_stopping_rounds": 500, "verbose": 100, "eval_metric": "rmse"}

        model.FI = pd.DataFrame(index=features)
        full_score = 0
        n_folds = len(self.fold_ids)

        if prepare_stacking:
            test[name] = 0.0
            train[name] = np.nan  # np.NaN was removed in NumPy 2.0

        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = np.log1p(train["revenue"].iloc[trn])
            valid = train[features].iloc[val]
            y_valid = np.log1p(train["revenue"].iloc[val])

            print("Fold ", fold_id, ":")
            model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)

            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict(valid)
            predictions[predictions < 0] = 0
            print("Fold ", fold_id, " error: ", mean_squared_error(y_valid, predictions)**0.5)

            fold_score = score(train.iloc[val], predictions)
            full_score += fold_score / n_folds
            print("Fold ", fold_id, " score: ", fold_score)

            if prepare_stacking:
                # Single positional assignment on the frame itself; the chained
                # form train[name].iloc[val] = ... may write to a temporary copy.
                train.iloc[val, train.columns.get_loc(name)] = predictions

                test_predictions = model.predict(test[features])
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / n_folds

        print("Final rmsle score: ", full_score)
        return full_score

## Models config

In [0]:
# LightGBM regressor.
# NOTE(review): several keyword pairs below are documented LightGBM aliases of one setting
# but carry different values: min_child_samples=100 vs min_data_in_leaf=10,
# subsample=.8 vs bagging_fraction=0.9, colsample_bytree=.9 vs feature_fraction=0.2.
# Which value takes effect is not obvious from this code -- confirm and keep one of each pair.
# NOTE(review): use_best_model looks like a CatBoost option; presumably it has no effect here -- verify.
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, 
                             objective='regression', 
                             metric='rmse',
                             max_depth = 5,
                             num_leaves=5, 
                             min_child_samples=100,
                             learning_rate=0.001,
                             boosting = 'gbdt',
                             min_data_in_leaf= 10,
                             feature_fraction = 0.2,
                             bagging_freq = 1,
                             bagging_fraction = 0.9,
                             importance_type='gain',
                             lambda_l1 = 0.2,
                             bagging_seed=42, 
                             subsample=.8, 
                             colsample_bytree=.9,
                             use_best_model=True)

# XGBoost regressor.
# NOTE(review): newer XGBoost releases deprecate objective='reg:linear' (renamed
# 'reg:squarederror') and the `silent` flag (replaced by `verbosity`) -- confirm against
# the installed version.
xgbmodel = xgb.XGBRegressor(max_depth=6, 
                            learning_rate=0.01, 
                            n_estimators=1000, 
                            objective='reg:linear', 
                            gamma=1.45, 
                            seed=42, 
                            silent=True,
                            subsample=0.7, 
                            colsample_bytree=0.8, 
                            colsample_bylevel=0.50)

# CatBoost regressor trained with the RMSE loss.
catmodel = cat.CatBoostRegressor(iterations=1000, 
                                 learning_rate=0.01, 
                                 depth=6,
                                 loss_function = "RMSE",
                                 boost_from_average = True,
                                 colsample_bylevel=0.8,
                                 bagging_temperature = 0.2,
                                 metric_period = None,
                                 random_seed=2345,
                                 l2_leaf_reg = 2.0)

In [0]:
# Compute the cross-validation folds once (default n_splits=5) so every model is
# validated on the same splits.
Kfolder = KFoldValidation(train)

In [0]:
# Cross-validate LightGBM on every column except the target. With prepare_stacking=True the
# predictions are stored in train/test under "lgbfinal".
Kfolder.validate(train, test, train.columns.drop('revenue') , lgbmodel, name="lgbfinal", prepare_stacking=True) 

In [0]:
# Cross-validate XGBoost; predictions are stored under "xgbfinal".
# NOTE(review): the feature list is rebuilt from train.columns, so once the LightGBM cell has
# run it also contains the "lgbfinal" prediction column -- confirm this is intended.
Kfolder.validate(train, test, train.columns.drop('revenue'), xgbmodel, name="xgbfinal", prepare_stacking=True)

In [0]:
# Cross-validate CatBoost with its own fit parameters; predictions are stored under "catfinal".
# NOTE(review): the feature list is rebuilt from train.columns, so it also contains any
# prediction columns ("lgbfinal", "xgbfinal") added by the earlier cells -- confirm this is intended.
Kfolder.validate(train, test, train.columns.drop('revenue') , catmodel, name="catfinal", prepare_stacking=True,
               fit_params={"use_best_model": True, "verbose": 100})

# Model Stacking

In [0]:
# Inspect the first 100 fold-averaged LightGBM predictions for the test rows.
test["lgbfinal"].head(100)

In [0]:
# Weighted blend of the three models' test predictions (weights 0.4 / 0.2 / 0.4).
# The values stay on the log1p scale the models were fitted on.
test['PredictedLogRevenue'] = 0.4 * test["lgbfinal"] + \
                               0.2 * test["xgbfinal"] + \
                               0.4 * test["catfinal"]



# NOTE(review): test['revenue'] is not a real target in this notebook -- it is set to 0 on
# load and then imputed inside run_main -- so this value is not an accuracy measure; confirm intent.
score(test, test.PredictedLogRevenue)

In [0]:
# Preview the test frame, now including the per-model and blended prediction columns.
test.head(12)

In [0]:
# The blended predictions are on the log1p scale the models were trained on, so they are
# converted back with expm1 and written as `revenue` next to `id` (the competition's
# submission columns). index=False keeps the frame's index out of the file.
submission = pd.DataFrame({'id': test['id'].values,
                           'revenue': np.expm1(test['PredictedLogRevenue'].values)})
submission.to_csv('sub.csv', index=False)

# Charts 

In [0]:
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

plt.figure(figsize=(20,7))
lgb.plot_importance(lgbmodel,figsize=(10,40))
plt.yticks(fontsize=20)
plt.show()