In [1]:
sub_file_name = "../submission/sub_180803_03.csv"
save_model_name = "../output/trained_model_180803_01.txt"
save_sub_train_df_name = "../output/sub_train_df.csv"
save_sub_test_df_name = "../output/sub_test_df.csv"

SEED = 1
isValid = False # if False , create train-test data for submission or train for submission 
isHyperOpt = False 
useTrainedModel = True 
split_date = 32

if isHyperOpt:
    isValid =True

In [2]:
### ver 180723
import time

import numpy as np
import pandas as pd
import os
import gc
from datetime import datetime
import copy
from itertools import product

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from math import sqrt
from io import StringIO
from sklearn.model_selection import KFold

import hyperopt
from hyperopt import hp, tpe, Trials, fmin

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge

#Viz
#import seaborn as sns
#import re
import matplotlib.pyplot as plt
%matplotlib inline
import string
from matplotlib.ticker import *

notebookstart = time.time()

In [3]:
for p in [np, pd, hyperopt, lgb]:
    print (p.__name__, p.__version__)

# My version
#numpy 1.14.3
#pandas 0.23.0
#hyperopt 0.1
#lightgbm 2.1.2
#To use hyperotp, networkx version should be networkx==1.11

numpy 1.14.3
pandas 0.23.0
hyperopt 0.1
lightgbm 2.1.2


### - Function

In [4]:
# Thanks You Guillaume Martin for the Awesome Memory Optimizer!
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage2(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                df[col] = df[col].astype(np.float16)
        else: df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### - DataLoad

In [5]:
train = pd.read_csv("../input/sales_train.csv")
train["date"] = train["date"].map(lambda x: datetime.strptime(x, '%d.%m.%Y'))
test = pd.read_csv("../input/test.csv", index_col = "ID")

sample_sub = pd.read_csv("../input/sample_submission.csv")
item_cate = pd.read_csv("../input/item_categories.csv")
items = pd.read_csv("../input/items.csv")
shops = pd.read_csv("../input/shops.csv")

### - Observation

In [6]:
train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [7]:
print("Train Shape: ", train.shape)
print("Test Shape: ", test.shape)
print("Sample Submission: ", sample_sub.shape)
print("Test/Train Ratio: {:.1f}%".format(test.shape[0]/train.shape[0] * 100))

Train Shape:  (2935849, 6)
Test Shape:  (214200, 2)
Sample Submission:  (214200, 2)
Test/Train Ratio: 7.3%


In [8]:
# Test item_id is a part of all item_id.
# Test shop_id is also.
# So, I think that it is good to use only test item_id & shop_id for validation.

ntrain_i = train["item_id"].nunique()
ntest_i = test["item_id"].nunique()
print("Train item_id: ", ntrain_i)
print("Test item_id: ", ntest_i)
print("Test /Train item_id Ratio: {:.2f}%".format(ntest_i/ntrain_i * 100))

ntrain_s = train["shop_id"].nunique()
ntest_s = test["shop_id"].nunique()
print("Train shop_id: ", ntrain_s)
print("Test shop_id: ", ntest_s)
print("Test /Train item_id Ratio: {:.2f}%".format(ntest_s/ntrain_s * 100))

Train item_id:  21807
Test item_id:  5100
Test /Train item_id Ratio: 23.39%
Train shop_id:  60
Test shop_id:  42
Test /Train item_id Ratio: 70.00%


In [9]:
print("Item Category Shape: ", item_cate.shape)
print("Items Shape: ", items.shape)
print("Shops: ", shops.shape)

Item Category Shape:  (84, 2)
Items Shape:  (22170, 3)
Shops:  (60, 2)


### - Item Category & Item

In [10]:
# Item_category_name can be divided into large(first) classification and small(second) classification.

item_cate["first_cate"] = item_cate["item_category_name"].map(lambda x:x.split(" - ")[0])
item_cate["second_cate"] = item_cate["item_category_name"].map(lambda x:x.split(" - ")[-1])
print("item_cate shape: ", item_cate.shape)
print("Number of First Category: ", item_cate["first_cate"].nunique())
print("NUmber of Second Category: ", item_cate["second_cate"].nunique())
item_cate.head()

item_cate shape:  (84, 4)
Number of First Category:  20
NUmber of Second Category:  67


Unnamed: 0,item_category_name,item_category_id,first_cate,second_cate
0,PC - Гарнитуры/Наушники,0,PC,Гарнитуры/Наушники
1,Аксессуары - PS2,1,Аксессуары,PS2
2,Аксессуары - PS3,2,Аксессуары,PS3
3,Аксессуары - PS4,3,Аксессуары,PS4
4,Аксессуары - PSP,4,Аксессуары,PSP


In [11]:
items = pd.merge(items, item_cate, how="left", on="item_category_id")

In [12]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,first_cate,second_cate
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,Кино,DVD
1,!ABBYY FineReader 12 Professional Edition Full...,1,76,Программы - Для дома и офиса (Цифра),Программы,Для дома и офиса (Цифра)
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,Кино,DVD
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,Кино,DVD
4,***КОРОБКА (СТЕКЛО) D,4,40,Кино - DVD,Кино,DVD


In [13]:
lbl = preprocessing.LabelEncoder()
for col in ["first_cate", "second_cate"]:
    items[col].fillna("Unknown")
    items[col] = lbl.fit_transform(items[col].astype(str))
items.drop(["item_name", "item_category_name"], axis=1, inplace=True)
gc.collect()
items.head()

Unnamed: 0,item_id,item_category_id,first_cate,second_cate
0,0,40,11,6
1,1,76,15,33
2,2,40,11,6
3,3,40,11,6
4,4,40,11,6


### - Shops

In [14]:
shops["city"] = shops["shop_name"].map(lambda x:x.split(" ")[0])
shops["store_form"] = shops["shop_name"].map(lambda x:x.split(" ")[1])
shops["shop_store_name"] = shops["shop_name"].map(lambda x:x.split(" ")[-1])

In [15]:
print("Shops shape: ", shops.shape)
print("Number of city: ", shops["city"].nunique())
print("NUmber of Store Form: ", shops["store_form"].nunique())
print("NUmber of shop_store_name: ", shops["shop_store_name"].nunique())

Shops shape:  (60, 5)
Number of city:  32
NUmber of Store Form:  14
NUmber of shop_store_name:  51


In [16]:
### I think that shop_id 10 and 11 is same name. So I made the name of shop_id 11 to shop_id 10.

print("Shop_id 10 Shop_name: ", shops.iloc[10]["shop_name"])
print("Shop_id 11 Shop_name: ", shops.iloc[11]["shop_name"])

print("Shop_id 10 Train: ", train[(train["shop_id"] == 10)].shape)
print("Shop_id 11 Train: ", train[(train["shop_id"] == 11)].shape)

print("Shop_id 10 Test: ", test[(test["shop_id"] == 10)].shape)
print("Shop_id 11 Test: ", test[(test["shop_id"] == 11)].shape)

Shop_id 10 Shop_name:  Жуковский ул. Чкалова 39м?
Shop_id 11 Shop_name:  Жуковский ул. Чкалова 39м²
Shop_id 10 Train:  (21397, 6)
Shop_id 11 Train:  (499, 6)
Shop_id 10 Test:  (5100, 2)
Shop_id 11 Test:  (0, 2)


In [17]:
lbl = preprocessing.LabelEncoder()
for col in ["city", "store_form", "shop_store_name"]:
    shops[col].fillna("Unknown")
    shops[col] = lbl.fit_transform(shops[col].astype(str))
shops.drop(["shop_name"], axis=1, inplace=True)
gc.collect()
shops.head()

Unnamed: 0,shop_id,city,store_form,shop_store_name
0,0,0,4,50
1,1,0,9,50
2,2,1,9,12
3,3,2,7,14
4,4,3,9,41


### - Train Group

In [18]:
train.drop(["date"], axis=1, inplace=True)
gc.collect()
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,0,59,22154,999.0,1.0
1,0,25,2552,899.0,1.0
2,0,25,2552,899.0,-1.0
3,0,25,2554,1709.05,1.0
4,0,25,2555,1099.0,1.0


In [19]:
train_gp1 = train.groupby(["date_block_num", "shop_id", "item_id"])

item_cnt_month_df = train_gp1.sum()
item_cnt_month_df.drop(["item_price"], axis=1, inplace=True)
item_cnt_month_df.rename(columns={"item_cnt_day":"item_cnt_month"}, inplace=True)
item_cnt_month_df.reset_index(inplace=True)

item_price_mean_df = train_gp1.mean()
item_price_mean_df.reset_index(inplace=True)
item_price_mean_df = item_price_mean_df[["date_block_num", "shop_id", "item_id", "item_price"]]

In [20]:
item_cnt_month_df.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,0,32,6.0
1,0,0,33,3.0
2,0,0,35,1.0
3,0,0,43,1.0
4,0,0,51,2.0


In [21]:
train_gp = item_cnt_month_df
train_gp = pd.merge(train_gp, item_price_mean_df, how="left", on=["date_block_num", "shop_id", "item_id"])
train_gp.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price
0,0,0,32,6.0,221.0
1,0,0,33,3.0,347.0
2,0,0,35,1.0,247.0
3,0,0,43,1.0,221.0
4,0,0,51,2.0,128.5


In [22]:
train_gp = reduce_mem_usage2(train_gp)

Memory usage of dataframe is 73.66 MB
Memory usage after optimization is: 24.55 MB
Decreased by 66.7%


# - Test 

In [23]:
test["date_block_num"] = 34
test["year"] = 2015
test["month"] = 11
test.head()

Unnamed: 0_level_0,shop_id,item_id,date_block_num,year,month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5,5037,34,2015,11
1,5,5320,34,2015,11
2,5,5233,34,2015,11
3,5,5232,34,2015,11
4,5,5268,34,2015,11


In [24]:
item_price_mean_df_test = item_price_mean_df
item_price_mean_df_test.sort_values(["date_block_num"], inplace=True)
print(item_price_mean_df_test.shape)

item_price_mean_df_test.drop_duplicates(subset=["shop_id", "item_id"], keep="last", inplace=True)
item_price_mean_df_test.drop(["date_block_num"], axis=1, inplace=True)
print(item_price_mean_df_test.shape)

(1609124, 4)
(424124, 3)


In [25]:
test = pd.merge(test, item_price_mean_df, how="left", on=["shop_id", "item_id"])
test = pd.merge(test, shops, how="left", on="shop_id")
test = pd.merge(test, items, how="left", on="item_id")
test = reduce_mem_usage2(test)

Memory usage of dataframe is 21.24 MB
Memory usage after optimization is: 4.70 MB
Decreased by 77.9%


### - All train shops items

In [26]:
train_all_v1 = False
if (train_all_v1):
    date_number = train["date_block_num"].max() + 1
    shop_number = shops["shop_id"].max() + 1
    item_number = items["item_id"].max() + 1

    train_all = pd.DataFrame({"total_id":pd.Series([i for i in range(date_number * shop_number * item_number)])})
    train_all["date_block_num"] = train_all["total_id"] // (shop_number * item_number)
    train_all["shop_id"] = train_all["total_id"] % (shop_number * item_number) // (item_number)
    train_all["item_id"] = train_all["total_id"] % item_number
    train_all["year"] = train_all["date_block_num"] // 12 + 2013
    train_all["month"] = train_all["date_block_num"] % 12 + 1
    train_all.drop(["total_id"], axis=1, inplace=True)

    train_all = reduce_mem_usage2(train_all)
    gc.collect()
    train_all = pd.merge(train_all, train_gp, how="left", on=["date_block_num", "shop_id", "item_id"])

In [27]:
train_all_v2 = True
if (train_all_v2):
    index_cols=["shop_id", "item_id", "date_block_num"]
    
    # For every month we create a grid from all shops/items combinations from that month
    grid = []
    for block_num in train["date_block_num"].unique():
        cur_shops = train[train["date_block_num"]==block_num]["shop_id"].unique()
        cur_items = train[train["date_block_num"]==block_num]["item_id"].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32"))
        
    #turn the grid into pandas dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols, dtype=np.int32)
    
    #get aggregated values for (shop_id, item_id, month)
    gb = train.groupby(index_cols, as_index=False).agg({"item_cnt_day":{"item_cnt_month":"sum"}})
    
    #fix column names
    gb.columns = [col[0] if col[-1] == "" else col[-1] for col in gb.columns.values]
    #join aggregated data to the grid
    train_all = pd.merge(grid, gb, how="left", on=index_cols).fillna(0)
    #sort the data
    train_all.sort_values(["date_block_num", "shop_id", "item_id"], inplace=True)
    
    train_all["year"] = train_all["date_block_num"] // 12 + 2013
    train_all["month"] = train_all["date_block_num"] % 12 + 1
    train_gp.drop(["item_cnt_month"], axis=1, inplace=True)
    train_all = pd.merge(train_all, train_gp, how="left", on=["date_block_num", "shop_id", "item_id"])

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [28]:
train_all.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,year,month,item_price
0,0,19,0,0.0,2013,1,
1,0,27,0,0.0,2013,1,
2,0,28,0,0.0,2013,1,
3,0,29,0,0.0,2013,1,
4,0,32,0,6.0,2013,1,221.0


In [29]:
train_all = pd.merge(train_all, shops, how="left", on="shop_id")
train_all = pd.merge(train_all, items, how="left", on="item_id")
train_all = reduce_mem_usage2(train_all)
gc.collect()

Memory usage of dataframe is 895.11 MB
Memory usage after optimization is: 260.21 MB
Decreased by 70.9%


90

### Feature Engnieering (Train_all)

#### fillna of item_cnt_month (Before Mean encoding)

In [30]:
train_all["item_cnt_month"].fillna(0, inplace=True)
train_all["item_cnt_month"] = train_all["item_cnt_month"].astype(np.int32)

#### -FE1- prev_sales

In [31]:
#prev_sales grouped by date_block_num & item_id & shop_id
try_this = True
if (try_this):
    for prev in [1, 2, 3, 4, 5]:
        prev_sales = train_all[["date_block_num", "shop_id", "item_id", "item_cnt_month"]]
        prev_sales["item_cnt_month"].clip(0, 20, inplace=True)
        prev_sales["date_block_num"] = prev_sales["date_block_num"] + prev
        prev_sales.rename(columns = {"item_cnt_month":"prev_month_cnt_dsi_"+str(prev)}, inplace=True)

        train_all = pd.merge(train_all, prev_sales, how="left", on=["date_block_num", "shop_id", "item_id"])
        test = pd.merge(test, prev_sales, how="left", on=["date_block_num", "shop_id", "item_id"])

        train_all["prev_month_cnt_dsi_"+str(prev)].fillna(0, inplace=True)
        test["prev_month_cnt_dsi_"+str(prev)].fillna(0, inplace=True)
        
        train_all["prev_month_cnt_dsi_"+str(prev)] = train_all["prev_month_cnt_dsi_"+str(prev)].astype(np.int32)  
        test["prev_month_cnt_dsi_"+str(prev)] = test["prev_month_cnt_dsi_"+str(prev)].astype(np.int32)  

        del prev_sales
        gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [32]:
#prev_sales grouped by date_block_num & item_id
try_this = True
if (try_this):
    for prev in [1, 2, 3, 4, 5]:
        prev_sales = train_all[["date_block_num", "shop_id", "item_id", "item_cnt_month"]]
        prev_sales["item_cnt_month"].clip(0.0, 20.0, inplace=True)
        prev_sales_gp1 = (prev_sales[["date_block_num", "item_id", "item_cnt_month"]]
                          .groupby(["date_block_num", "item_id"])
                         )

        prev_sales_di = (prev_sales_gp1
                         .sum()
                         .reset_index()
                        )
        prev_sales_di.rename(columns = {"item_cnt_month":"prev_month_cnt_di_"+str(prev)}, inplace=True)

        prev_sales_di["date_block_num"] = prev_sales_di["date_block_num"] + prev
        prev_sales_di["prev_month_cnt_di_"+str(prev)].clip(0.0, 400.0, inplace=True)

        train_all = pd.merge(train_all, prev_sales_di, how="left", on=["date_block_num", "item_id"])
        test = pd.merge(test, prev_sales_di, how="left", on=["date_block_num", "item_id"])

        train_all["prev_month_cnt_di_"+str(prev)].fillna(4, inplace=True) # 大体4が平均値
        test["prev_month_cnt_di_"+str(prev)].fillna(4, inplace=True) # 大体4が平均値

        train_all["prev_month_cnt_di_"+str(prev)] = train_all["prev_month_cnt_di_"+str(prev)].astype(np.int32)  
        test["prev_month_cnt_di_"+str(prev)] = test["prev_month_cnt_di_"+str(prev)].astype(np.int32)  
        
        del prev_sales, prev_sales_di
        gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(result)


In [33]:
#prev_sales grouped by date_block_num & shop_id
try_this = True
if (try_this):
    for prev in [1, 2, 3, 4, 5]:
        prev_sales = train_all[["date_block_num", "shop_id", "item_id", "item_cnt_month"]]
        prev_sales["item_cnt_month"].clip(0.0, 20.0, inplace=True)
        prev_sales_gp2 = (prev_sales[["date_block_num", "shop_id", "item_cnt_month"]]
                          .groupby(["date_block_num", "shop_id"])
                         )

        prev_sales_ds = (prev_sales_gp2
                         .sum()
                         .reset_index()
                        )
        prev_sales_ds.rename(columns = {"item_cnt_month":"prev_month_cnt_ds_"+str(prev)}, inplace=True)

        prev_sales_ds["date_block_num"] = prev_sales_ds["date_block_num"] + prev
        prev_sales_ds["prev_month_cnt_ds_"+str(prev)].clip(0.0, 4000.0, inplace=True)

        train_all = pd.merge(train_all, prev_sales_ds, how="left", on=["date_block_num", "shop_id"])
        test = pd.merge(test, prev_sales_ds, how="left", on=["date_block_num", "shop_id"])

        train_all["prev_month_cnt_ds_"+str(prev)].fillna(1140, inplace=True) # 大体1138が平均値
        test["prev_month_cnt_ds_"+str(prev)].fillna(1140, inplace=True) # 大体1138が平均値

        train_all["prev_month_cnt_ds_"+str(prev)] = train_all["prev_month_cnt_ds_"+str(prev)].astype(np.int32)  
        test["prev_month_cnt_ds_"+str(prev)] = test["prev_month_cnt_ds_"+str(prev)].astype(np.int32)  
        
        del prev_sales, prev_sales_ds
        gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(result)


In [34]:
#delta of prev_sales grouped by date_block_num & item_id
try_this = True
if (try_this):
    for prev in [1, 2, 3, 4]:
        train_all["delta_prev_month_cnt_di_"+str(prev)+"-"+str(prev+1)] = train_all["prev_month_cnt_di_"+str(prev)] - train_all["prev_month_cnt_di_"+str(prev+1)] 
        test["delta_prev_month_cnt_di_"+str(prev)+"-"+str(prev+1)] = test["prev_month_cnt_di_"+str(prev)] - test["prev_month_cnt_di_"+str(prev+1)] 
        
        train_all["delta_prev_month_cnt_di_"+str(prev)+"-"+str(prev+1)] = train_all["delta_prev_month_cnt_di_"+str(prev)+"-"+str(prev+1)].astype(np.int32)    
        test["delta_prev_month_cnt_di_"+str(prev)+"-"+str(prev+1)] = test["delta_prev_month_cnt_di_"+str(prev)+"-"+str(prev+1)].astype(np.int32)    

        gc.collect()

#### -FE2- Mean encoding (Advanced Features I: mean encodings)

In [35]:
# Finally I don't use this feature.
# Valid score improve. But Pablic score umimporve.
try_this = False
if (try_this):
    train_all["ME_item_mean"] = 0.3343 # average of item_cnt_month

    y_tr = train_all["item_cnt_month"].values
    y_tr = y_tr.clip(0.0, 20.0)
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for tr_ind, val_ind in kf.split(train_all, y_tr):
        X_tr, X_val = train_all.iloc[tr_ind], train_all.iloc[val_ind]
        means = X_val["item_id"].map(X_tr.groupby(["item_id"])["item_cnt_month"].mean())
        X_val["ME_item_mean"] = means
        train_all.iloc[val_ind] = X_val

    #Mean encoding for Valid Data    
    if (isValid):
        tr_ind = train_all[(train_all["date_block_num"] <= split_date)].index
        val_ind = train_all[(train_all["date_block_num"] > split_date)].index
        X_tr, X_val = train_all.iloc[tr_ind], train_all.iloc[val_ind]
        means = X_val["item_id"].map(X_tr.groupby(["item_id"])["ME_item_mean"].mean())
        X_val["ME_item_mean"] = means
        train_all.iloc[val_ind] = X_val

    #Mean encoding for Test Data   
    means = test["item_id"].map(train_all.groupby(["item_id"])["ME_item_mean"].mean())
    test["ME_item_mean"] = means

    train_all["ME_item_mean"].fillna(0.3343, inplace=True)
    test["ME_item_mean"].fillna(0.3343, inplace=True)

    del X_tr, X_val, y_tr, tr_ind, val_ind, means
    gc.collect()

#### -FE3- item_price (Advanced Features II)

In [36]:
if (isValid):
    train_for_gp = train[(train["date_block_num"] <= split_date)]
else:
    train_for_gp = train

# Std of item_price (Also Coefficient of variation)
item_price_std_srs = train_for_gp.groupby(["item_id"])["item_price"].std()
item_price_std_srs.name = "item_price_std"
train_all["item_price_std"] = train_all["item_id"].map(item_price_std_srs).fillna(0)
train_all["price_CoefVar"] = train_all["item_price_std"]/train_gp["item_price"]   
test["item_price_std"] = test["item_id"].map(item_price_std_srs).fillna(0)
test.loc[~(test["item_price"].isnull()), "price_CoefVar"] = test["item_price_std"]/test["item_price"]

# MaxPrice, MinPrice of item_price
item_price_max_srs = train_for_gp.groupby(["item_id"])["item_price"].max()
item_price_min_srs = train_for_gp.groupby(["item_id"])["item_price"].min()

train_all["max_price"] = train_all["item_id"].map(item_price_max_srs)
train_all["min_price"] = train_all["item_id"].map(item_price_min_srs)
test["max_price"] = test["item_id"].map(item_price_max_srs)
test["min_price"] = test["item_id"].map(item_price_min_srs)

#### -FE4- first_sell_day, last_sell_day

In [37]:
try_this = True
if (try_this):   
    train_sell_period = train.sort_values("date_block_num")
    if (isValid):
        train_sell_period = train_sell_period[(train_sell_period["date_block_num"] <= split_date )]
    
    first_sell_df = train_sell_period.drop_duplicates(keep="first", subset=["item_id"])
    first_sell_df = first_sell_df[["item_id", "date_block_num"]]
    first_sell_df.rename(columns={"date_block_num":"first_sell_month"}, inplace=True)
    first_sell_df["first_sell_month"] = first_sell_df["first_sell_month"].astype(np.int8)  

    last_sell_df = train_sell_period.drop_duplicates(keep="last", subset=["item_id"])
    last_sell_df = last_sell_df[["item_id", "date_block_num"]]
    last_sell_df.rename(columns={"date_block_num":"last_sell_month"}, inplace=True)
    last_sell_df["last_sell_month"] = last_sell_df["last_sell_month"].astype(np.int8)  

    train_all = pd.merge(train_all, first_sell_df, how="left", on="item_id")
    train_all = pd.merge(train_all, last_sell_df, how="left", on="item_id")
    test = pd.merge(test, first_sell_df, how="left", on="item_id")
    test = pd.merge(test, last_sell_df, how="left", on="item_id")

    train_all["first_last_period"] = train_all["last_sell_month"] - train_all["first_sell_month"]
    test["first_last_period"] = test["last_sell_month"] - test["first_sell_month"]

    del train_sell_period, first_sell_df, last_sell_df
    gc.collect()

### Fillna, log of item_price, del

In [38]:
train_all = reduce_mem_usage2(train_all)
test = reduce_mem_usage2(test)

Memory usage of dataframe is 1436.34 MB
Memory usage after optimization is: 718.17 MB
Decreased by 50.0%
Memory usage of dataframe is 31.66 MB
Memory usage after optimization is: 14.30 MB
Decreased by 54.8%


In [39]:
### Fill missing values
# https://www.kaggle.com/seiya1998/fill-missing-values-simple-method

def fill_missing_target(df, train_df, target, lis0):
    lis = copy.deepcopy(lis0) #local変数として対応
    print("*** "+target+" fillna ***")
    print("[Before FillNa] "+target+"_nan_sum : "+str(df[target].isnull().sum()))
    
    count = 0
    while 0 < len(lis):
        if df[target].isnull().sum() == 0:
            print("the number of NaN is 0.")
            break
            
        count += 1
        colname = "median_target"+str(count)
        print("groupby_"+",".join(lis)+"_median")
        tmp = train_df.groupby(lis)[[target]].median()
        tmp.reset_index(inplace = True)
        tmp.columns = lis + [colname]
        df = pd.merge(df, tmp, how='left', on=lis)
        df.loc[df[target].isnull(),target] = df.loc[df[target].isnull(),colname]

        del(df[colname])
        lis.pop()

        print(target+"_nan_sum : "+str(df[target].isnull().sum()))
    
    return df
###

In [40]:
print("--fillna Train--")
train_lis = ['item_category_id', 'item_id','date_block_num']#, 'shop_id']
train_all = fill_missing_target(train_all, train_all, "item_price", train_lis)

train_lis = ['item_category_id', 'item_id']
#train_all = fill_missing_target(train_all, train_all, "item_price_std", train_lis) # fillna(0)済
train_all = fill_missing_target(train_all, train_all, "price_CoefVar", train_lis)
train_all = fill_missing_target(train_all, train_all, "max_price", train_lis)
train_all = fill_missing_target(train_all, train_all, "min_price", train_lis)

print("--fillna Test--")
test_lis = ['item_category_id','item_id','shop_id']
test = fill_missing_target(test, train_all, "item_price", test_lis)
test = fill_missing_target(test, train_all, "item_price_std", test_lis)
test = fill_missing_target(test, train_all, "price_CoefVar", test_lis)
test = fill_missing_target(test, train_all, "max_price", test_lis)
test = fill_missing_target(test, train_all, "min_price", test_lis)

train_all["item_price"] = np.log(train_all["item_price"]+0.001)
test["item_price"] = np.log(test["item_price"]+0.001)

--fillna Train--
*** item_price fillna ***
[Before FillNa] item_price_nan_sum : 9304726
groupby_item_category_id,item_id,date_block_num_median
item_price_nan_sum : 0
the number of NaN is 0.
*** price_CoefVar fillna ***
[Before FillNa] price_CoefVar_nan_sum : 9304726
groupby_item_category_id,item_id_median
price_CoefVar_nan_sum : 3950302
groupby_item_category_id_median
price_CoefVar_nan_sum : 173691
*** max_price fillna ***
[Before FillNa] max_price_nan_sum : 0
the number of NaN is 0.
*** min_price fillna ***
[Before FillNa] min_price_nan_sum : 0
the number of NaN is 0.
--fillna Test--
*** item_price fillna ***
[Before FillNa] item_price_nan_sum : 102796
groupby_item_category_id,item_id,shop_id_median
item_price_nan_sum : 16102
groupby_item_category_id,item_id_median
item_price_nan_sum : 15246
groupby_item_category_id_median
item_price_nan_sum : 0
*** item_price_std fillna ***
[Before FillNa] item_price_std_nan_sum : 0
the number of NaN is 0.
*** price_CoefVar fillna ***
[Before FillNa]

In [41]:
train_all["item_cnt_month"].clip(0, 20, inplace=True)

In [42]:
del item_price_mean_df_test
del items, shops
gc.collect()

294

### Before Modeling State

In [43]:
train_all = reduce_mem_usage2(train_all)
test = reduce_mem_usage2(test)

Memory usage of dataframe is 718.17 MB
Memory usage after optimization is: 707.76 MB
Decreased by 1.4%
Memory usage of dataframe is 14.30 MB
Memory usage after optimization is: 14.30 MB
Decreased by 0.0%


In [44]:
print("Train")
print("Shape:", train_all.shape)
print("Columns:", train_all.columns)
print("Test")
print("Shape:", test.shape)
print("Columns:", test.columns)

Train
Shape: (10913850, 39)
Columns: Index(['shop_id', 'item_id', 'date_block_num', 'item_cnt_month', 'year',
       'month', 'item_price', 'city', 'store_form', 'shop_store_name',
       'item_category_id', 'first_cate', 'second_cate', 'prev_month_cnt_dsi_1',
       'prev_month_cnt_dsi_2', 'prev_month_cnt_dsi_3', 'prev_month_cnt_dsi_4',
       'prev_month_cnt_dsi_5', 'prev_month_cnt_di_1', 'prev_month_cnt_di_2',
       'prev_month_cnt_di_3', 'prev_month_cnt_di_4', 'prev_month_cnt_di_5',
       'prev_month_cnt_ds_1', 'prev_month_cnt_ds_2', 'prev_month_cnt_ds_3',
       'prev_month_cnt_ds_4', 'prev_month_cnt_ds_5',
       'delta_prev_month_cnt_di_1-2', 'delta_prev_month_cnt_di_2-3',
       'delta_prev_month_cnt_di_3-4', 'delta_prev_month_cnt_di_4-5',
       'item_price_std', 'price_CoefVar', 'max_price', 'min_price',
       'first_sell_month', 'last_sell_month', 'first_last_period'],
      dtype='object')
Test
Shape: (214200, 38)
Columns: Index(['shop_id', 'item_id', 'date_block_num', '

In [45]:
train_all.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,year,month,item_price,city,store_form,shop_store_name,...,delta_prev_month_cnt_di_2-3,delta_prev_month_cnt_di_3-4,delta_prev_month_cnt_di_4-5,item_price_std,price_CoefVar,max_price,min_price,first_sell_month,last_sell_month,first_last_period
0,0,19,0,0,2013,1,3.332031,0,4,50,...,0,0,0,0.0,0.0,28.0,28.0,0,0,0
1,0,27,0,0,2013,1,7.824219,0,4,50,...,0,0,0,610.0,1.757812,2500.0,498.0,0,17,17
2,0,28,0,0,2013,1,6.308594,0,4,50,...,0,0,0,118.1875,0.478516,549.0,126.0,0,14,14
3,0,29,0,0,2013,1,7.824219,0,4,50,...,0,0,0,612.5,2.771484,2500.0,978.0,0,14,14
4,0,32,0,6,2013,1,5.398438,0,4,50,...,0,0,0,100.0,0.77832,349.0,70.625,0,33,33


In [46]:
test.head()

Unnamed: 0,shop_id,item_id,date_block_num,year,month,item_price,city,store_form,shop_store_name,item_category_id,...,delta_prev_month_cnt_di_2-3,delta_prev_month_cnt_di_3-4,delta_prev_month_cnt_di_4-5,item_price_std,price_CoefVar,max_price,min_price,first_sell_month,last_sell_month,first_last_period
0,5,5037,34,2015,11,6.621094,4,8,11,19,...,-11,65,-51,655.0,0.874023,2600.0,749.0,20.0,33.0,13.0
1,5,5320,34,2015,11,5.699219,4,8,11,55,...,0,0,0,0.0,0.057648,299.0,150.25,,,
2,5,5233,34,2015,11,7.089844,4,8,11,19,...,-70,113,-82,282.0,0.235107,1199.0,599.0,27.0,33.0,6.0
3,5,5232,34,2015,11,6.394531,4,8,11,23,...,-17,61,0,277.0,0.462402,1199.0,599.0,31.0,33.0,2.0
4,5,5268,34,2015,11,7.863281,4,8,11,20,...,0,0,0,0.0,,3000.0,1431.0,,,


In [47]:
features = test.columns.tolist()
features.remove("date_block_num")

In [48]:
categorical = ['shop_id', 
               'item_id', 
               'city',
               'store_form',
               'item_category_id',
               'first_cate',
               'second_cate',
               'shop_store_name'
              ]

In [49]:
#train_all.to_csv(save_sub_train_df_name, index=False, header=True)
#test.to_csv(save_sub_test_df_name, index=False, header=True)

## Light Gradient Boosting Regressor

### Function for LGBM

In [50]:
def TrainValidSplit(df, split_date, test_item_id, test_shop_id):
    X_train = df[(df["date_block_num"] <= split_date) &
                 (df["date_block_num"] >= 5)]
    X_valid = df[(df["date_block_num"] > split_date) &
                 (df["item_id"].isin(test_item_id)) &
                 (df["shop_id"].isin(test_shop_id))]
    y_train = copy.deepcopy(X_train["item_cnt_month"])
    y_valid = copy.deepcopy(X_valid["item_cnt_month"])

    X_train.drop(["item_cnt_month"], axis=1, inplace=True)
    X_train.drop(["date_block_num"], axis=1, inplace=True)
    X_valid.drop(["item_cnt_month"], axis=1, inplace=True)
    X_valid.drop(["date_block_num"], axis=1, inplace=True)

    gc.collect()
    return X_train, X_valid, y_train, y_valid

In [51]:
def subData(train_df, test_df):
    X_train = copy.deepcopy(train_df)
        
    y_train = X_train["item_cnt_month"].copy()
    X_train.drop(["item_cnt_month"], axis=1, inplace=True)
    X_train.drop(["date_block_num"], axis=1, inplace=True)
    test_df.drop(["date_block_num"], axis=1, inplace=True)

    gc.collect()
    return X_train, test_df, y_train

In [52]:
def GoValid(X_train, X_valid, y_train, y_valid, lgbm_params):
    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=features,
                          categorical_feature = categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                          feature_name=features,
                          categorical_feature = categorical)

    modelstart = time.time()

    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=10000, # ←as large as possible
        feature_name=features,
        categorical_feature=categorical,
        valid_sets=[lgtrain, lgvalid],
        valid_names=["train", "valid"],
        early_stopping_rounds=50,
        verbose_eval=200
    )

    valid_score = np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
    print("Model Evaluation Stage")
    print('RMSE:', valid_score)
    print("Model Runtime: %0.2f Minutes"%( (time.time() - modelstart)/60) )
    
    f, ax = plt.subplots(figsize=[7, 10])
    lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
    plt.title("Light GBM Feature Inportance")
       
    return valid_score

In [53]:
def GoSubTrain(X_train, test_df, y_train, lgbm_params):
    lgtrain = lgb.Dataset(X_train, y_train,
                          feature_name=features,
                          categorical_feature = categorical)
    
    modelstart = time.time()

    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        feature_name=features,
        categorical_feature=categorical,
        num_boost_round=575, # ← based on the result of valid of early stopping
        verbose_eval=200
    )
    
    f, ax = plt.subplots(figsize=[7, 10])
    lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
    plt.title("Light GBM Feature Inportance")
    
    lgpred = lgb_clf.predict(test)
    lgb_clf.save_model(save_model_name)
    
    return lgpred

### Let's Start

In [54]:
# for valid & sub　 (not for hyperparameter opt)
if (not isHyperOpt): # simpleValid
    lgbm_params = {
        "task": "train",
        "boosting_type": "gbdt",
        "objective": "regression",
        "metric": "rmse",
        #"max_depth":15,
        "num_leaves": 284,
        "min_child_samples": 9, # 9
        "feature_fraction": 0.18,
        "bagging_fraction": 0.18,
        "bagging_freq": 5,
        "learning_rate": 0.01,
        "verbose": 0,
        "seed":SEED, # random_seed
        #"device_type":"gpu",
        #"max_bin" : 15
    }

In [55]:
test_item_id = test["item_id"].unique().tolist()
test_shop_id = test["shop_id"].unique().tolist()

In [56]:
if (not isHyperOpt):
    if (isValid): # Validation (not hyperparameter tuning)
        valid_score_list = []
        print("Train Date:", split_date, ", Valid Date:", split_date+1)
        X_train, X_valid, y_train, y_valid = TrainValidSplit(df=train_all, split_date=split_date, 
                                                             test_item_id=test_item_id, test_shop_id=test_shop_id)
        print("valid item_id number: ", X_valid["item_id"].unique().shape[0])
        valid_score = GoValid(X_train, X_valid, y_train, y_valid, lgbm_params)
        valid_score_list.append(valid_score)

        print("##################")
        print("Mean Valid Score: {0:.5f}".format(sum(valid_score_list)/len(valid_score_list)))
        print("##################")

    else: #submission
        X_train, test, y_train = subData(train_df=train_all, test_df=test)
        
        if (not useTrainedModel):
            print("Try Submission Train")
            lgpred = GoSubTrain(X_train, test, y_train, lgbm_params)
        elif (useTrainedModel):
            print("Use Trained model And Try Submission")
            lgb_clf = lgb.Booster(model_file=save_model_name)
            lgpred = lgb_clf.predict(test)

        lgsub = pd.DataFrame(lgpred, columns=["item_cnt_month"])
        lgsub["item_cnt_month"].clip(0.0, 20.0, inplace=True) # Between 0 and 1
        lgsub["ID"] = lgsub.index

        print("Mean Item_Cnt_Month", lgsub["item_cnt_month"].mean())
        lgsub.to_csv(sub_file_name, index=False, header=True)
        print("Finished:", sub_file_name)    

Use Trained model And Try Submission
Mean Item_Cnt_Month 0.26523076581238675
Finished: ../submission/sub_180803_03_savedmodel.csv


### CV

In [57]:
# Still in the middle of CV
cvstart = time.time()
max_evals = 100
trials = Trials()

space = {
    #"learning_rate": hp.uniform("learning_rate", 0.008, 0.015),
    #"num_leaves": hp.uniform("num_leaves", 240, 340),
    #"min_child_samples": hp.uniform("min_child_samples", 6, 12),#hp.loguniform("min_child_samples", 1, 4)
    "feature_fraction": hp.uniform("feature_fraction", 0.001, 0.25),
    "bagging_fraction": hp.uniform("bagging_fraction", 0.001, 0.25)
}

#18.8.2 best
#num_boost_round:  190
#learning_rate: 0.01252831788816417
#num_leaves: 284
#min_child_samples: 9

#18.8.4 best
#num_boost_round:  638
#num_leaves: 275
#min_child_samples: 9
#eature_fraction: 0.16969512755898708
#bagging_fraction: 0.1261142310387107

In [58]:
def get_lgbm_params(space):
    lgbm_params = dict()
    lgbm_params["task"] = "train"
    lgbm_params["boosting_type"] = "gbdt"
    lgbm_params["objective"] = "regression"
    lgbm_params["metric"] = "rmse"
    lgbm_params["num_leaves"] = int(space["num_leaves"]) if "num_leaves" in space else 280
    lgbm_params["min_child_samples"] = int(space["min_child_samples"]) if "min_child_samples" in space else 9 
    lgbm_params["feature_fraction"] = space["feature_fraction"] if "feature_fraction" in space else 1.0
    lgbm_params["bagging_fraction"] = space["bagging_fraction"] if "bagging_fraction" in space else 1.0
    lgbm_params["bagging_freq"] = 5
    lgbm_params["learning_rate"] = space["learning_rate"] if "learning_rate" in space else 0.008
    lgbm_params["verbose"] = 0
    lgbm_params["seed"] = SEED
     
    return lgbm_params

In [59]:
obj_call_count = 0
cur_best_loss = np.inf
cur_best_iteration = 0

def objective(space):
    global obj_call_count, cur_best_loss, cur_best_iteration
    
    obj_call_count += 1
    print("\nLGBM objective call #{} cur_best_loss={:7.5f}".format(obj_call_count, cur_best_loss))
       
    X_train, X_valid, y_train, y_valid = TrainValidSplit(df=train_all, split_date=split_date,test_item_id=test_item_id, test_shop_id=test_shop_id)
    lgtrain = lgb.Dataset(X_train, y_train,feature_name=features,categorical_feature = categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,feature_name=features,categorical_feature = categorical)
    
    lgbm_params = get_lgbm_params(space)
    
    #sorted_params = sorted(space.iteritems(), key=lambda z: z[0])
    #params_str = str.join(" ", ["{}={}".format(k, v) for k, v in sorted_params])
    #print("Params: {}".format(params_str))
    #print("num_leaves: {}, min_child_samples: {}".format(lgbm_params["num_leaves"], lgbm_params["min_child_samples"]))
    for k in space.keys():
        print("{}: {}".format(k,lgbm_params[k]))
    
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=10000, # ←As large as possible
        feature_name=features,
        categorical_feature=categorical,
        valid_sets=[lgtrain, lgvalid],
        valid_names=["train", "valid"],
        early_stopping_rounds=50,
        verbose_eval=10000
    )
    
    #nb_trees = lgb_clf.best_iteration
    #val_loss = lgb_clf.best_score
    #print('nb_trees={:7.5f}, train rmse={:7.5f}, valid rmse={:7.5f}'.format(nb_trees,val_loss["train"]["rmse"],val_loss["valid"]["rmse"]))

    valid_score = np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
    if valid_score < cur_best_loss:
        cur_best_loss = valid_score
        cur_best_iteration = lgb_clf.best_iteration
    gc.collect()
    
    return valid_score

In [60]:
if (isHyperOpt):
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                verbose=1)
    
    print("*****Best Parameters******")
    int_group = ["num_leaves", "min_child_samples"]
    print("num_boost_round: ", cur_best_iteration)
    for k in space.keys():
        if k in int_group:
            print("{}: {}".format(k,int(best[k])))
        else:
            print("{}: {}".format(k,best[k]))

In [61]:
if (isHyperOpt):
    print("*****Best Parameters (In case of interruption)******")
    int_group = ["num_leaves", "min_child_samples"]
    print("num_boost_round: ", cur_best_iteration)
    for k in space.keys():
        objective_history = trials.losses()
        min_loss = min(objective_history[0:-1])
        min_index = objective_history.index(min_loss)

        k_history = np.ravel([t["misc"]["vals"][k] for t in trials.trials[0:-1]])
        best_k = k_history[min_index]

        if k in int_group:
            print("{}: {}".format(k,int(best_k)))
        else:
            print("{}: {}".format(k,best_k))

In [62]:
###http://blog.hassaku-labs.com/post/hyperopt/###
#from matplotlib.ticker import *

def get_figure_history(x):
    
    n = len(trials.trials[0:-1])
    
    x_history = np.ravel([t["misc"]["vals"][x] for t in trials.trials[0:-1]])
    objective_history = trials.losses()
    
    fig = plt.figure(figsize=(16, 8))
    cm = plt.get_cmap("jet")
    
    PLOT_NUM = 4
    
    for i, hist_num in enumerate(np.linspace(int(n/PLOT_NUM), n, PLOT_NUM)):
        cmap_cycle = [cm(1.*h/(hist_num-1)) for h in range(int(hist_num) - 1)]
        
        ax1 = plt.subplot(2, PLOT_NUM, i+1)
        ax1.set_color_cycle(cmap_cycle)
        ax1.plot(alpha=0.2)#np.arange(-30, 30, 0.1), function(np.arange(-30, 30, 0.1)), alpha=0.2)
        for j in range(int(hist_num) ):
            ax1.plot(x_history[j], objective_history[j], ".")
        ax1.set_title("times: {times}".format(times=int(hist_num)))
        #ax1.set_ylim([np.min(objective_history)*0.95, np.max(objective_history)*1.05])
        ax1.set_xlim([np.min(x_history)*0.95, np.max(x_history)*1.05])
        if i == 0:
            ax1.set_ylabel("valid score")
        ax1.yaxis.set_major_locator(MultipleLocator(0.05)) 
            
        ax2 = plt.subplot(2, PLOT_NUM, PLOT_NUM+i+1)
        ax2.hist(x_history[:int(hist_num)], bins=50)
        if i == 0:
            ax2.set_ylabel("histogram of "+x)
        ax2.set_xlabel(x)
        ax2.set_xlim([np.min(x_history)*0.95, np.max(x_history)*1.05])

In [63]:
if (isHyperOpt):
    for k in space.keys():
        get_figure_history(k)
    print("CV Runtime: %0.2f Minutes"%( (time.time() - cvstart)/60) )

In [64]:
print("Notebook Runtime: %0.2f Minutes"%( (time.time() - notebookstart)/60) )

Notebook Runtime: 3.07 Minutes
