# Cluster Approach 

This is the same XGB vJ approach but with a plot twist! 

We are breaking the datasets into Clusters and we are going to train a model per cluster. 

To see how the clusters were computed, please refer to Pablo's H2O code.

In [1]:
# Name of the CSV file holding the precomputed feature table (read by load_dataset).
dataset_file = 'jorge_dataset_modifiedwPred.csv'
# Switch for the week-11 prediction run: when True, the split cell holds out
# week 10 (instead of week 9) as the validation week.
predict_w11 = True

### Imports

In [2]:
import os

# Directory of the MinGW-w64 runtime on the original Windows machine.
# NOTE(review): presumably needed so that the xgboost import below can load its
# compiled library on Windows -- confirm before removing.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'

# Prepend it to PATH only when the directory actually exists and is not listed
# yet. This keeps the cell idempotent (re-running it no longer grows PATH) and
# leaves PATH untouched on machines that do not have this MinGW install.
# os.pathsep replaces the hard-coded ';' so the separator is always the
# platform's own.
_path_entries = os.environ.get('PATH', '').split(os.pathsep)
if os.path.isdir(mingw_path) and mingw_path not in _path_entries:
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')

import xgboost as xgb
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and pandas warnings, so comment it out while debugging.
warnings.filterwarnings("ignore")




In [3]:
import pandas as pd
import numpy as np
import re 
import os
import time
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.cross_validation import LabelKFold
from sklearn.grid_search import GridSearchCV
import seaborn as sns
%matplotlib inline

In [4]:
# Client -> cluster lookup table. The rest of the notebook uses its
# 'Cliente_ID' and 'Cliente_ID_clust_ID' columns.
clusters_by_ClientDemand = pd.read_csv(filepath_or_buffer="h2o-clustByDem_Cliente_ID.csv")

def load_dataset(folder='', filename=None):
    """Load the precomputed feature table from CSV into a DataFrame.

    Parameters
    ----------
    folder : str
        Directory containing the CSV file. Defaults to the current directory.
    filename : str or None
        Name of the CSV file. When None (the default) the notebook-level
        ``dataset_file`` setting is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Only the columns used by this notebook, with compact dtypes to keep
        memory usage down.
    """
    if filename is None:
        filename = dataset_file
    filepath = os.path.join(folder, filename)

    # Explicit, narrow dtypes for every column that gets one.
    column_dtypes = {
        # identifiers / categorical keys
        'Producto_ID': 'int32',
        'Semana': 'int8',
        'Cliente_ID': 'int32',
        'Agencia_ID': 'uint16',
        'Canal_ID': 'int8',
        'Ruta_SAK': 'int32',
        'ZipCode': 'int16',
        'id': 'int32',
        'week_ct': 'int8',
        # lagged target features
        'Log_Target_mean_lag1': 'float32',
        'Log_Target_mean_lag2': 'float32',
        'Log_Target_mean_lag3': 'float32',
        'Log_Target_mean_lag4': 'float32',
        'Lags_sum': 'float32',
        # other engineered features
        'Qty_Ruta_SAK_Bin': 'float32',
        'num_prod': 'float32',
        'num_prod_uni': 'float32',
        'Last_per_Ruta_SAK': 'float32',
        'Last_per_Cliente_ID': 'float32',
        # target
        'Demanda_uni_equil': 'float32',
    }

    # 'brand' is loaded too but deliberately gets no explicit dtype, so pandas
    # infers it (as in the original code).
    columns = list(column_dtypes) + ['brand']

    return pd.read_csv(filepath, usecols=columns, dtype=column_dtypes)

# Read the feature table from the current working directory.
df = load_dataset(folder='')


In [5]:
print df.Cliente_ID.unique().shape[0], clusters_by_ClientDemand.Cliente_ID.unique().shape[0]

888272 842629


In [6]:
print df.Cliente_ID.unique().shape[0] - clusters_by_ClientDemand.Cliente_ID.unique().shape[0]

45643


In [7]:
# Attach the cluster id to every row. This is a left join, so rows whose client
# is missing from the cluster table are kept and get NaN as their cluster id.
df = pd.merge(df, clusters_by_ClientDemand, how='left', on='Cliente_ID')

In [8]:
# Show the column dtypes after the merge (displayed as the cell output).
df.dtypes

Agencia_ID               uint16
Canal_ID                   int8
Cliente_ID                int32
Demanda_uni_equil       float32
Producto_ID               int32
Ruta_SAK                  int32
Semana                     int8
ZipCode                   int16
id                        int32
Last_per_Cliente_ID     float32
Last_per_Ruta_SAK       float32
week_ct                    int8
Log_Target_mean_lag1    float32
Log_Target_mean_lag2    float32
Log_Target_mean_lag3    float32
Log_Target_mean_lag4    float32
Lags_sum                float32
brand                   float64
Qty_Ruta_SAK_Bin        float32
num_prod                float32
num_prod_uni            float32
Cliente_ID_clust_ID     float64
dtype: object

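I don't know why, but my dataset has more distinct Cliente_ID values than Pablo's cluster table (45,643 more, per the counts above). Because the merge is a left join, those clients get NaN in `Cliente_ID_clust_ID`, so they are not covered by any of the per-cluster models trained below.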

Now, for each cluster, I want to train a model using the `best_param` values found in my XGB vJ Models notebook.

In [9]:
# Week held out for validation: week 10 when preparing the week-11 prediction,
# week 9 otherwise.
validation = 10 if predict_w11 else 9

# Clamp negative targets to zero. One .loc call writes into df itself; the
# previous chained form (df.col[mask] = 0) may silently write to a temporary
# copy, depending on the pandas version.
df.loc[df.Demanda_uni_equil < 0, 'Demanda_uni_equil'] = 0

# Split by week: validation week / earlier weeks / later weeks.
df_validation = df[df.Semana == validation]
# Week 3 is left out of training.
# NOTE(review): presumably because its lag features are empty -- confirm.
df_train = df[(df.Semana != 3) & (df.Semana < validation)]
df_test = df[df.Semana > validation]

# Drop the reference to the full frame to free memory. Re-running this cell
# therefore requires re-running the load and merge cells first.
del df

In [10]:
# Single-point grid holding the chosen hyper-parameters. Every value is a
# one-element list so the dict can be expanded by ParameterGrid like any grid.
best_param = dict(
    max_depth=[10],
    n_estimators=[100],
    learning_rate=[0.1],
    subsample=[0.5],
    # Not tuned here: reg_alpha (L1 term), reg_lambda (L2 term), silent.
)

# Wider search grid (2 x 1 x 2 x 2 = 8 combinations).
param = dict(
    max_depth=[10, 6],
    n_estimators=[50],
    learning_rate=[0.1, 0.05],
    subsample=[0.5, 1],
    # Not tuned here: reg_alpha (L1 term), reg_lambda (L2 term), silent.
)


In [11]:
# Remove Columns that are not features 
#drop_columns = ['id','Semana','Demanda_uni_equil','DemandaAgencia_ID','DemandaCanal_ID','DemandaRuta_SAK','DemandaZipCode']
drop_columns = ['id','Semana','Demanda_uni_equil']
features = df_train.drop(drop_columns,axis=1).columns
print features

Index([u'Agencia_ID', u'Canal_ID', u'Cliente_ID', u'Producto_ID', u'Ruta_SAK',
       u'ZipCode', u'Last_per_Cliente_ID', u'Last_per_Ruta_SAK', u'week_ct',
       u'Log_Target_mean_lag1', u'Log_Target_mean_lag2',
       u'Log_Target_mean_lag3', u'Log_Target_mean_lag4', u'Lags_sum', u'brand',
       u'Qty_Ruta_SAK_Bin', u'num_prod', u'num_prod_uni',
       u'Cliente_ID_clust_ID'],
      dtype='object')


In [12]:
# Feature matrix and target vector for each of the three splits.
X_train, Y_train = df_train[features], df_train['Demanda_uni_equil']
X_valid, Y_valid = df_validation[features], df_validation['Demanda_uni_equil']
X_test, Y_test = df_test[features], df_test['Demanda_uni_equil']

# Week number of every training row.
Semana = df_train['Semana']



In [13]:
# Peek at the first training rows of one example cluster (id 174).
X_train.loc[X_train['Cliente_ID_clust_ID'] == 174].head()

Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Producto_ID,Ruta_SAK,ZipCode,Last_per_Cliente_ID,Last_per_Ruta_SAK,week_ct,Log_Target_mean_lag1,Log_Target_mean_lag2,Log_Target_mean_lag3,Log_Target_mean_lag4,Lags_sum,brand,Qty_Ruta_SAK_Bin,num_prod,num_prod_uni,Cliente_ID_clust_ID
9584,1110,7,2136383,1250,3303,2008,1.94591,2.013359,0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,174.0
9585,1110,7,2136383,325,3303,2008,0.693147,1.29488,1,0.0,0.0,0.0,0.0,0.0,32.0,1.0,9.0,9.0,174.0
9586,1110,7,2136383,1242,3303,2008,2.302585,1.718077,1,0.0,0.0,0.0,0.0,0.0,2.0,1.0,9.0,9.0,174.0
9587,1110,7,2136383,1250,3303,2008,1.098612,2.006562,1,1.098612,0.0,0.0,0.0,1.098612,2.0,1.0,9.0,9.0,174.0
9588,1110,7,2136383,5310,3303,2008,1.94591,2.273084,1,0.0,0.0,0.0,0.0,0.0,14.0,1.0,9.0,9.0,174.0


In [14]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import ParameterGrid

sick_clusters = []
val_pred = pd.DataFrame()

for cluster in clusters_by_ClientDemand.Cliente_ID_clust_ID.unique():
    
    print "Cluster :", cluster
    for g in ParameterGrid(best_param):
        start = time.time()
        
        x_train = X_train[X_train.Cliente_ID_clust_ID == cluster]
        y_train = df_train['Demanda_uni_equil'][df_train.Cliente_ID_clust_ID == cluster]
        
        x_valid = X_valid[X_valid.Cliente_ID_clust_ID == cluster]
        y_valid = df_validation['Demanda_uni_equil'][df_validation.Cliente_ID_clust_ID == cluster]

        grid_gbm = xgb.XGBRegressor(**g)
        grid_gbm.fit(x_train, y_train,eval_set=[(x_valid,y_valid)] ,eval_metric='rmse',verbose='False',early_stopping_rounds=10)
        
        score = grid_gbm.evals_result_.values()[-1].values()[-1][-1]
        print "Score for this combination: ", score
        
        tmp = pd.DataFrame()
        tmp['target'] = y_valid
        tmp['pred'] =  grid_gbm.predict(x_valid)
        tmp['cluster'] = cluster
        val_pred =pd.concat([val_pred,tmp], ignore_index=True)
        
        if(score > 0.5):
            sick_clusters.append((cluster,score))

        print("total time taken this loop: ", time.time() - start)

    #pred_train = gbm.predict(X_train)
    #pred_val = gbm.predict(X_valid)

    #print "R2 Train = ",r2_score(np.float64(Y_train),np.float64(pred_train))
    #print "RMSE Train", np.sqrt(mean_squared_error(np.float64(Y_train),np.float64(pred_train)))
    
    #print "R2 Validation= ",r2_score(np.float64(Y_valid),np.float64(pred_val))
    #print "RMSE Validation ", np.sqrt(mean_squared_error(np.float64(Y_valid),np.float64(pred_val)))
    


Cluster : 176
Will train until validation_0-rmse hasn't improved in 10 rounds.
Stopping. Best iteration:
[24]	validation_0-rmse:0.687273

Score for this combination:  0.700155
('total time taken this loop: ', 19.874000072479248)
Cluster : 307
Will train until validation_0-rmse hasn't improved in 10 rounds.
Stopping. Best iteration:
[28]	validation_0-rmse:0.738238

Score for this combination:  0.758507
('total time taken this loop: ', 1.7349998950958252)
Cluster : 139
Will train until validation_0-rmse hasn't improved in 10 rounds.
Stopping. Best iteration:
[25]	validation_0-rmse:0.798207

Score for this combination:  0.815009
('total time taken this loop: ', 4.0940001010894775)
Cluster : 19
Will train until validation_0-rmse hasn't improved in 10 rounds.
Stopping. Best iteration:
[20]	validation_0-rmse:1.40199

Score for this combination:  1.428401
('total time taken this loop: ', 0.4679999351501465)
Cluster : 335
Will train until validation_0-rmse hasn't improved in 10 rounds.
Stoppin

XGBoostError: [03:47:37] src/metric/elementwise_metric.cc:27: Check failed: (info.labels.size()) != (0) label set cannot be empty

In [None]:
print "Total Sick Clusters :", len(sick_clusters)

In [None]:
import seaborn as sns

# Distribution of cluster ids over the training rows. Rows without a cluster
# (NaN) are mapped to -999 so they show up as a separate spike.
sns.distplot(a=df_train['Cliente_ID_clust_ID'].fillna(value=-999))

I like that the 400 clusters are roughly uniformly distributed.

In [None]:
print "R2 Validation= ",r2_score(np.float64(val_pred.target),np.float64(val_pred.pred))
print "RMSE Validation ", np.sqrt(mean_squared_error(val_pred.target,np.float64(val_pred.pred)))

In [None]:
# Tabulate the flagged clusters: one row per (cluster id, validation RMSE) pair.
df_sick = pd.DataFrame(data=sick_clusters, columns=['ClusID', 'RMSE'])

In [None]:
print 100* np.float(X_train[X_train.Cliente_ID_clust_ID.isin(df_sick.ClusID)].shape[0]) / X_train.shape[0]

Let's train using all the sick clusters:

In [None]:

sick_val_pred = pd.DataFrame()

    

for g in ParameterGrid(best_param):
    start = time.time()
        
    x_train = X_train[X_train.Cliente_ID_clust_ID.isin(df_sick.ClusID)]
    y_train = df_train['Demanda_uni_equil'][df_train.Cliente_ID_clust_ID.isin(df_sick.ClusID)]
        
    x_valid = X_valid[X_valid.Cliente_ID_clust_ID.isin(df_sick.ClusID)]
    y_valid = df_validation['Demanda_uni_equil'][df_validation.Cliente_ID_clust_ID.isin(df_sick.ClusID)]

    sick_gbm = xgb.XGBRegressor(**g)
    sick_gbm.fit(x_train, y_train,eval_set=[(x_valid,y_valid)] ,eval_metric='rmse',verbose='True',early_stopping_rounds=10)
        
    score = sick_gbm.evals_result_.values()[-1].values()[-1][-1]
    print "Score for this combination: ", score
        
    tmp = pd.DataFrame()
    tmp['target'] = y_valid
    tmp['pred'] =  sick_gbm.predict(x_valid)
    tmp['cluster'] = df_train.Cliente_ID_clust_ID
    sick_val_pred =pd.concat([sick_val_pred,tmp], ignore_index=True)
        
    print("total time taken this loop: ", time.time() - start)

    #pred_train = gbm.predict(X_train)
    #pred_val = gbm.predict(X_valid)

    #print "R2 Train = ",r2_score(np.float64(Y_train),np.float64(pred_train))
    #print "RMSE Train", np.sqrt(mean_squared_error(np.float64(Y_train),np.float64(pred_train)))
    
    #print "R2 Validation= ",r2_score(np.float64(Y_valid),np.float64(pred_val))
    #print "RMSE Validation ", np.sqrt(mean_squared_error(np.float64(Y_valid),np.float64(pred_val)))
    


In [None]:
good_pred = val_pred[~val_pred.cluster.isin(sick_clusters)]
total_pred = pd.concat([good_pred, sick_val_pred], ignore_index=True)
print "R2 Validation= ",r2_score(np.float64(total_pred.target),np.float64(total_pred.pred))
print "RMSE Validation ", np.sqrt(mean_squared_error(total_pred.target,np.float64(total_pred.pred)))

In [None]:
print "R2 Validation= ",r2_score(np.float64(good_pred.target),np.float64(good_pred.pred))
print "RMSE Validation ", np.sqrt(mean_squared_error(good_pred.target,np.float64(good_pred.pred)))

print "R2 Validation= ",r2_score(np.float64(sick_val_pred.target),np.float64(sick_val_pred.pred))
print "RMSE Validation ", np.sqrt(mean_squared_error(sick_val_pred.target,np.float64(sick_val_pred.pred)))

In [None]:
# Stack the training and validation weeks into one frame with a fresh index.
df_all = pd.concat([df_train, df_validation], ignore_index=True)

# Feature matrix and target over all labelled weeks.
X, Y = df_all[features], df_all['Demanda_uni_equil']

In [None]:
good_clusters = np.setdiff1d(clusters_by_ClientDemand.Cliente_ID_clust_ID.unique(),df_sick.ClusID)

val_pred = pd.DataFrame()


from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import ParameterGrid

for cluster in clusters_by_ClientDemand.Cliente_ID_clust_ID.unique():

    for g in ParameterGrid(best_param):
        start = time.time()
        

        x = X
        y = df_all['Demanda_uni_equil']

        gbm = xgb.XGBRegressor(**g)
        print y.shape
        display(y.head())
        display(x.head())
        print x.shape
        gbm.fit(x, y ,eval_metric='rmse',early_stopping_rounds=10)


        print("total time taken this loop: ", time.time() - start)

    tmp = pd.DataFrame()
    tmp['target'] = y
    tmp['pred'] =  gbm.predict(x)
    tmp['cluster'] = X.Cliente_ID_clust_ID
    val_pred =pd.concat([val_pred,tmp], ignore_index=True)
    
    pred = gbm.predict(x)

    print "R2 = ",r2_score(np.float64(y),np.float64(pred))
    print "RMSE", np.sqrt(mean_squared_error(np.float64(y),np.float64(pred)))

In [None]:
sick_val_pred = pd.DataFrame()

for g in ParameterGrid(best_param):
    start = time.time()
        
    x = X[X.Cliente_ID_clust_ID.isin(df_sick.ClusID)]
    y = df_all['Demanda_uni_equil'][df_.Cliente_ID_clust_ID.isin(df_sick.ClusID)]
        
    
    sick_gbm = xgb.XGBRegressor(**g)
    sick_gbm.fit(x, y ,eval_metric='rmse',verbose='True',early_stopping_rounds=10)
        
    score = sick_gbm.evals_result_.values()[-1].values()[-1][-1]
    print "Score for this combination: ", score
        
    tmp = pd.DataFrame()
    tmp['target'] = y
    tmp['pred'] =  sick_gbm.predict(x)
    tmp['cluster'] = x.Cliente_ID_clust_ID
    sick_val_pred =pd.concat([sick_val_pred,tmp], ignore_index=True)
        
    print("total time taken this loop: ", time.time() - start)
    
    pred = gbm.predict(x)

    print "R2 = ",r2_score(np.float64(y),np.float64(pred))
    print "RMSE", np.sqrt(mean_squared_error(np.float64(y),np.float64(pred)))
