In [1]:
import pandas as pd
import numpy as np
from math import sqrt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

  return f(*args, **kwds)


In [2]:
# evaluation metric
def rmspe(y, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Rows whose true value is zero are excluded, because the percentage error
    is undefined there. They are left out of both the sum of squared errors
    and the averaging denominator, so the score is the mean over the rows
    that are actually evaluated.

    Parameters
    ----------
    y : array-like
        True target values (pandas Series, NumPy array, list, ...).
    y_pred : array-like
        Predicted values, matched to ``y`` by position (never by index label).

    Returns
    -------
    float
        ``sqrt(mean((1 - y_pred / y) ** 2))`` over the rows where ``y != 0``,
        or ``0.0`` when no row has a non-zero target.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not contain the same number of elements.
    """
    # Work on plain float64 arrays so the pairing is purely positional and
    # low-precision inputs (e.g. float32 frames) do not lose accuracy.
    actual = np.asarray(y, dtype=np.float64).ravel()
    predicted = np.asarray(y_pred, dtype=np.float64).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            "y and y_pred must have the same length, got %d and %d"
            % (actual.size, predicted.size)
        )

    scored = actual != 0
    if not scored.any():
        return 0.0

    relative_error = 1.0 - predicted[scored] / actual[scored]
    return sqrt(np.mean(relative_error ** 2))

def promo_splitting(df):
    """Add an ``IsPromoMonth`` flag column to ``df`` and return the same frame.

    ``df`` is modified in place:

    * ``PromoInterval`` values equal to ``-1`` are replaced by ``''``;
    * ``IsPromoMonth`` is set to 1 on rows whose ``Month`` (1-12) appears, as
      an abbreviation, in that row's comma-separated ``PromoInterval`` string,
      and to 0 on every other row.

    September is abbreviated ``'Sept'`` here, so interval strings must use
    that spelling to match.
    """
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
    number_to_name = dict(enumerate(month_names, start=1))

    # Temporary helper column holding each row's month abbreviation.
    df['monthStr'] = df.Month.map(number_to_name)

    missing_interval = df.PromoInterval == -1
    df.loc[missing_interval, 'PromoInterval'] = ''

    df['IsPromoMonth'] = 0
    for interval in df.PromoInterval.unique():
        if interval == '':
            continue
        # Rows sharing this interval whose month is one of the listed months.
        in_promo_month = (df.PromoInterval == interval) & df.monthStr.isin(interval.split(','))
        df.loc[in_promo_month, 'IsPromoMonth'] = 1

    df.drop(['monthStr'], axis=1, inplace=True)
    return df

# class for stacking different models
class Ensemble(object):
    """Two-level stacking ensemble.

    Every base model is fitted once per K-fold split. Its predictions on the
    held-out rows become one column of the level-two training matrix, and its
    predictions on the test rows, averaged over the folds, become the matching
    column of the level-two test matrix. The ``stacker`` is then fitted on the
    level-two training matrix and used to predict the test rows.

    Parameters
    ----------
    n_splits : int
        Number of K-fold splits used to build the out-of-fold predictions.
    stacker : estimator
        Level-two model; must provide ``fit(X, y)`` and ``predict(X)``.
    base_models : sequence of estimators
        Level-one models; each must provide ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        """Fit the stack on ``(X, y)`` and return its predictions for ``T``.

        Parameters
        ----------
        X : array-like, one row per training sample
            Training features.
        y : array-like
            Training targets, aligned with the rows of ``X``.
        T : array-like, one row per test sample
            Test features, with the same columns as ``X``.

        Returns
        -------
        The value of ``stacker.predict`` on the level-two test matrix.

        Notes
        -----
        The base models are refitted in place for every fold, so after the
        call each one is left fitted on the last fold's training split only.
        The stacker's cross-validated R^2 (always 5 folds) is printed for
        information and does not influence the returned predictions.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Materialise the folds once so every base model sees identical splits.
        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                print ("Fit Model %d fold %d" % (i, j))
                clf.fit(X[train_idx], y[train_idx])

                # ravel() lets models that return column vectors of shape
                # (n, 1) fill the 1-D slots below instead of raising.
                S_train[test_idx, i] = np.ravel(clf.predict(X[test_idx]))
                S_test_i[:, j] = np.ravel(clf.predict(T))
            # Average this model's test predictions over the folds.
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
        self.stacker.fit(S_train, y)

        # The final result is the stacker's combination of the base models'
        # fold-averaged test predictions.
        return self.stacker.predict(S_test)

In [3]:
# load the raw tables, split the date into calendar parts, join the store
# attributes and turn every column into a number

def _add_date_parts(frame):
    """Return ``frame`` with ``Date`` replaced by Year/Month/WeekOfYear/Day columns.

    Unparseable dates are coerced to NaT, so their derived parts are missing.
    """
    dates = pd.to_datetime(frame['Date'], errors='coerce')
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
    # replacement. Fall back to the old accessor on versions that predate it.
    if hasattr(dates.dt, 'isocalendar'):
        week_of_year = dates.dt.isocalendar().week
    else:
        week_of_year = dates.dt.weekofyear
    return frame.assign(
        Year=dates.dt.year,
        Month=dates.dt.month,
        WeekOfYear=week_of_year,
        Day=dates.dt.day,
    ).drop('Date', axis=1)


def _attach_store_features(frame, stores):
    """Join the store attributes onto ``frame``, add IsPromoMonth, drop helper columns."""
    merged = frame.join(stores.set_index('Store'), on='Store')
    merged = promo_splitting(merged)
    # Month is only needed by promo_splitting; none of these are model features.
    return merged.drop(['Open', 'PromoInterval', 'Year', 'Month', 'WeekOfYear'], axis=1)


df_train = _add_date_parts(pd.read_csv('data/train_v2.csv'))
df_test = _add_date_parts(pd.read_csv('data/test_v2.csv'))

# fill missing store attributes with the sentinel -1
df_store = pd.read_csv('data/store.csv').fillna(-1)

# discard training rows for days on which the store was closed
df_train = df_train[df_train['Open'] != 0]

# join train/test with the store table and drop redundant columns
df_train_store = _attach_store_features(df_train, df_store)
df_test_store = _attach_store_features(df_test, df_store)

# map string category codes to numbers; the same columns are encoded in both
# frames so that train and test stay consistent
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
for _frame in (df_train_store, df_test_store):
    for _col in ('StateHoliday', 'StoreType', 'Assortment'):
        if _col in _frame.columns:
            # Assign the result back: an in-place replace on an attribute-accessed
            # column is not guaranteed to reach the parent frame.
            _frame[_col] = _frame[_col].replace(mappings)

df_train_store = df_train_store.astype(np.float32)
df_test_store = df_test_store.astype(np.float32)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows of the merged training frame.
df_train_store.head(n=5)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,Day,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,IsPromoMonth
0,1.0,5.0,5577.0,616.0,1.0,0.0,0.0,30.0,3.0,1.0,1270.0,9.0,2008.0,0.0,-1.0,-1.0,0.0
1,2.0,5.0,5919.0,624.0,1.0,0.0,0.0,30.0,1.0,1.0,570.0,11.0,2007.0,1.0,13.0,2010.0,1.0
2,3.0,5.0,6911.0,678.0,1.0,0.0,0.0,30.0,1.0,1.0,14130.0,12.0,2006.0,1.0,14.0,2011.0,1.0
3,4.0,5.0,13307.0,1632.0,1.0,0.0,0.0,30.0,3.0,3.0,620.0,9.0,2009.0,0.0,-1.0,-1.0,0.0
4,5.0,5.0,5640.0,617.0,1.0,0.0,0.0,30.0,1.0,1.0,29910.0,4.0,2015.0,0.0,-1.0,-1.0,0.0


In [5]:
# Sales is the target; every remaining column is a feature.
X = df_train_store.drop(['Sales'], axis=1)
y = df_train_store['Sales']

# The test features are used as they are (same object, not a copy).
X_test = df_test_store

# Keep 85% of the training rows for fitting and hold out the rest; the fixed
# seed makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.85, random_state=1234
)



In [6]:
# hyperparameters of the base models
depth=6
estimators=250
learning_rate=0.1

# catboost model parameters
cat_boost_params = {}
cat_boost_params['learning_rate'] = learning_rate
# `depth` was previously defined but never handed to the model. 6 is CatBoost's
# documented default depth, so passing it keeps the model unchanged while
# making later edits to `depth` take effect.
cat_boost_params['depth'] = depth

# random forests model parameters
rf_params = {}
rf_params['n_estimators'] = estimators
# Seed the forest so repeated runs of the notebook give the same predictions.
rf_params['random_state'] = 1234

# base models
rf_model = RandomForestRegressor(**rf_params)
cat_boost = CatBoostRegressor(**cat_boost_params)

stack = Ensemble(n_splits=5,
        stacker=LinearRegression(),
        base_models=(cat_boost, rf_model))

# Fit on the full training data and predict the test rows.
y_test = stack.fit_predict(X, y, X_test)

print("submit...")
# `pred` is the same array object as `y_test`; the zeroing below changes both.
pred = y_test

# assign 0 to rows where there were no customers (vectorised, positional mask)
no_customers = (df_test['Customers'] == 0).values
pred[no_customers] = 0

Fit Model 0 fold 0
Fit Model 0 fold 1
Fit Model 0 fold 2
Fit Model 0 fold 3
Fit Model 0 fold 4
Fit Model 1 fold 0
Fit Model 1 fold 1
Fit Model 1 fold 2
Fit Model 1 fold 3
Fit Model 1 fold 4
Stacker score: 0.9474 (0.0062)
submit...


In [9]:
# `pred` holds predictions for the test rows, which have nothing to do with
# the hold-out rows behind `y_validation`, so scoring one against the other is
# meaningless. Refit the stack on the training split only and score its
# predictions for the hold-out features instead.
# Note: this retrains every base model (n_splits fits each) and leaves `stack`
# fitted on X_train; the test predictions already stored in `pred` are unaffected.
validation_pred = stack.fit_predict(X_train, y_train, X_validation)
e_out = rmspe(y_validation, validation_pred)
print("----------//----------")
print("e_out is ",e_out)

In [10]:
# Build the submission table: a 1-based Id followed by the predicted Sales
# (left as floats), then write it without the index column.
cols = ['Id','Sales']
submission = (
    pd.DataFrame({'Sales': pred})
    .assign(Id=lambda frame: frame.index + 1)
    [cols]
)
submission.to_csv('submission.csv', index=False)