In [7]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor, CatboostIpythonWidget
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

In [2]:
def rmspe(y,y_pred):
    summ = 0
    for i in range(len(y)):
        if y.iloc[i] != 0:
            summ += (1-y_pred[i]/y.iloc[i])**2
    return sqrt(summ/len(y))

def substract_cols(df):
    df['CompetitionOpen'] = 12 * (df.Year - df['CompetitionOpenSinceYear']) + (df.Month - df['CompetitionOpenSinceMonth'])
    df['CompetitionOpen'] = df.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    df.loc[df['CompetitionOpenSinceYear'] < 0, 'CompetitionOpen'] = 0
    df['PromoOpen'] = 12 * (df.Year - df.Promo2SinceYear) + (df['WeekOfYear'] - df['Promo2SinceWeek']) / 4.0
    df['PromoOpen'] = df.PromoOpen.apply(lambda x: x if x > 0 else 0)
    df.loc[df['Promo2SinceYear'] == 0, 'PromoOpen'] = 0
    df.loc[df['Promo2SinceYear'] < 0, 'PromoOpen'] = 0
    df.drop(['CompetitionOpenSinceYear',
             'CompetitionOpenSinceMonth',
             'Promo2SinceYear',
             'Promo2SinceWeek'], axis=1, inplace=True)
    
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
         7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    df['monthStr'] = df.Month.map(month2str)
    df.loc[df.PromoInterval == -1, 'PromoInterval'] = ''
    df['IsPromoMonth'] = 0
    for interval in df.PromoInterval.unique():
            if interval != '':
                for month in interval.split(','):
                    df.loc[(df.monthStr == month) & (df.PromoInterval == interval), 'IsPromoMonth'] = 1
    df.drop(['monthStr'], axis=1, inplace=True)
    
#     df['p_1'] = df.PromoInterval.apply(lambda x: x[:3] if type(x) == str else 0)
#     df['p_2'] = df.PromoInterval.apply(lambda x: x[4:7] if type(x) == str else 0)
#     df['p_3'] = df.PromoInterval.apply(lambda x: x[8:11] if type(x) == str else 0)
#     df['p_4'] = df.PromoInterval.apply(lambda x: x[12:15] if type(x) == str else 0)
    
    return df

class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                print ("Fit Model %d fold %d" % (i, j))
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
        # exit()

        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_test)[:]
        return res

In [3]:
df_train = pd.read_csv('data/train_v2.csv')
df_train['Date']  = pd.to_datetime(df_train['Date'], errors='coerce')
df_train['Year'] = df_train['Date'].dt.year
df_train['Month'] = df_train['Date'].dt.month
df_train['WeekOfYear'] = df_train['Date'].dt.weekofyear
df_train['Day'] = df_train['Date'].dt.day
df_train = df_train.drop('Date', axis=1)

df_test = pd.read_csv('data/test_v2.csv')
df_test['Date']  = pd.to_datetime(df_test['Date'], errors='coerce')
df_test['Year'] = df_test['Date'].dt.year
df_test['Month'] = df_test['Date'].dt.month
df_test['WeekOfYear'] = df_test['Date'].dt.weekofyear
df_test['Day'] = df_test['Date'].dt.day
df_test = df_test.drop('Date', axis=1)

df_store = pd.read_csv('data/store.csv')
df_store = df_store.fillna(-1)

# df_train = df_train[df_train.Sales > 0]
# df_train = df_train[df_train.Customers > 0]
df_train = df_train[df_train['Open'] != 0]

df_train_store = df_train.join(df_store.set_index('Store'), on='Store')
df_train_store = substract_cols(df_train_store)
df_train_store.drop(['Open','PromoInterval','Year','Month','WeekOfYear'], axis=1, inplace=True)

df_test_store = df_test.join(df_store.set_index('Store'), on='Store')
df_test_store = substract_cols(df_test_store)
df_test_store.drop( ['Open','PromoInterval','Year','Month','WeekOfYear'], axis=1, inplace=True)

mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
df_train_store.StateHoliday.replace(mappings, inplace=True)
df_train_store.StoreType.replace(mappings, inplace=True)
df_train_store.Assortment.replace(mappings, inplace=True)
df_test_store.StoreType.replace(mappings, inplace=True)
df_test_store.Assortment.replace(mappings, inplace=True)


df_train_store = df_train_store.astype(np.float32)
df_test_store = df_test_store.astype(np.float32)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
X = df_train_store.drop('Sales', axis=1)
y = df_train_store.Sales

X_test = df_test_store

categorical_features_indices = [0,1,3,4,5,6,7,8,10,13]

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.85, random_state=1234)



In [5]:
X.head()

Unnamed: 0,Store,DayOfWeek,Customers,Promo,StateHoliday,SchoolHoliday,Day,StoreType,Assortment,CompetitionDistance,Promo2,CompetitionOpen,PromoOpen,IsPromoMonth
0,1.0,5.0,616.0,1.0,0.0,0.0,30.0,3.0,1.0,1270.0,0.0,76.0,0.0,0.0
1,2.0,5.0,624.0,1.0,0.0,0.0,30.0,1.0,1.0,570.0,1.0,86.0,58.0,1.0
2,3.0,5.0,678.0,1.0,0.0,0.0,30.0,1.0,1.0,14130.0,1.0,97.0,45.75,1.0
3,4.0,5.0,1632.0,1.0,0.0,0.0,30.0,3.0,3.0,620.0,0.0,64.0,0.0,0.0
4,5.0,5.0,617.0,1.0,0.0,0.0,30.0,1.0,1.0,29910.0,0.0,0.0,0.0,0.0


In [13]:
# RF model
rf_model = RandomForestRegressor(max_depth=6,n_estimators=250)

# ET model
et_model = ExtraTreesRegressor()

# SVR model
# SVM is too slow in more then 10000 set
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)

# DecsionTree model
dt_model = DecisionTreeRegressor()

# AdaBoost model
ada_model = AdaBoostRegressor()

#CatBoost
cat_boost = CatBoostRegressor()


stack = Ensemble(n_splits=3,
        stacker=LinearRegression(),
        base_models=(svr_model, rf_model, et_model, dt_model, ada_model, cat_boost))

y_test = stack.fit_predict(X, y, X_test)

from datetime import datetime
print("submit...")
pre = y_test
# sub = pd.read_csv('../input/sample_submission.csv')
# for c in sub.columns[sub.columns != 'ParcelId']:
#     sub[c] = pre
# submit_file = '{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M'))
# sub.to_csv(submit_file, index=False,  float_format='%.4f')

Fit Model 0 fold 0
Fit Model 0 fold 1
Fit Model 0 fold 2
Fit Model 1 fold 0
Fit Model 1 fold 1
Fit Model 1 fold 2
Fit Model 2 fold 0
Fit Model 2 fold 1
Fit Model 2 fold 2
Fit Model 3 fold 0
Fit Model 3 fold 1
Fit Model 3 fold 2
Fit Model 4 fold 0
Fit Model 4 fold 1
Fit Model 4 fold 2
Fit Model 5 fold 0
Fit Model 5 fold 1
Fit Model 5 fold 2
Stacker score: 0.9302 (0.0073)
submit...


In [14]:
nic = pd.read_csv('nic_submission.csv')
nic_pred = nic.Sales

for i in range (X_test.shape[0]) :
    if df_test_store['Customers'].iloc[i] == 0 :
        pre[i] = 0
        
e_nic = rmspe(nic_pred, pre)
print("----------//----------")
print("e_nic is ",e_nic)

----------//----------
e_nic is  0.0601365324664352


In [15]:
submission = pd.DataFrame()
submission['Sales'] = pre
# submission.Sales = submission.Sales.astype(int)
cols = ['Id','Sales']
submission['Id'] = submission.index + 1
submission = submission[cols]
submission.to_csv('submission.csv', index=False)