In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor, CatboostIpythonWidget
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor

  return f(*args, **kwds)


In [2]:
def rmspe(y, y_pred):
    """Root mean squared percentage error between actual and predicted values.

    Rows whose actual value is 0 add nothing to the squared-error sum (the
    percentage error is undefined there) but are still counted in the
    denominator, i.e. the mean is taken over ``len(y)``.

    The two inputs are paired strictly by position. Any pandas index on
    ``y`` or ``y_pred`` is ignored, so a prediction Series with a shuffled
    or non-default index is not accidentally looked up by label.

    Parameters
    ----------
    y : array-like
        Actual values (e.g. a pandas Series, NumPy array or list).
    y_pred : array-like
        Predicted values, same number of elements as ``y``.

    Returns
    -------
    float
        ``sqrt(sum((1 - y_pred / y) ** 2 over rows with y != 0) / len(y))``.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not hold the same number of elements.
    ZeroDivisionError
        If ``y`` is empty.
    """
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            'y and y_pred must have the same length, got %d and %d'
            % (len(actual), len(predicted))
        )

    # Only rows with a non-zero actual value have a defined percentage error.
    nonzero = actual != 0
    ratio = predicted[nonzero] / actual[nonzero]
    # float() keeps the division below in plain Python so an empty input
    # raises ZeroDivisionError instead of quietly returning nan.
    summ = float(np.sum((1.0 - ratio) ** 2))
    return sqrt(summ / len(actual))

def substract_cols(df):
    """Derive competition / promo duration features on a sales+store frame.

    ``df`` is modified in place and also returned. It must provide the
    columns ``Year``, ``Month``, ``WeekOfYear``, ``CompetitionOpenSinceYear``,
    ``CompetitionOpenSinceMonth``, ``Promo2SinceYear``, ``Promo2SinceWeek``
    and ``PromoInterval``.

    Columns added:
      * ``CompetitionOpen`` - months since the competitor opened, floored at
        0 and forced to 0 where ``CompetitionOpenSinceYear`` is negative.
      * ``PromoOpen`` - months since Promo2 started (weeks / 4), floored at 0
        and forced to 0 where ``Promo2SinceYear`` is zero or negative.
      * ``monthStr`` - month abbreviation of ``Month``.
      * ``IsPromoMonth`` - 1 when ``monthStr`` is listed in the row's
        comma-separated ``PromoInterval``, otherwise 0.

    ``PromoInterval`` values equal to -1 are rewritten to ``''`` and the four
    ``...Since...`` source columns are dropped.
    """
    def _floor_at_zero(value):
        # Anything not strictly positive (including NaN) becomes 0.
        return value if value > 0 else 0

    competition_months = (
        12 * (df.Year - df['CompetitionOpenSinceYear'])
        + (df.Month - df['CompetitionOpenSinceMonth'])
    )
    df['CompetitionOpen'] = competition_months.apply(_floor_at_zero)
    df.loc[df['CompetitionOpenSinceYear'] < 0, 'CompetitionOpen'] = 0

    promo_months = (
        12 * (df.Year - df.Promo2SinceYear)
        + (df['WeekOfYear'] - df['Promo2SinceWeek']) / 4.0
    )
    df['PromoOpen'] = promo_months.apply(_floor_at_zero)
    # A start year of zero or below means there is no usable Promo2 start.
    df.loc[df['Promo2SinceYear'] <= 0, 'PromoOpen'] = 0

    since_columns = ['CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth',
                     'Promo2SinceYear', 'Promo2SinceWeek']
    df.drop(since_columns, axis=1, inplace=True)

    # 'Sept' (not 'Sep') is deliberate: these labels are compared with the
    # month names stored inside PromoInterval.
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
    df['monthStr'] = df.Month.map(dict(enumerate(month_labels, start=1)))

    df.loc[df.PromoInterval == -1, 'PromoInterval'] = ''
    df['IsPromoMonth'] = 0
    for interval in df.PromoInterval.unique():
        if interval == '':
            continue
        in_listed_month = df.monthStr.isin(interval.split(','))
        df.loc[(df.PromoInterval == interval) & in_listed_month, 'IsPromoMonth'] = 1

    return df

In [3]:
def _read_with_date_parts(csv_path):
    """Read a sales CSV and replace its ``Date`` column with numeric parts.

    Adds ``Year``, ``Month``, ``WeekOfYear`` and ``Day`` derived from ``Date``
    (unparseable dates are coerced to NaT) and returns the frame without the
    ``Date`` column.
    """
    frame = pd.read_csv(csv_path)
    dates = pd.to_datetime(frame['Date'], errors='coerce')
    frame['Year'] = dates.dt.year
    frame['Month'] = dates.dt.month
    # NOTE: ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed
    # in 2.0; on newer pandas use ``dates.dt.isocalendar().week`` instead.
    frame['WeekOfYear'] = dates.dt.weekofyear
    frame['Day'] = dates.dt.day
    return frame.drop('Date', axis=1)


df_train = _read_with_date_parts('data/train_v2.csv')
df_test = _read_with_date_parts('data/test_v2.csv')

# Missing store attributes become -1; substract_cols treats negative
# "since" years and a PromoInterval of -1 as "not available".
df_store = pd.read_csv('data/store.csv')
df_store = df_store.fillna(-1)

# df_train = df_train[df_train.Sales > 0]
# df_train = df_train[df_train.Customers > 0]
# Train only on rows where the store was open.
df_train = df_train[df_train['Open'] != 0]

# Columns needed only while deriving features; removed from the model input.
helper_cols = ['Open', 'PromoInterval', 'Year', 'Month', 'WeekOfYear']

df_train_store = df_train.join(df_store.set_index('Store'), on='Store')
df_train_store = substract_cols(df_train_store).drop(helper_cols, axis=1)

df_test_store = df_test.join(df_store.set_index('Store'), on='Store')
df_test_store = substract_cols(df_test_store).drop(helper_cols, axis=1)

# Encode the letter-coded columns as integers. The SAME mapping is applied to
# the same columns of both frames (the test StateHoliday column used to be
# skipped), so a category has one code at fit time and at predict time.
# The result is assigned back explicitly rather than through
# ``frame.col.replace(..., inplace=True)``, which is chained assignment and is
# not guaranteed to modify the frame under pandas Copy-on-Write.
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
for frame in (df_train_store, df_test_store):
    for col in ('StateHoliday', 'StoreType', 'Assortment'):
        frame[col] = frame[col].replace(mappings)


# df_train_store = df_train_store.astype(np.float32)
# df_test_store = df_test_store.astype(np.float32)

# df_train_store.drop(['CompetitionDistance', 'CompetitionOpenSinceMonth',
#        'CompetitionOpenSinceYear', 'Promo2SinceWeek',
#        'Promo2SinceYear', 'PromoInterval'], axis=1, inplace=True)
# df_test_store.drop(['CompetitionDistance', 'CompetitionOpenSinceMonth',
#        'CompetitionOpenSinceYear', 'Promo2SinceWeek',
#        'Promo2SinceYear', 'PromoInterval'], axis=1, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first rows of the merged training frame to sanity-check the derived columns.
df_train_store.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,Day,StoreType,Assortment,CompetitionDistance,Promo2,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,5,5577,616,1,0,0,30,3,1,1270.0,0,76.0,0.0,Jan,0
1,2,5,5919,624,1,0,0,30,1,1,570.0,1,86.0,58.0,Jan,1
2,3,5,6911,678,1,0,0,30,1,1,14130.0,1,97.0,45.75,Jan,1
3,4,5,13307,1632,1,0,0,30,3,3,620.0,0,64.0,0.0,Jan,0
4,5,5,5640,617,1,0,0,30,1,1,29910.0,0,0.0,0.0,Jan,0


In [5]:
# Preview the first rows of the merged test frame; its columns should match the training frame minus 'Sales'.
df_test_store.head()

Unnamed: 0,Store,DayOfWeek,Customers,Promo,StateHoliday,SchoolHoliday,Day,StoreType,Assortment,CompetitionDistance,Promo2,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,5,555,1,0,1,31,3,1,1270.0,0,82.0,0.0,Jul,0
1,2,5,625,1,0,1,31,1,1,570.0,1,92.0,64.5,Jul,1
2,3,5,821,1,0,1,31,1,1,14130.0,1,103.0,52.25,Jul,1
3,4,5,1498,1,0,1,31,3,3,620.0,0,70.0,0.0,Jul,0
4,5,5,559,1,0,1,31,1,1,29910.0,0,3.0,0.0,Jul,0


In [6]:
# Feature matrix and target taken from the merged training frame.
X = df_train_store.drop('Sales', axis=1)
y = df_train_store.Sales

# Select the test columns by name in the training order, so positional
# feature indices mean the same thing for both frames. This raises a KeyError
# if the test frame lacks a training column.
X_test = df_test_store[X.columns]

# Columns CatBoost should treat as categorical. They are listed by name and
# converted to positions in X, so the indices stay correct if columns are
# added, removed or reordered upstream.
categorical_feature_names = [
    'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Day',
    'StoreType', 'Assortment', 'Promo2', 'monthStr', 'IsPromoMonth',
]
categorical_features_indices = [X.columns.get_loc(name) for name in categorical_feature_names]

# 85 % / 15 % hold-out split with a fixed seed for reproducibility.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.85, random_state=1234)



In [7]:
# Preview the feature matrix; the column order shown here is the order the model is fitted on.
X.head()

Unnamed: 0,Store,DayOfWeek,Customers,Promo,StateHoliday,SchoolHoliday,Day,StoreType,Assortment,CompetitionDistance,Promo2,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,5,616,1,0,0,30,3,1,1270.0,0,76.0,0.0,Jan,0
1,2,5,624,1,0,0,30,1,1,570.0,1,86.0,58.0,Jan,1
2,3,5,678,1,0,0,30,1,1,14130.0,1,97.0,45.75,Jan,1
3,4,5,1632,1,0,0,30,3,3,620.0,0,64.0,0.0,Jan,0
4,5,5,617,1,0,0,30,1,1,29910.0,0,0.0,0.0,Jan,0


In [8]:
# Fit on the complete training set (X, y) with default hyper-parameters.
# To monitor a hold-out set instead, fit on (X_train, y_train) and pass
# eval_set=(X_validation, y_validation).
model = CatBoostRegressor()
model.fit(X, y, cat_features=categorical_features_indices)

<catboost.core.CatBoostRegressor at 0x1a10120be0>

In [9]:
# Reference submission; `nic` / `nic_pred` are not used by the cells shown
# here and are kept in case other cells compare against them.
nic = pd.read_csv('nic_submission.csv')
nic_pred = nic.Sales

# np.asarray guarantees an ndarray so the boolean-mask assignment below works.
pred_test = np.asarray(model.predict(X_test))

# Rows with no customers are forced to zero sales. A single boolean-mask
# assignment replaces the former row-by-row `.iloc` loop.
no_customers = (df_test_store['Customers'] == 0).values
pred_test[no_customers] = 0

In [89]:
# Build the submission table: one row per prediction, with a 1-based Id
# taken from the row position, written as columns Id, Sales.
cols = ['Id','Sales']
submission = pd.DataFrame({'Sales': pred_test})
# submission.Sales = submission.Sales.astype(int)
submission.insert(0, 'Id', submission.index + 1)
submission = submission[cols]
submission.to_csv('submission.csv', index=False)

In [5]:
# (df_train_store['CompetitionDistance'] == -999).sum()

# df_train.isnull().sum(axis=0)
# X.dtypes

# (df_train['Sales'] == 0).sum()