In [75]:
import pandas as pd
from catboost import Pool, CatBoostRegressor, CatboostIpythonWidget
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [76]:
def rmspe(y,y_pred):
    summ = 0
    for i in range(len(y)):
        if y.iloc[i] != 0:
            summ += (1-y_pred[i]/y.iloc[i])**2
    return sqrt(summ/len(y))

In [77]:
df_train = pd.read_csv('data/train_v2.csv')
df_train['Date']  = pd.to_datetime(df_train['Date'], errors='coerce')
df_train['Year'] = df_train['Date'].dt.year
df_train['Month'] = df_train['Date'].dt.month
df_train['WeekOfYear'] = df_train['Date'].dt.dayofweek
df_train['Day'] = df_train['Date'].dt.day
df_train = df_train.drop('Date', axis=1)

# df_train = df_train[df_train["Open"] != 0]
# df_train = df_train[df_train["Sales"] > 0]

df_test = pd.read_csv('data/test_v2.csv')
df_test['Date']  = pd.to_datetime(df_test['Date'], errors='coerce')
df_test['Year'] = df_test['Date'].dt.year
df_test['Month'] = df_test['Date'].dt.month
df_test['WeekOfYear'] = df_test['Date'].dt.dayofweek
df_test['Day'] = df_test['Date'].dt.day
df_test = df_test.drop('Date', axis=1)

df_store = pd.read_csv('data/store.csv')
df_store['CompetitionDistance'].fillna(-1,inplace=True)
df_store['CompetitionOpenSinceMonth'].fillna(-1,inplace=True)
df_store['CompetitionOpenSinceYear'].fillna(-1,inplace=True)
df_store['Promo2SinceWeek'].fillna(0,inplace=True)
df_store['Promo2SinceYear'].fillna(0,inplace=True)
df_store['PromoInterval'].fillna(' ',inplace=True)

df_train_store = df_train.join(df_store.set_index('Store'), on='Store')
df_train_store['CompetitionOpen'] = 12 * (df_train_store.Year - df_train_store.CompetitionOpenSinceYear) + \
        (df_train_store.Month - df_train_store.CompetitionOpenSinceMonth)
df_train_store['CompetitionOpen'] = df_train_store.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
df_train_store.loc[df_train_store.CompetitionOpenSinceYear < 0, 'CompetitionOpen'] = 0

df_train_store['PromoOpen'] = 12 * (df_train_store.Year - df_train_store.Promo2SinceYear) + \
        (df_train_store.WeekOfYear - df_train_store.Promo2SinceWeek) / 4.0
    
df_train_store['PromoOpen'] = df_train_store.PromoOpen.apply(lambda x: x if x > 0 else 0)
df_train_store.loc[df_train_store.Promo2SinceYear == 0, 'PromoOpen'] = 0

df_train_store.drop(['CompetitionOpenSinceYear',
                     'CompetitionOpenSinceMonth',
                     'Promo2SinceYear',
                     'Promo2SinceWeek'], axis=1, inplace=True)
df_train_store.drop(['Promo2','PromoInterval'], axis=1, inplace=True)

df_train_store.drop(['Year','Month'], axis=1, inplace=True)

mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
df_train_store.StateHoliday.replace(mappings, inplace=True)
df_train_store.StoreType.replace(mappings, inplace=True)
df_train_store.Assortment.replace(mappings, inplace=True)

df_test_store = df_test.join(df_store.set_index('Store'), on='Store')
df_test_store['CompetitionOpen'] = 12 * (df_test_store.Year - df_test_store.CompetitionOpenSinceYear) + \
        (df_test_store.Month - df_test_store.CompetitionOpenSinceMonth)
df_test_store['CompetitionOpen'] = df_test_store.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
df_test_store.loc[df_test_store.CompetitionOpenSinceYear < 0, 'CompetitionOpen'] = 0

df_test_store['PromoOpen'] = 12 * (df_test_store.Year - df_test_store.Promo2SinceYear) + \
        (df_test_store.WeekOfYear - df_test_store.Promo2SinceWeek) / 4.0
    
df_test_store['PromoOpen'] = df_test_store.PromoOpen.apply(lambda x: x if x > 0 else 0)
df_test_store.loc[df_test_store.Promo2SinceYear == 0, 'PromoOpen'] = 0

df_test_store.drop(['CompetitionOpenSinceYear',
                     'CompetitionOpenSinceMonth',
                     'Promo2SinceYear',
                     'Promo2SinceWeek'], axis=1, inplace=True)
df_test_store.drop(['Promo2','PromoInterval'], axis=1, inplace=True)

df_test_store.drop(['Year','Month'], axis=1, inplace=True)

# df_train_store.drop(['CompetitionDistance', 'CompetitionOpenSinceMonth',
#        'CompetitionOpenSinceYear', 'Promo2SinceWeek',
#        'Promo2SinceYear', 'PromoInterval'], axis=1, inplace=True)
# df_test_store.drop(['CompetitionDistance', 'CompetitionOpenSinceMonth',
#        'CompetitionOpenSinceYear', 'Promo2SinceWeek',
#        'Promo2SinceYear', 'PromoInterval'], axis=1, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
df_train_store.head()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,WeekOfYear,Day,StoreType,Assortment,CompetitionDistance,CompetitionOpen,PromoOpen
0,1,5,5577,616,1,1,0,0,4,30,3,1,1270.0,76.0,0.0
1,2,5,5919,624,1,1,0,0,4,30,1,1,570.0,86.0,57.75
2,3,5,6911,678,1,1,0,0,4,30,1,1,14130.0,97.0,45.5
3,4,5,13307,1632,1,1,0,0,4,30,3,3,620.0,64.0,0.0
4,5,5,5640,617,1,1,0,0,4,30,1,1,29910.0,0.0,0.0


In [8]:
print (df_train_store.Promo2.unique())
print (df_train_store.PromoInterval.unique())
# df_train.head()

[0 1]
[' ' 'Jan,Apr,Jul,Oct' 'Feb,May,Aug,Nov' 'Mar,Jun,Sept,Dec']


In [7]:
print (df_train_store.StateHoliday.unique())
print (df_train_store.SchoolHoliday.unique())

['0' 'a' 'c' 0]
[0 1]


In [None]:
# # Indicate that sales on that day are in promo interval
#     features.append('IsPromoMonth')
#     month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
#              7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
#     data['monthStr'] = data.Month.map(month2str)
#     data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
#     data['IsPromoMonth'] = 0
#     for interval in data.PromoInterval.unique():
#         if interval != '':
#             for month in interval.split(','):
#                 data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

#     return data

In [78]:
X = df_train_store.drop('Sales', axis=1)
y = df_train_store.Sales

X_test = df_test_store

categorical_features_indices = [0,1,3,4,5,6,7,8,9,10]

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.85, random_state=1234)



In [79]:
X.head()

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,WeekOfYear,Day,StoreType,Assortment,CompetitionDistance,CompetitionOpen,PromoOpen
0,1,5,616,1,1,0,0,4,30,3,1,1270.0,76.0,0.0
1,2,5,624,1,1,0,0,4,30,1,1,570.0,86.0,57.75
2,3,5,678,1,1,0,0,4,30,1,1,14130.0,97.0,45.5
3,4,5,1632,1,1,0,0,4,30,3,3,620.0,64.0,0.0
4,5,5,617,1,1,0,0,4,30,1,1,29910.0,0.0,0.0


In [86]:
iters=50
model = CatBoostRegressor(iterations=iters)
model.fit(
#     X_train, y_train,
    X, y,
    cat_features=categorical_features_indices,
#     eval_set=(X_validation, y_validation),
)
print("----------//----------")
pred = model.predict(X_validation)
accuracy = rmspe(y_validation, pred)
print("RMSPE is                ",accuracy)

----------//----------
RMSPE is                 0.6518480151980175


If you want to save predictions as .csv

In [87]:
pred_test = model.predict(X_test)

submission = pd.DataFrame()
submission['Sales'] = pred_test
# submission.Sales = submission.Sales.astype(int)
cols = ['Id','Sales']
submission['Id'] = submission.index + 1
submission = submission[cols]
submission.to_csv('submission.csv', index=False)

In [5]:
# (df_train_store['CompetitionDistance'] == -999).sum()

# df_train.isnull().sum(axis=0)
# X.dtypes

# (df_train['Sales'] == 0).sum()