In [1]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('../data/train.csv')
store_df = pd.read_csv('../data/store.csv')
test_df = pd.read_csv('../data/test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 6)

In [3]:
train_df = train_df.merge(store_df, on='Store', how='left')
test_df = test_df.merge(store_df, on='Store', how='left')


### Data Clean

In [4]:
def data_clean(df):
    df['Assortment'] = df['Assortment'].astype('str')
    df['StoreType'] = df['StoreType'].astype('str')
    df['StateHoliday'] = df['StateHoliday'].astype('str')

    df['CompetitionDistance'] = df.CompetitionDistance.fillna(0)

    df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32)
    df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32)
    df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32)
    df['Promo2SinceWeek'] = df.Promo2SinceWeek.fillna(1).astype(np.int32)
    
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df["DayOfMonth"] = df["Date"].dt.day
    df["WeekOfYear"] = df["Date"].dt.weekofyear
    df["DayOfYear"] = df["Date"].dt.dayofyear
    
    df.loc[df['Open'].isnull(),'Open'] = 1

    
    return df

train_df = data_clean(train_df)
test_df = data_clean(test_df)

### Sales Features Extraction

In [5]:
def extract_sales_feature_basic(train_df):

    sales_feature_df = train_df.loc[train_df['Sales'] > 0]

    total_sales_store = sales_feature_df.groupby([sales_feature_df['Store']])['Sales'].sum()
    total_open_store = sales_feature_df.groupby([sales_feature_df['Store']])['Open'].count()

    total_customers_store = sales_feature_df.groupby([sales_feature_df['Store']])['Customers'].sum()
    median_customers_store = sales_feature_df.groupby([sales_feature_df['Store']])['Customers'].median()

    avg_sales_store = total_sales_store / total_open_store
    avg_customers_store = total_customers_store/ total_open_store
    avg_sales_customer_store = avg_sales_store / avg_customers_store

    sales_feature_df = pd.merge(sales_feature_df, avg_sales_store.reset_index(name='avg_sales_store'), how='left', on=['Store'])
    sales_feature_df = pd.merge(sales_feature_df, avg_customers_store.reset_index(name='avg_customers_store'), how='left', on=['Store'])
    sales_feature_df = pd.merge(sales_feature_df, avg_sales_customer_store.reset_index(name='avg_sales_customer_store'), how='left', on=['Store'])
    sales_feature_df = pd.merge(sales_feature_df, median_customers_store.reset_index(name='median_customers_store'), how='left', on=['Store'])

    return sales_feature_df

sales_feature_basic = extract_sales_feature_basic(train_df)
sales_feature_basic.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9,2008,0,1,1900,,2015,7,31,31,212,4759.096031,564.049936,8.437366,550.0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2015,7,31,31,212,4953.90051,583.998724,8.482725,575.5
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,7,31,31,212,6942.568678,750.077022,9.255808,744.0
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9,2009,0,1,1900,,2015,7,31,31,212,9638.401786,1321.752551,7.292138,1301.5
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4,2015,0,1,1900,,2015,7,31,31,212,4676.274711,537.34018,8.702634,564.0


In [6]:
def extract_sales_feature_dow(sales_feature_df):

    avg_sales_store_dow = sales_feature_df.groupby(['Store', 'DayOfWeek'])['Sales'].mean()
    median_sales_store_dow = sales_feature_df.groupby(['Store', 'DayOfWeek'])['Sales'].median()
    avg_customers_store_dow = sales_feature_df.groupby(['Store', 'DayOfWeek'])['Customers'].mean()
    median_customers_store_dow = sales_feature_df.groupby(['Store', 'DayOfWeek'])['Customers'].median()
    
    sales_feature_df = pd.merge(sales_feature_df, avg_sales_store_dow.reset_index(name='avg_sales_store_dow'), how='left', on=['Store', 'DayOfWeek'])
    sales_feature_df = pd.merge(sales_feature_df, median_sales_store_dow.reset_index(name='median_sales_store_dow'), how='left', on=['Store', 'DayOfWeek'])
    sales_feature_df = pd.merge(sales_feature_df, avg_customers_store_dow.reset_index(name='avg_customers_store_dow'), how='left', on=['Store', 'DayOfWeek'])
    sales_feature_df = pd.merge(sales_feature_df, median_customers_store_dow.reset_index(name='median_customers_store_dow'), how='left', on=['Store', 'DayOfWeek'])
    
    return sales_feature_df

sales_feature = extract_sales_feature_dow(sales_feature_basic)
sales_feature.head()


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store,avg_sales_store_dow,median_sales_store_dow,avg_customers_store_dow,median_customers_store_dow
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9,2008,0,1,1900,,2015,7,31,31,212,4759.096031,564.049936,8.437366,550.0,4726.48062,4651.0,562.248062,552.0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2015,7,31,31,212,4953.90051,583.998724,8.482725,575.5,4669.589147,4671.0,562.457364,545.0
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,7,31,31,212,6942.568678,750.077022,9.255808,744.0,7193.203125,7111.0,787.335938,768.0
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9,2009,0,1,1900,,2015,7,31,31,212,9638.401786,1321.752551,7.292138,1301.5,9470.410853,9353.0,1302.565891,1286.0
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4,2015,0,1,1900,,2015,7,31,31,212,4676.274711,537.34018,8.702634,564.0,4858.054688,4792.0,569.125,563.0


In [7]:
sales_feature.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store,avg_sales_store_dow,median_sales_store_dow,avg_customers_store_dow,median_customers_store_dow
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9,2008,0,1,1900,,2015,7,31,31,212,4759.096031,564.049936,8.437366,550.0,4726.48062,4651.0,562.248062,552.0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2015,7,31,31,212,4953.90051,583.998724,8.482725,575.5,4669.589147,4671.0,562.457364,545.0
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,7,31,31,212,6942.568678,750.077022,9.255808,744.0,7193.203125,7111.0,787.335938,768.0
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9,2009,0,1,1900,,2015,7,31,31,212,9638.401786,1321.752551,7.292138,1301.5,9470.410853,9353.0,1302.565891,1286.0
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4,2015,0,1,1900,,2015,7,31,31,212,4676.274711,537.34018,8.702634,564.0,4858.054688,4792.0,569.125,563.0


In [8]:
# Merge event features
event_feature_df = pd.read_csv('../event_feature.csv')


In [9]:
event_feature_df.head()

Unnamed: 0,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Customers,Date,DayOfWeek,Open,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Sales,SchoolHoliday,StateHoliday,Store,StoreType,data_type,Year,Month,CompetitionOpenDate,DaysAfterCompetitionOpen,MonthAfterCompetitionOpen,Promo2Since,DaysAfterPromo2,WeekAfterPromo2,isStateHoliday,isSchoolHoliday,isPromo,daysAfterStateHoliday,daysAfterSchoolHoliday,daysAfterPromo,daysBeforeStateHoliday,daysBeforeSchoolHoliday,daysBeforePromo,StateHolidayDay,last_7d_state_holiday,last_7d_school_holiday,last_7d_promo,coming_7d_state_holiday,coming_7d_school_holiday,coming_7d_promo
0,a,1270.0,9,2008,,2015-09-17,4,1.0,1,0,1,1900,,,0,0,1,c,3,2015,9,2008-09-15,2558,24,1900-01-01,0,0,False,False,True,105.0,13.0,0.0,-1.0,-1.0,0.0,0,0.0,0.0,4.0,0.0,0.0,1.0
1,a,1270.0,9,2008,,2015-09-16,3,1.0,1,0,1,1900,,,0,0,1,c,3,2015,9,2008-09-15,2557,24,1900-01-01,0,0,False,False,True,104.0,12.0,0.0,-1.0,-1.0,0.0,0,0.0,0.0,3.0,0.0,0.0,2.0
2,a,1270.0,9,2008,,2015-09-15,2,1.0,1,0,1,1900,,,0,0,1,c,3,2015,9,2008-09-15,2556,24,1900-01-01,0,0,False,False,True,103.0,11.0,0.0,-1.0,-1.0,0.0,0,0.0,0.0,2.0,0.0,0.0,3.0
3,a,1270.0,9,2008,,2015-09-14,1,1.0,1,0,1,1900,,,0,0,1,c,3,2015,9,2008-09-15,2555,24,1900-01-01,0,0,False,False,True,102.0,10.0,0.0,-1.0,-1.0,0.0,0,0.0,0.0,1.0,0.0,0.0,4.0
4,a,1270.0,9,2008,,2015-09-13,7,0.0,0,0,1,1900,,,0,0,1,c,3,2015,9,2008-09-15,2554,24,1900-01-01,0,0,False,False,False,101.0,9.0,9.0,-1.0,-1.0,1.0,0,0.0,0.0,0.0,0.0,0.0,4.0


In [10]:
event_features = ['DaysAfterCompetitionOpen', 'MonthAfterCompetitionOpen', 'DaysAfterPromo2', 'WeekAfterPromo2', 'daysAfterStateHoliday', 'daysAfterSchoolHoliday', 'daysAfterPromo', 'last_7d_state_holiday', 'last_7d_school_holiday', 'last_7d_promo']

event_feature_df['Date'] = pd.to_datetime(event_feature_df['Date'])

train_features = sales_feature.merge(event_feature_df[['Store', 'Date'] + event_features], how='inner', on =['Store', 'Date'])

train_features.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store,avg_sales_store_dow,median_sales_store_dow,avg_customers_store_dow,median_customers_store_dow,DaysAfterCompetitionOpen,MonthAfterCompetitionOpen,DaysAfterPromo2,WeekAfterPromo2,daysAfterStateHoliday,daysAfterSchoolHoliday,daysAfterPromo,last_7d_state_holiday,last_7d_school_holiday,last_7d_promo
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9,2008,0,1,1900,,2015,7,31,31,212,4759.096031,564.049936,8.437366,550.0,4726.48062,4651.0,562.248062,552.0,2510,24,0,0,57.0,0.0,0.0,0.0,5.0,5.0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2015,7,31,31,212,4953.90051,583.998724,8.482725,575.5,4669.589147,4671.0,562.457364,545.0,2815,24,1950,25,67.0,0.0,0.0,0.0,5.0,5.0
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,7,31,31,212,6942.568678,750.077022,9.255808,744.0,7193.203125,7111.0,787.335938,768.0,3150,24,1579,25,57.0,0.0,0.0,0.0,5.0,5.0
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9,2009,0,1,1900,,2015,7,31,31,212,9638.401786,1321.752551,7.292138,1301.5,9470.410853,9353.0,1302.565891,1286.0,2145,24,0,0,67.0,0.0,0.0,0.0,5.0,5.0
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4,2015,0,1,1900,,2015,7,31,31,212,4676.274711,537.34018,8.702634,564.0,4858.054688,4792.0,569.125,563.0,107,3,0,0,57.0,0.0,0.0,0.0,5.0,5.0


In [11]:
import calendar

month_dict = dict((k,v) for k,v in enumerate(calendar.month_abbr))

def isPromoMonth(x):
    if month_dict[x[0]] in x[1].split(',') and x[2] > 0:
        return 1
    return 0

def feature_isPromoMonth(df):
    df['PromoInterval'] = df['PromoInterval'].fillna('')
    df['isPromoMonth'] = df[['Month', 'PromoInterval', 'DaysAfterPromo2']].apply(isPromoMonth, axis=1)

feature_isPromoMonth(train_features)

train_features.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store,avg_sales_store_dow,median_sales_store_dow,avg_customers_store_dow,median_customers_store_dow,DaysAfterCompetitionOpen,MonthAfterCompetitionOpen,DaysAfterPromo2,WeekAfterPromo2,daysAfterStateHoliday,daysAfterSchoolHoliday,daysAfterPromo,last_7d_state_holiday,last_7d_school_holiday,last_7d_promo,isPromoMonth
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9,2008,0,1,1900,,2015,7,31,31,212,4759.096031,564.049936,8.437366,550.0,4726.48062,4651.0,562.248062,552.0,2510,24,0,0,57.0,0.0,0.0,0.0,5.0,5.0,0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",2015,7,31,31,212,4953.90051,583.998724,8.482725,575.5,4669.589147,4671.0,562.457364,545.0,2815,24,1950,25,67.0,0.0,0.0,0.0,5.0,5.0,1
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,7,31,31,212,6942.568678,750.077022,9.255808,744.0,7193.203125,7111.0,787.335938,768.0,3150,24,1579,25,57.0,0.0,0.0,0.0,5.0,5.0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9,2009,0,1,1900,,2015,7,31,31,212,9638.401786,1321.752551,7.292138,1301.5,9470.410853,9353.0,1302.565891,1286.0,2145,24,0,0,67.0,0.0,0.0,0.0,5.0,5.0,0
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4,2015,0,1,1900,,2015,7,31,31,212,4676.274711,537.34018,8.702634,564.0,4858.054688,4792.0,569.125,563.0,107,3,0,0,57.0,0.0,0.0,0.0,5.0,5.0,0


In [12]:
test_feature = test_df.merge(train_features[['Store', 'DayOfWeek','avg_sales_store', 'avg_customers_store', 'avg_sales_customer_store', \
                        'median_customers_store', 'avg_sales_store_dow', 'median_sales_store_dow', \
                        'avg_customers_store_dow', 'median_customers_store_dow']], how='left', on =['Store', 'DayOfWeek'])

test_feature.drop_duplicates(['Store', 'Date'], 'first', inplace=True)

test_feature.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store,avg_sales_store_dow,median_sales_store_dow,avg_customers_store_dow,median_customers_store_dow
0,1,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9,2008,0,1,1900,,2015,9,17,38,260,4759.096031,564.049936,8.437366,550.0,4457.83871,4380.0,537.177419,525.0
124,2,3,4,2015-09-17,1.0,1,0,0,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,9,17,38,260,6942.568678,750.077022,9.255808,744.0,6936.435484,6940.0,771.137097,763.0
248,3,7,4,2015-09-17,1.0,1,0,0,a,c,24000.0,4,2013,0,1,1900,,2015,9,17,38,260,8817.050891,948.561069,9.295185,938.5,8508.023438,8195.0,934.71875,906.5
376,4,8,4,2015-09-17,1.0,1,0,0,a,a,7520.0,10,2014,0,1,1900,,2015,9,17,38,260,5539.358418,658.197704,8.415949,672.5,6348.527559,6309.0,797.889764,793.0
503,5,9,4,2015-09-17,1.0,1,0,0,a,c,2030.0,8,2000,0,1,1900,,2015,9,17,38,260,6562.337612,579.816431,11.317957,561.0,6131.629032,5875.0,553.056452,540.0


In [13]:
test_feature = test_feature.merge(event_feature_df[['Store', 'Date'] + event_features], how='left', on =['Store', 'Date'])

test_feature.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,DayOfMonth,WeekOfYear,DayOfYear,avg_sales_store,avg_customers_store,avg_sales_customer_store,median_customers_store,avg_sales_store_dow,median_sales_store_dow,avg_customers_store_dow,median_customers_store_dow,DaysAfterCompetitionOpen,MonthAfterCompetitionOpen,DaysAfterPromo2,WeekAfterPromo2,daysAfterStateHoliday,daysAfterSchoolHoliday,daysAfterPromo,last_7d_state_holiday,last_7d_school_holiday,last_7d_promo
0,1,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9,2008,0,1,1900,,2015,9,17,38,260,4759.096031,564.049936,8.437366,550.0,4457.83871,4380.0,537.177419,525.0,2558,24,0,0,105.0,13.0,0.0,0.0,0.0,4.0
1,2,3,4,2015-09-17,1.0,1,0,0,a,a,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,9,17,38,260,6942.568678,750.077022,9.255808,744.0,6936.435484,6940.0,771.137097,763.0,3198,24,1627,25,105.0,37.0,0.0,0.0,0.0,4.0
2,3,7,4,2015-09-17,1.0,1,0,0,a,c,24000.0,4,2013,0,1,1900,,2015,9,17,38,260,8817.050891,948.561069,9.295185,938.5,8508.023438,8195.0,934.71875,906.5,885,24,0,0,115.0,20.0,0.0,0.0,0.0,4.0
3,4,8,4,2015-09-17,1.0,1,0,0,a,a,7520.0,10,2014,0,1,1900,,2015,9,17,38,260,5539.358418,658.197704,8.415949,672.5,6348.527559,6309.0,797.889764,793.0,337,11,0,0,115.0,20.0,0.0,0.0,0.0,4.0
4,5,9,4,2015-09-17,1.0,1,0,0,a,c,2030.0,8,2000,0,1,1900,,2015,9,17,38,260,6562.337612,579.816431,11.317957,561.0,6131.629032,5875.0,553.056452,540.0,5511,24,0,0,105.0,37.0,0.0,0.0,0.0,4.0


In [14]:
feature_isPromoMonth(test_feature)

In [15]:
test_feature['StateHoliday'] = test_feature['StateHoliday'].astype('category').cat.codes
test_feature['StoreType'] = test_feature['StoreType'].astype('category').cat.codes
test_feature['Assortment'] = test_feature['Assortment'].astype('category').cat.codes

In [11]:
train_features.to_csv('train_features.csv', index=False)


In [19]:
def save_features(features, i):
    fn = 'features_' + str(len(features)) + '{0:0=3d}'.format(i) + '.txt'
    with open('output/features/{}'.format(fn), 'w') as outfile:
        outfile.write(str(features))
        
train_features['StateHoliday'] = train_features['StateHoliday'].astype('category').cat.codes
train_features['StoreType'] = train_features['StoreType'].astype('category').cat.codes
train_features['Assortment'] = train_features['Assortment'].astype('category').cat.codes
train_features['SalesLog'] = np.log1p(train_features['Sales'])

train_val_split_date = datetime.strptime('2015-06-18', '%Y-%m-%d')

val_set = train_features.loc[train_features['Date'] >= train_val_split_date]
train_set = train_features.loc[train_features['Date'] < train_val_split_date]


In [17]:
import logging

logger = logging.getLogger("RossmanTrain")
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(message)s')
fh = logging.FileHandler('model_optimizer.log')
fh.setFormatter(formatter)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

def log_evaluation(period=1, show_stdv=True):
    """Create a callback that logs evaluation result with logger.

    Parameters
    ----------
    period : int
        The period to log the evaluation results

    show_stdv : bool, optional
         Whether show stdv if provided

    Returns
    -------
    callback : function
        A callback that logs evaluation every period iterations into logger.
    """

    def _fmt_metric(value, show_stdv=True):
        """format metric string"""
        if len(value) == 2:
            return '%s:%g' % (value[0], value[1])
        elif len(value) == 3:
            if show_stdv:
                return '%s:%g+%g' % (value[0], value[1], value[2])
            else:
                return '%s:%g' % (value[0], value[1])
        else:
            raise ValueError("wrong metric value")

    def callback(env):
        if env.rank != 0 or len(env.evaluation_result_list) == 0 or period is False:
            return
        i = env.iteration
        if i % period == 0 or i + 1 == env.begin_iteration or i + 1 == env.end_iteration:
            msg = '\t'.join([_fmt_metric(x, show_stdv) for x in env.evaluation_result_list])
            logger.info('[%d]\t%s\n' % (i, msg))

    return callback

param = {'max_depth':10,
         'eta':0.03,
         'subsample':0.9,
         'colsample_bytree':0.7, 
         'verbosity':1,
         'objective':'reg:linear',
         'seed': 41,
         'nthread':6}

num_round = 1000

# the number of feature selections performed 
num_of_models = 100
basic_features = ['Store', 'DayOfWeek', 'CompetitionDistance']

sales_features = ['avg_sales_store', 'avg_customers_store', 'avg_sales_customer_store', \
                        'median_customers_store', 'avg_sales_store_dow', 'median_sales_store_dow', \
                        'avg_customers_store_dow', 'median_customers_store_dow', 'StoreType', 'Assortment', 'CompetitionOpenSinceYear']

event_features = ['Year','Month','DayOfMonth','WeekOfYear','DayOfYear','DaysAfterCompetitionOpen', \
                         'MonthAfterCompetitionOpen', 'DaysAfterPromo2', 'WeekAfterPromo2', 'daysAfterStateHoliday', \
                         'daysAfterSchoolHoliday', 'daysAfterPromo', 'last_7d_state_holiday', 'last_7d_school_holiday', \
                         'last_7d_promo', 'isPromoMonth']


def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe


# a dict to save 100 models' infos
model_dicts = dict()

import random
import xgboost as xgb

for i in range(num_of_models):
    model_info = {}

    sales_feature_num = random.randint(3, 6)
    sales_features_used = random.sample(sales_features, sales_feature_num)
    
    event_feature_num = random.randint(5, 12)
    event_features_used = random.sample(event_features, event_feature_num)
    
    features_used = basic_features + sales_features_used + event_features_used
    model_info['features_used'] = features_used
    save_features(features_used, i)

    logger.info("train No.{} xgboost model".format(i))
    X_train = train_set[features_used]
    y_train = train_set['SalesLog']

    X_val = val_set[features_used]
    y_val = val_set['SalesLog']

    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_val, y_val)

    evallist = [(dtrain, 'train'), (dtest, 'test')]

    plst = param.items()
    callbacks = [log_evaluation(25, True)]
    bst = xgb.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=25, early_stopping_rounds=100, callbacks=callbacks)

    model_name = str(len(features_used)) + '{0:0=3d}'.format(i)

    val_yhat = bst.predict(xgb.DMatrix(X_val))

    error = rmspe(np.expm1(val_yhat), val_set.Sales.values)
    model_info['valid_error'] = error

    model_dicts[model_name] = model_info

    logger.info('Make predictions on the test set, model: {}'.format(model_name))
    dtest = xgb.DMatrix(test_feature[features_used])
    test_predict = bst.predict(dtest)

    result = pd.DataFrame({'Id': test_feature['Id'], 'Sales': np.expm1(test_predict)})
    result.to_csv('output/test_prediction/{}.csv'.format('test_'+model_name), index=False)

models_df = pd.DataFrame(model_dicts).T 
models_df.to_csv('output/model_infos.csv')


In [31]:
# Tuning xgboost parameters
import random
import xgboost as xgb

for model_index in [17093]:
    features_used = []
    with open('output/features/{}.txt'.format('features_' + str(model_index)), 'r') as ft:
        fts = ft.readlines()[0].split("'")
        fts = [f for f in fts if len(f) > 2]
        features_used += fts

    features_used = list(set(features_used))
    
    num_round = 5000

    model_dicts = {}
    model_num = 0
    
    for max_depth in range(10,13):
        for subsample in range(7,10):
            for colsample_bytree in range(5,8):

                params = {'objective': 'reg:linear',
                           'eta': 0.03,
                           'verbosity':1,
                           'seed': 41,
                           'nthread':6
                          }


                params['colsample_bytree'] = colsample_bytree * 0.1
                params['subsample'] = subsample * 0.1
                params['max_depth'] = max_depth

                print("truning xgboost model:{} hyper-parameters.".format(model_num))
                X_train = train_set[features_used]
                y_train = train_set['SalesLog']

                X_val = val_set[features_used]
                y_val = val_set['SalesLog']
                dtrain = xgb.DMatrix(X_train, y_train)
                dtest = xgb.DMatrix(X_val, y_val)

                evallist = [(dtrain, 'train'), (dtest, 'test')]
                plst = params.items()
                callbacks = [log_evaluation(25, True)]
                bst = xgb.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=100, early_stopping_rounds=100, callbacks=callbacks)

                val_yhat = bst.predict(xgb.DMatrix(X_val))
                val_set['PredictSales'] = np.expm1(val_yhat)
                val_set[['Store', 'Date', 'PredictSales', 'Sales']].to_csv('output/val_prediction/val_{}.csv'.format(model_name), index=False)

                error = rmspe(np.expm1(val_yhat), val_set.Sales.values)
                params['valid_error'] = error

                model_name = 'model' + str(model_index) + '_p' +str(model_num)
                model_dicts[model_name] = params

                logger.info('Make predictions on the test set, model: {}'.format(model_name))
                dtest = xgb.DMatrix(test_feature[features_used])
                test_predict = bst.predict(dtest)

                # output
                result = pd.DataFrame({'Id': test_feature['Id'], 'Sales': np.expm1(test_predict)})
                result.to_csv('output/test_prediction/result_{}.csv'.format(model_name), index=False)

                model_num += 1


    models_df = pd.DataFrame(model_dicts).T 
    models_df.to_csv('output/model_{}_param_tuning.csv'.format(model_index))


truning xgboost model:0 hyper-parameters.
[0]	train-rmse:8.02023	test-rmse:8.03243	train-rmspe:0.999809	test-rmspe:0.999813
Multiple eval metrics have been passed: 'test-rmspe' will be used for early stopping.

Will train until test-rmspe hasn't improved in 100 rounds.
[100]	train-rmse:0.404041	test-rmse:0.419097	train-rmspe:0.336386	test-rmspe:0.333699
[200]	train-rmse:0.10411	test-rmse:0.122806	train-rmspe:0.123489	test-rmspe:0.122318
[300]	train-rmse:0.092006	test-rmse:0.114248	train-rmspe:0.107112	test-rmspe:0.117711
[400]	train-rmse:0.085915	test-rmse:0.111597	train-rmspe:0.097242	test-rmspe:0.115412
[500]	train-rmse:0.082072	test-rmse:0.110523	train-rmspe:0.090708	test-rmspe:0.114379
[600]	train-rmse:0.079135	test-rmse:0.109714	train-rmspe:0.086199	test-rmspe:0.113648
[700]	train-rmse:0.07655	test-rmse:0.109256	train-rmspe:0.082593	test-rmspe:0.113158
[800]	train-rmse:0.074446	test-rmse:0.108887	train-rmspe:0.079805	test-rmspe:0.11284
[900]	train-rmse:0.072598	test-rmse:0.108623	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


truning xgboost model:1 hyper-parameters.
[0]	train-rmse:8.02023	test-rmse:8.03243	train-rmspe:0.999809	test-rmspe:0.999813
Multiple eval metrics have been passed: 'test-rmspe' will be used for early stopping.

Will train until test-rmspe hasn't improved in 100 rounds.
[100]	train-rmse:0.402279	test-rmse:0.412709	train-rmspe:0.332315	test-rmspe:0.329927
[200]	train-rmse:0.101824	test-rmse:0.120149	train-rmspe:0.119934	test-rmspe:0.11993
[300]	train-rmse:0.090456	test-rmse:0.112768	train-rmspe:0.104798	test-rmspe:0.116057
[400]	train-rmse:0.084634	test-rmse:0.110577	train-rmspe:0.094277	test-rmspe:0.113989
[500]	train-rmse:0.080893	test-rmse:0.109556	train-rmspe:0.088653	test-rmspe:0.11295
[600]	train-rmse:0.07786	test-rmse:0.108873	train-rmspe:0.08413	test-rmspe:0.112271
[700]	train-rmse:0.075299	test-rmse:0.10831	train-rmspe:0.080623	test-rmspe:0.111695
[800]	train-rmse:0.073185	test-rmse:0.10793	train-rmspe:0.077749	test-rmspe:0.111406
[900]	train-rmse:0.07132	test-rmse:0.107772	trai