# Set environment and load data

In [1]:
import os
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.utils.testing import all_estimators
from sklearn import base

### Allow multi-line results
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### See all dataframe output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)



In [2]:
# Dynamically load data from filenames
folder = "/Users/Karen/OneDrive/GitHub/Kaggle_PredictFutureSales/"
datafolder = "competitive-data-science-predict-future-sales/"
for dirname, _, filenames in os.walk(folder + datafolder):
    for filename in filenames:
        #file = os.path.join(dirname, filename)
        globals()[filename.split('.')[0]] = pd.read_csv(folder + datafolder + filename)
        print(filename)
        

sales_train.csv
shops.csv
test.csv
item_categories.csv
items.csv
sample_submission.csv


# Inspect loaded data

In [3]:
print(sales_train.head(5))
len(sales_train)

         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0


2935849

In [4]:
print(test.head(5))

   ID  shop_id  item_id
0   0        5     5037
1   1        5     5320
2   2        5     5233
3   3        5     5232
4   4        5     5268


In [5]:
print(shops.head(5))

                        shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран        0
1   !Якутск ТЦ "Центральный" фран        1
2                Адыгея ТЦ "Мега"        2
3  Балашиха ТРК "Октябрь-Киномир"        3
4        Волжский ТЦ "Волга Молл"        4


In [6]:
print(items.head(5))

                                           item_name  item_id  \
0          ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0   
1  !ABBYY FineReader 12 Professional Edition Full...        1   
2      ***В ЛУЧАХ СЛАВЫ   (UNV)                    D        2   
3    ***ГОЛУБАЯ ВОЛНА  (Univ)                      D        3   
4        ***КОРОБКА (СТЕКЛО)                       D        4   

   item_category_id  
0                40  
1                76  
2                40  
3                40  
4                40  


In [7]:
print(item_categories.head(5))

        item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0
1         Аксессуары - PS2                 1
2         Аксессуары - PS3                 2
3         Аксессуары - PS4                 3
4         Аксессуары - PSP                 4


In [8]:
type(sales_train['date'][0])
### the date column contains value as str format, we assume formatting is consistent dd.mm.yyyy
print(len(set(sales_train['date'])))
# print(set(sales_train['date']))
### 1034 unique dates

### extract the month + year
sales_train['month'] = sales_train.apply(lambda x: x['date'].split('.',1)[1],axis=1)
print(set(sales_train['month']))


str

1034
{'02.2013', '11.2014', '06.2015', '10.2015', '01.2013', '09.2015', '06.2014', '01.2014', '08.2013', '07.2014', '04.2015', '03.2015', '12.2013', '05.2015', '07.2015', '02.2015', '10.2013', '11.2013', '08.2015', '06.2013', '07.2013', '04.2014', '03.2013', '02.2014', '08.2014', '05.2013', '03.2014', '01.2015', '05.2014', '12.2014', '09.2014', '09.2013', '04.2013', '10.2014'}


In [9]:
### Calculate monthly items sold and prepare for training
### however doesn't account for trends over time.
sales_train_t = sales_train\
                .groupby(['shop_id','item_id','month'])\
                ['item_cnt_day'].sum()\
                .reset_index()\
                .drop(columns={'month'})\
                .rename(columns={'item_cnt_day':'item_cnt_month'})
sales_train_t

Unnamed: 0,shop_id,item_id,item_cnt_month
0,0,30,31.0
1,0,31,11.0
2,0,32,6.0
3,0,32,10.0
4,0,33,3.0
...,...,...,...
1609119,59,22164,2.0
1609120,59,22164,1.0
1609121,59,22167,1.0
1609122,59,22167,1.0


In [10]:
### Pick regressor models
estimators = all_estimators()
dict_classifiers = {}

shortlist = ['ARDRegression','BaggingRegressor','DecisionTreeRegressor','ExtraTreesRegressor', \
             'GradientBoostingRegressor','KNeighborsRegressor','LassoLars','LinearRegression', \
             'Linear SVR', 'PoissonRegressor','RandomForestRegressor']

for name, class_ in estimators:
    ### use one: ClassifierMixin, ClusterMixin, RegressorMixin, TransformerMixin
    if issubclass(class_, base.RegressorMixin):
        if name in shortlist:
            ### excluding any models which require additional parameters
            try:
                dict_classifiers[name] = class_()
            except:
                pass
        
len(dict_classifiers.items())
pprint.pprint(dict_classifiers)

10

{'ARDRegression': ARDRegression(),
 'BaggingRegressor': BaggingRegressor(),
 'DecisionTreeRegressor': DecisionTreeRegressor(),
 'ExtraTreesRegressor': ExtraTreesRegressor(),
 'GradientBoostingRegressor': GradientBoostingRegressor(),
 'KNeighborsRegressor': KNeighborsRegressor(),
 'LassoLars': LassoLars(),
 'LinearRegression': LinearRegression(),
 'PoissonRegressor': PoissonRegressor(),
 'RandomForestRegressor': RandomForestRegressor()}


In [11]:
### Split train-test data
x = sales_train_t.copy().drop(columns={'item_cnt_month'})
y = sales_train_t['item_cnt_month']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
xy_train = list(zip(x_train, y_train))
xy_test = list(zip(x_test, y_test))

In [12]:
### Cycle train through all models
results = ([['base',0]])

for model, model_inst in dict_classifiers.items():
    try: 
        model_inst.fit(x_train, y_train)
        pred = np.array(model_inst.predict(x_test))
        score = metrics.mean_squared_error(y_test, pred)
        print(model, "mse: ",score)
        results.append([model,score])
    except:
        pass

ARDRegression()

ARDRegression mse:  75.01691990204851


BaggingRegressor()

BaggingRegressor mse:  47.447829125707756


DecisionTreeRegressor()

DecisionTreeRegressor mse:  48.9503345654851


ExtraTreesRegressor()

ExtraTreesRegressor mse:  49.06782476713751


GradientBoostingRegressor()

GradientBoostingRegressor mse:  57.18867748194295


KNeighborsRegressor()

KNeighborsRegressor mse:  50.74736771931279


LassoLars()

LassoLars mse:  75.01691990204851


LinearRegression()

LinearRegression mse:  75.00731294913798


RandomForestRegressor()

RandomForestRegressor mse:  47.48655369834793


In [13]:
### Results of all trained models
### For MSE: a lower MSE is desired
print(pd.DataFrame(results, columns={'model','score'}).sort_values(by='score', ascending=True))

                       model      score
0                       base   0.000000
2           BaggingRegressor  47.447829
9      RandomForestRegressor  47.486554
3      DecisionTreeRegressor  48.950335
4        ExtraTreesRegressor  49.067825
6        KNeighborsRegressor  50.747368
5  GradientBoostingRegressor  57.188677
8           LinearRegression  75.007313
1              ARDRegression  75.016920
7                  LassoLars  75.016920


In [20]:
for name, class_ in estimators:
    if name == 'BaggingRegressor':
        print(class_)

<class 'sklearn.ensemble._bagging.BaggingRegressor'>


In [14]:
### Prepare test set + final model
test_t = test.drop(columns={"ID"})

final_clf = sklearn.ensemble._forest.RandomForestRegressor()
final_clf.fit(x, y)
pred = np.array(final_clf.predict(test_t))
finalresults = pd.DataFrame()
finalresults['item_cnt_month'] = list(pred)
finalresults = finalresults.join(test[['ID']])
print(finalresults)

RandomForestRegressor()

        item_cnt_month      ID
0             1.408175       0
1             5.574324       1
2             2.019944       2
3             1.070000       3
4             1.344629       4
...                ...     ...
214195        1.839055  214195
214196        1.242448  214196
214197        1.159384  214197
214198        1.000000  214198
214199        1.655944  214199

[214200 rows x 2 columns]


In [15]:
### Export predictions to .csv
finalresults.to_csv(folder + 'results_RandomForest_NoProcessing.csv',index=False)

In [16]:
%%script false --no-raise-error
from pycaret.regression import *
exp_reg101 = setup(data = sales_train_t, target = 'item_cnt_day', session_id=1)