# Set environment and load data

In [5]:
import os
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

### Allow multi-line results
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### See all dataframe output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [6]:
# Dynamically load data from filenames
folder = "/Users/Karen/OneDrive/GitHub/Kaggle_PredictFutureSales/"
datafolder = "competitive-data-science-predict-future-sales/"
for dirname, _, filenames in os.walk(datafolder):
    for filename in filenames:
        #file = os.path.join(dirname, filename)
        globals()[filename.split('.')[0]] = pd.read_csv(datafolder + filename)
        print(filename)
        

sales_train.csv
shops.csv
test.csv
item_categories.csv
items.csv
sample_submission.csv


# Inspect loaded data

In [7]:
print(sales_train.head(5))
len(sales_train)

         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0


2935849

In [8]:
print(test.head(5))

   ID  shop_id  item_id
0   0        5     5037
1   1        5     5320
2   2        5     5233
3   3        5     5232
4   4        5     5268


In [9]:
print(shops.head(5))

                        shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран        0
1   !Якутск ТЦ "Центральный" фран        1
2                Адыгея ТЦ "Мега"        2
3  Балашиха ТРК "Октябрь-Киномир"        3
4        Волжский ТЦ "Волга Молл"        4


In [10]:
print(items.head(5))

                                           item_name  item_id  \
0          ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0   
1  !ABBYY FineReader 12 Professional Edition Full...        1   
2      ***В ЛУЧАХ СЛАВЫ   (UNV)                    D        2   
3    ***ГОЛУБАЯ ВОЛНА  (Univ)                      D        3   
4        ***КОРОБКА (СТЕКЛО)                       D        4   

   item_category_id  
0                40  
1                76  
2                40  
3                40  
4                40  


In [11]:
print(item_categories.head(5))

        item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0
1         Аксессуары - PS2                 1
2         Аксессуары - PS3                 2
3         Аксессуары - PS4                 3
4         Аксессуары - PSP                 4


In [12]:
list(sales_train)
sales_train_t = sales_train.drop(columns={"date","date_block_num","item_price"})
list(sales_train_t)


['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day']

['shop_id', 'item_id', 'item_cnt_day']

In [19]:
### Regressor models
from sklearn.utils.testing import all_estimators
from sklearn import base

estimators = all_estimators()
dict_classifiers = {}

shortlist = ['ARDRegression','BaggingRegressor','DecisionTreeRegressor','ExtraTreesRegressor', \
             'GradientBoostingRegressor','KNeighborsRegressor','LassoLars','LinearRegression', \
             'Linear SVR', 'PoissonRegressor','RandomForestRegressor']

for name, class_ in estimators:
    ### use one: ClassifierMixin, ClusterMixin, RegressorMixin, TransformerMixin
    if issubclass(class_, base.RegressorMixin):
        if name in shortlist:
            ### excluding any models which require additional parameters
            try:
                dict_classifiers[name] = class_()
            except:
                pass
        
len(dict_classifiers.items())
pprint.pprint(dict_classifiers)

10

{'ARDRegression': ARDRegression(),
 'BaggingRegressor': BaggingRegressor(),
 'DecisionTreeRegressor': DecisionTreeRegressor(),
 'ExtraTreesRegressor': ExtraTreesRegressor(),
 'GradientBoostingRegressor': GradientBoostingRegressor(),
 'KNeighborsRegressor': KNeighborsRegressor(),
 'LassoLars': LassoLars(),
 'LinearRegression': LinearRegression(),
 'PoissonRegressor': PoissonRegressor(),
 'RandomForestRegressor': RandomForestRegressor()}


In [14]:
### Split train-test data
x = sales_train_t.copy().drop(columns={'item_cnt_day'})
y = sales_train_t['item_cnt_day']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
xy_train = list(zip(x_train, y_train))
xy_test = list(zip(x_test, y_test))

In [15]:
results = ([['base',0]])

for model, model_inst in dict_classifiers.items():
    try: 
        model_inst.fit(x_train, y_train)
        pred = np.array(model_inst.predict(x_test))
        score = metrics.mean_squared_error(y_test, pred)
        print(model, "mse: ",score)
        #print(metrics.classification_report(y_test, pred, digits=3))
        results.append([model,score])
    except:
        pass

ARDRegression()

ARDRegression mse:  4.500625663109595


BaggingRegressor()

BaggingRegressor mse:  3.6782187682532053


DecisionTreeRegressor()

DecisionTreeRegressor mse:  3.6785668594004277


ExtraTreesRegressor()

ExtraTreesRegressor mse:  3.6763706032589307


GradientBoostingRegressor()

GradientBoostingRegressor mse:  4.106173601860556


KNeighborsRegressor()

KNeighborsRegressor mse:  4.619413994804047


LassoLars()

LassoLars mse:  4.500625663109595


LinearRegression()

LinearRegression mse:  4.498623428286816


RandomForestRegressor()

RandomForestRegressor mse:  3.6282742692168775


In [16]:
print(pd.DataFrame(results, columns={'model','score'}).sort_values(by='score'))

                       model     score
0                       base  0.000000
9      RandomForestRegressor  3.628274
4        ExtraTreesRegressor  3.676371
2           BaggingRegressor  3.678219
3      DecisionTreeRegressor  3.678567
5  GradientBoostingRegressor  4.106174
8           LinearRegression  4.498623
1              ARDRegression  4.500626
7                  LassoLars  4.500626
6        KNeighborsRegressor  4.619414


In [None]:
test_t = test.drop(columns={"ID"})

final_clf = sklearn.ensemble._forest.RandomForestRegressor()
final_clf.fit(x, y)
pred = np.array(final_clf.predict(test_t))
finalresults = pd.DataFrame()
finalresults['Survived'] = list(pred)
finalresults = finalresults.join(test[['ID']])
print(finalresults)

In [None]:
finalresults.to_csv(folder + 'results_RandomForest_NoProcessing.csv',index=False)

In [None]:
%%script false --no-raise-error
from pycaret.regression import *
exp_reg101 = setup(data = sales_train_t, target = 'item_cnt_day', session_id=1)