In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D

from itertools import cycle
from itertools import product

from scipy import stats

from googletrans import Translator

%matplotlib inline

In [122]:
class PredictionModel:
    """Linear-regression model of monthly item sales on one-hot encoded ids.

    Constructing an instance reads the CSV files, downcasts their numeric
    columns and aggregates the daily sales into monthly rows.  Typical use::

        model = PredictionModel()
        model.run_model()                  # labels the ids, encodes and fits
        model.predict_month(55, 92, 33)

    Attributes set along the way: ``train``, ``items``, ``item_cats``,
    ``items_t``, ``train_lag`` (data frames); ``x``, ``y``, ``ct``,
    ``X_train``, ``X_test``, ``Y_train``, ``Y_test``, ``regressor`` (after
    ``run_model``); ``item_cat``, ``prices``, ``price`` (after ``get_z_list``).
    """

    # Rows at or above these thresholds are dropped as outliers in clean_df.
    PRICE_OUTLIER = 90000
    DAILY_COUNT_OUTLIER = 999
    # Range the aggregated monthly target is clipped to.
    MONTHLY_COUNT_CLIP = (0, 20)

    # Categorical feature columns, in model order, with the text prefix that
    # turns each integer id into a string label.
    _LABEL_PREFIXES = {
        'date_block_num': 'month ',
        'shop_id': 'shop ',
        'item_category_id': 'item_category ',
        'item_id': 'item ',
    }

    def __init__(self):
        self.import_data()
        self.down_cast()
        self.clean_df()

    def downcast_dtypes(self, df):
        """Shrink numeric columns of ``df`` in place and return ``df``.

        float64 columns become float32.  int64/int32 columns become int16 when
        every value fits; otherwise the narrowest of int32 or the original
        dtype is kept, so large ids are never wrapped by overflow.
        """
        float_cols = [c for c in df if df[c].dtype == "float64"]
        int_cols = [c for c in df if df[c].dtype in ("int64", "int32")]

        for col in float_cols:
            df[col] = df[col].astype(np.float32)

        for col in int_cols:
            series = df[col]
            # An empty column has no range to check; treat it as fitting.
            low, high = (series.min(), series.max()) if len(series) else (0, 0)
            for target in (np.int16, np.int32):
                limits = np.iinfo(target)
                if limits.min <= low and high <= limits.max:
                    df[col] = series.astype(target)
                    break
        return df

    def import_data(self):
        """Read the five CSV inputs from the current working directory."""
        self.train = pd.read_csv('sales_train.csv')
        self.items = pd.read_csv('items.csv')
        self.item_cats = pd.read_csv('item_categories.csv')
        self.items_t = pd.read_csv('items_translated_text.csv')
        self.train_lag = pd.read_csv('month_lag_grouped.csv')

    def down_cast(self):
        """Downcast the numeric columns of the loaded frames (not ``items_t``)."""
        self.train = self.downcast_dtypes(self.train)
        self.items = self.downcast_dtypes(self.items)
        self.train_lag = self.downcast_dtypes(self.train_lag)
        self.item_cats = self.downcast_dtypes(self.item_cats)

    def clean_df(self):
        """Turn the daily ``train`` rows into clipped monthly totals.

        Steps: attach ``item_category_id`` from ``items``, parse ``date``,
        drop price/count outliers, replace negative prices, then sum
        ``item_cnt_day`` per (month, shop, category, item, price) into
        ``item_cnt_month``, clipped to ``MONTHLY_COUNT_CLIP`` and stored as
        float16.  The result replaces ``self.train``.
        """
        train = self.train.merge(self.items, on='item_id').drop(columns='item_name')
        train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

        # Copy after filtering so the price fix below writes to an independent
        # frame rather than a slice of the merged one.
        keep = (train.item_price < self.PRICE_OUTLIER) & (train.item_cnt_day < self.DAILY_COUNT_OUTLIER)
        train = train[keep].copy()

        negative_price = train.item_price < 0
        if negative_price.any():
            # Negative prices are replaced by the median positive price of
            # item 2973 in shop 32 during month block 4.
            reference = train[(train.shop_id == 32) & (train.item_id == 2973)
                              & (train.date_block_num == 4) & (train.item_price > 0)]
            train.loc[negative_price, 'item_price'] = reference.item_price.median()

        monthly = (train.groupby(list(self._LABEL_PREFIXES) + ['item_price'])
                   ['item_cnt_day'].sum()
                   .reset_index()
                   .rename(columns={'item_cnt_day': 'item_cnt_month'}))
        monthly['item_cnt_month'] = (monthly['item_cnt_month']
                                     .fillna(0)
                                     .clip(*self.MONTHLY_COUNT_CLIP)  # NB clip target here
                                     .astype(np.float16))
        self.train = monthly

    def one_hot_encode(self):
        """Replace the integer id columns of ``train`` with string labels.

        Nominal integer ids must not be fed to the model as numbers, so each
        becomes e.g. ``'shop 55'`` ahead of one-hot encoding.  Columns that are
        no longer numeric are skipped, which makes repeated calls harmless.
        """
        for column, prefix in self._LABEL_PREFIXES.items():
            if not pd.api.types.is_numeric_dtype(self.train[column]):
                continue  # already labelled by an earlier call
            self.train[column] = [prefix + str(i) for i in self.train[column]]

    def run_model(self):
        """One-hot encode the categorical features and fit a linear regression.

        Labels the id columns first if that has not been done yet.  Features
        are selected by name (the four id columns followed by ``item_price``),
        the target is ``item_cnt_month``, and 20% of the rows are held out in
        ``X_test``/``Y_test``.
        """
        self.one_hot_encode()

        categorical = list(self._LABEL_PREFIXES)
        self.x = self.train[categorical + ['item_price']].values
        self.y = self.train['item_cnt_month'].values
        # handle_unknown='ignore' encodes a month/shop/category/item that was
        # absent from training as all zeros instead of raising at predict time.
        self.ct = ColumnTransformer(
            [('encoder', OneHotEncoder(handle_unknown='ignore'), list(range(len(categorical))))],
            remainder='passthrough')
        self.x = self.ct.fit_transform(self.x)
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.x, self.y, test_size=0.2, random_state=0)
        self.regressor = LinearRegression()
        self.regressor.fit(self.X_train, self.Y_train)

    def get_z_list(self, shop_id_num, item_id_num, month):
        """Build the feature row for one (shop, item, month) query.

        Returns ``[month label, shop label, category label, item label,
        price]`` where the category comes from ``items`` and the price is the
        most frequent ``item_price`` of that item in the labelled ``train``
        frame (the smallest one on ties).

        Raises:
            IndexError: if the item is missing from ``items`` or has no price
                rows in ``train`` (e.g. ``one_hot_encode`` was not run).
        """
        category = self.items.loc[self.items['item_id'] == item_id_num, 'item_category_id']
        if category.empty:
            raise IndexError(f"item_id {item_id_num} not found in items")
        self.item_cat = category.iloc[0]

        labels = self._LABEL_PREFIXES
        item_label = labels['item_id'] + str(item_id_num)
        self.prices = self.train.loc[self.train['item_id'] == item_label, ['item_price']].values
        if self.prices.size == 0:
            raise IndexError(
                f"no price history for {item_label!r} in train; "
                "call one_hot_encode() or run_model() first")
        # Series.mode() gives the same answer on every library version, unlike
        # indexing into the version-dependent shape of scipy.stats.mode.
        self.price = pd.Series(self.prices.ravel()).mode().iloc[0]

        return [labels['date_block_num'] + str(month),
                labels['shop_id'] + str(shop_id_num),
                labels['item_category_id'] + str(self.item_cat),
                item_label,
                self.price]

    def predict_month(self, shop_id_num, item_id_num, month):
        """Return the predicted ``item_cnt_month`` rounded to 3 decimals.

        Labels not seen during fitting contribute nothing to the prediction
        (they encode as zeros).

        Raises:
            AttributeError: if ``run_model`` has not been called.
            IndexError: propagated from ``get_z_list`` for unknown items.
        """
        if not hasattr(self, 'ct') or not hasattr(self, 'regressor'):
            raise AttributeError("model is not fitted; call run_model() first")

        z = self.get_z_list(shop_id_num, item_id_num, month)
        z = np.array(z, dtype=object).reshape(1, -1)
        z = self.ct.transform(z)
        z_pred = self.regressor.predict(z)

        return round(z_pred[0], 3)

In [123]:
# Constructing the model runs import_data(), down_cast() and clean_df(),
# so the CSV files must be present in the working directory.
sample_model = PredictionModel()

In [124]:
# Inspect the aggregated monthly frame produced by clean_df() (ids are still integers here).
sample_model.train

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_month
0,0,0,2,5572,1322.0,10.0
1,0,0,2,5573,560.0,1.0
2,0,0,2,5575,806.0,4.0
3,0,0,2,5576,2231.0,5.0
4,0,0,2,5609,2381.0,1.0
...,...,...,...,...,...,...
1739014,33,59,79,17717,1250.0,4.0
1739015,33,59,79,17717,1999.0,1.0
1739016,33,59,83,22087,119.0,6.0
1739017,33,59,83,22088,119.0,2.0


In [125]:
# Prefix the four id columns with text labels ('month ', 'shop ', ...) so that
# they are treated as categories rather than numbers. Mutates sample_model.train.
sample_model.one_hot_encode()

In [126]:
# Same frame after labelling: the id columns now hold strings such as 'shop 0'.
sample_model.train

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_month
0,month 0,shop 0,item_category 2,item 5572,1322.0,10.0
1,month 0,shop 0,item_category 2,item 5573,560.0,1.0
2,month 0,shop 0,item_category 2,item 5575,806.0,4.0
3,month 0,shop 0,item_category 2,item 5576,2231.0,5.0
4,month 0,shop 0,item_category 2,item 5609,2381.0,1.0
...,...,...,...,...,...,...
1739014,month 33,shop 59,item_category 79,item 17717,1250.0,4.0
1739015,month 33,shop 59,item_category 79,item 17717,1999.0,1.0
1739016,month 33,shop 59,item_category 83,item 22087,119.0,6.0
1739017,month 33,shop 59,item_category 83,item 22088,119.0,2.0


In [127]:
# One-hot encode the first four columns and fit a LinearRegression on an
# 80/20 train/test split (random_state=0).
sample_model.run_model()

In [130]:
# Predicted item_cnt_month for shop 55, item 92, month block 33, rounded to 3 decimals.
sample_model.predict_month(55, 92, 33)

2.351

In [129]:
# Show the feature row that predict_month builds for the same query.
# NOTE(review): the recorded output below shows 'month 34' for month=33, which
# the get_z_list defined above cannot produce ('month ' + str(month)), and this
# cell's execution count (129) precedes the prediction cell's (130). The output
# appears to come from an older class definition; re-run with
# Restart Kernel -> Run All to refresh it.
sample_model.get_z_list(55, 92, 33)

['month 34', 'shop 55', 'item_category 37', 'item 92', 249.0]