In [2]:
import plotly.express as px
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import plot_importance
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

from scipy import stats


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

In [3]:
# Down casts the data entries from int64 to int32 and float64 to float32
# This reduces the size of the records by almost half. (From 134mb to 61mb)
def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df

In [4]:
# Import and clean data (importing csv into pandas)
# Read in .csv files into pandas data frames
month_group2 = pd.read_csv('month_lag_grouped.csv')
# test = pd.read_csv('test.csv').set_index('ID')
# submission = pd.read_csv('sample_submission.csv')
items = pd.read_csv('items.csv')
#item_cats = pd.read_csv('item_categories.csv')
# shops = pd.read_csv('shops.csv')
#items_t = pd.read_csv('items_translated_text.csv')
train_lag = pd.read_csv('new_month_group.csv')

In [6]:
# Calls the downcasting function
month_group2 = downcast_dtypes(month_group2)
# test = downcast_dtypes(test)
# submission = downcast_dtypes(submission)
items = downcast_dtypes(items)
train_lag = downcast_dtypes(train_lag)

#item_cats = downcast_dtypes(item_cats)

In [7]:
# Used to tranform prediction inputs
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0, 1, 2, 3])], remainder='passthrough')
x = train_lag.iloc[:, :-1].values
ct.fit(x)

ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder', OneHotEncoder(), [0, 1, 2, 3])])

In [12]:
def get_z_list(shop_id_num, item_id_num):
    # item_cat = items.loc[items['item_id'] == item_id_num, ['item_category_id']].values[0][0]
    item_cat = items[items['item_id'] == item_id_num]['item_category_id'].values
    prices = train_lag.loc[train_lag['item_id'] == item_id_num, ['item_price']].values
    price = (stats.mode(prices))[0][0][0]
    date_num = 34
    new_pd = month_group2.loc[month_group2['date_block_num'] == date_num-1].loc[month_group2['shop_id'] == shop_id_num].loc[month_group2['item_id'] == item_id_num]
    new_pd2 = month_group2.loc[month_group2['date_block_num'] == date_num-2].loc[month_group2['shop_id'] == shop_id_num].loc[month_group2['item_id'] == item_id_num]
    new_pd3 = month_group2.loc[month_group2['date_block_num'] == date_num-3].loc[month_group2['shop_id'] == shop_id_num].loc[month_group2['item_id'] == item_id_num]
    new_pd4 = month_group2.loc[month_group2['date_block_num'] == date_num-4].loc[month_group2['shop_id'] == shop_id_num].loc[month_group2['item_id'] == item_id_num]
    new_pd5 = month_group2.loc[month_group2['date_block_num'] == date_num-5].loc[month_group2['shop_id'] == shop_id_num].loc[month_group2['item_id'] == item_id_num]
    #print(len(new_pd['date_block_num']))
    if len(new_pd['shop_id']) > 0:
        mon1 = month_group2['item_cnt_day'][new_pd.index[0]]
    else:
        mon1 = 0

    if len(new_pd2['shop_id']) > 0:
        mon2 = month_group2['item_cnt_day'][new_pd2.index[0]]
    else:
        mon2 = 0

    if len(new_pd3['shop_id']) > 0:
        mon3 = month_group2['item_cnt_day'][new_pd3.index[0]]
    else:
        mon3 = 0

    if len(new_pd4['shop_id']) > 0:
        mon4 = month_group2['item_cnt_day'][new_pd4.index[0]]
    else:
        mon4 = 0

    if len(new_pd5['shop_id']) > 0:
        mon5 = month_group2['item_cnt_day'][new_pd5.index[0]]
    else:
        mon5 = 0

    z = ['november', 'shop ' + str(shop_id_num), 'item_category ' + str(item_cat), 'item ' + str(item_id_num), price, mon1, mon2, mon3, mon4, mon5]
    return z

In [20]:
get_z_list(7, 13071)

['november',
 'shop 7',
 'item_category [3]',
 'item 13071',
 599.0,
 0,
 1.0,
 1.0,
 0,
 1.0]

In [19]:
train_lag.loc[train_lag['shop_id'] == 7].loc[train_lag['item_id'] == 13071]

Unnamed: 0,date_block_num,month,shop_id,item_category_id,item_id,item_price,mon_lag_1,mon_lag_2,mon_lag_3,mon_lag_4,mon_lag_5,item_cnt_day
8399,0,january,7,3,13071,499.0,,,,,,12.0
76097,1,february,7,3,13071,499.0,12.0,,,,,8.0
137452,2,march,7,3,13071,499.0,8.0,12.0,,,,13.0
205541,3,april,7,3,13071,499.0,13.0,8.0,12.0,,,6.0
263972,4,may,7,3,13071,499.0,6.0,13.0,8.0,12.0,,2.0
320752,5,june,7,3,13071,499.0,2.0,6.0,13.0,8.0,12.0,4.0
381394,6,july,7,3,13071,499.0,4.0,2.0,6.0,13.0,8.0,10.0
442390,7,august,7,3,13071,499.0,10.0,4.0,2.0,6.0,13.0,11.0
504951,8,september,7,3,13071,499.0,11.0,10.0,4.0,2.0,6.0,10.0
560489,9,october,7,3,13071,449.0,10.0,11.0,10.0,4.0,2.0,9.0
