In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from os import path
import datetime

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

In [2]:
# Read every raw input table; file names are relative to the working directory.
sales_train, submission, shops, test, items, item_cat = [
    pd.read_csv(csv_name)
    for csv_name in (
        "sales_train.csv",
        "sample_submission.csv",
        "shops.csv",
        "test.csv",
        "items.csv",
        "item_categories.csv",
    )
]

# Cache file names for derived tables, and the placeholder that the
# preprocessing cell later rebinds to the aggregated training frame.
GROUPED_TRANSACTION_DATA_FILE = "grouped_transaction_data.csv"
TRAINING_DATA_FILEPATH = "training_data.csv"
training_data = None

In [20]:
# data preprocessing
def YearFromDate(dateStr):
    """Return the third dot-separated field of ``dateStr`` as a string.

    Example: ``"02.01.2013"`` gives ``"2013"``. Raises ``IndexError`` when the
    string has fewer than three dot-separated fields.
    """
    fields = dateStr.split('.')
    return fields[2]
    
# Add calendar features to the raw transactions; the result is cached in data.csv.
if path.exists("data.csv"):
    df = pd.read_csv("data.csv")
else:
    # Work on a copy so the raw `sales_train` frame is not modified as a side
    # effect (the cached branch above never touched it either).
    df = sales_train.copy()
    # `month` is the calendar month (Jan, Feb, ...), derived from the running
    # month counter. Column arithmetic replaces the row-wise apply().
    df['month'] = df['date_block_num'] % 12
    # Cast to int so a freshly built frame has the same dtype as one reloaded
    # from the CSV cache, where read_csv parses the year as a number.
    df['year'] = df['date'].map(YearFromDate).astype(int)
    # Dropped columns:
    #   item_price     - original note: just a function of shop_id, month and item_id
    #   date_block_num - replaced by year and month
    #   date           - only year and month are needed, not the day
    df = df.drop(columns=['item_price', 'date_block_num', 'date'])
    # index=False keeps a spurious "Unnamed: 0" column out of the cache.
    df.to_csv("data.csv", index=False)

group_filter = ['shop_id', 'item_id', 'year', 'month', 'item_cnt_day']

# Sum the transactions per shop, item, year and month, and cache the result.
if path.exists(TRAINING_DATA_FILEPATH):
    training_data = pd.read_csv(TRAINING_DATA_FILEPATH)[group_filter]
else:
    # Aggregate only `item_cnt_day`, so stray columns (such as an index column
    # in a cache written by an earlier run) are never summed along with it.
    summed_data = (
        df.groupby(['shop_id', 'item_id', 'year', 'month'])['item_cnt_day']
        .sum()
        .reset_index()
    )
    summed_data.to_csv(TRAINING_DATA_FILEPATH, index=False)
    # Same column set and order as the cached branch.
    training_data = summed_data[group_filter]

In [None]:
# get the items that weren't sold in training month, but were sold in test month
def addMissingTransactions():
    """Print the closest month for (shop, item) pairs in ``X_2`` that ``X_1`` lacks.

    Reads the notebook-level frames ``X_1`` and ``X_2`` and the value
    ``next_month``. It also needs ``tqdm`` and ``FindClosestMonth`` to be
    defined by the time it is called. Nothing is returned; matches are only
    printed.

    NOTE(review): the ``> 1`` style test below requires at least two rows in
    ``X_2`` for a pair to count -- confirm that ``>= 1`` was not intended.
    """
    for shop_id in tqdm(X_2['shop_id'].unique()):
        shop_rows_next = X_2[X_2['shop_id'] == shop_id]
        for item_id in shop_rows_next['item_id'].unique():
            rows_next = shop_rows_next[shop_rows_next['item_id'] == item_id]
            rows_now = X_1[(X_1['shop_id'] == shop_id) & (X_1['item_id'] == item_id)]
            # Only continue for pairs seen in X_2 (more than one row) and never in X_1.
            if len(rows_next) <= 1 or len(rows_now) != 0:
                continue
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            closest_month = FindClosestMonth(next_month, shop_id, item_id)
            print(closest_month)

In [57]:
from tqdm import tqdm

# Train on the selected month of every earlier year and hold out the same
# month of the most recent year.
month = 9
next_month = (month + 1) % 12
features = ['shop_id', 'item_id', 'year', 'month']
years = training_data['year'].unique()
latest_year = max(years)

# X_1 and X_2 both hold every row of the selected month; the month mask is
# computed once instead of being re-applied in each statement below.
in_month = training_data['month'] == month
X_1 = training_data.loc[in_month, group_filter]
X_2 = training_data.loc[in_month, group_filter]

# .copy() makes the splits independent frames, so the column assignments that
# later cells perform (e.g. scaling `year`) do not trigger
# SettingWithCopyWarning.
is_train_year = X_1['year'] < latest_year
X_train = X_1.loc[is_train_year, features].copy()
y_train = X_1.loc[is_train_year, 'item_cnt_day'].copy()

is_test_year = X_2['year'] == latest_year
X_test = X_2.loc[is_test_year, features].copy()
y_test = X_2.loc[is_test_year, 'item_cnt_day'].copy()


In [69]:
# Row counts of the training and test feature frames, one per line.
print(len(X_train), len(X_test), sep="\n")


93685
31531
<class 'pandas.core.series.Series'>


In [75]:
# Standardize the regression targets. After this cell `y_train` is an (n, 1)
# NumPy array, and `label_scaler` holds the statistics needed to map
# predictions back with inverse_transform.
# NOTE: re-running this cell refits both scalers on values that are already
# standardized; re-run the train/test split cell first.
label_scaler = StandardScaler()
y_train = label_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1))

# Standardize `year`. assign() builds a new frame instead of writing into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
year_scaler = StandardScaler()
scaled_year = year_scaler.fit_transform(X_train['year'].to_numpy().reshape(-1, 1))
X_train = X_train.assign(year=scaled_year.ravel())
X_train.head()

Unnamed: 0,shop_id,item_id,year,month
8118,2,32,1.095188,9
8149,2,464,-0.913085,9
8150,2,464,1.095188,9
8168,2,482,-0.913085,9
8175,2,482,1.095188,9


In [60]:
from sklearn import preprocessing

def _fit_label_lookup(values):
    """Fit a LabelEncoder on ``values``; return it with a value -> code dict."""
    encoder = preprocessing.LabelEncoder().fit(values)
    return encoder, dict(zip(values, encoder.transform(values)))


# Distinct categories: shops and items seen in the training frame, and the
# twelve month numbers 0..11.
X_shops = X_train['shop_id'].unique()
X_items = X_train['item_id'].unique()
X_months = np.arange(12)

# One encoder and one lookup table (raw value -> integer code) per category.
shop_labelEncoder, shop_labels = _fit_label_lookup(X_shops)
item_labelEncoder, item_labels = _fit_label_lookup(X_items)
month_labelEncoder, month_labels = _fit_label_lookup(X_months)

In [61]:

def vectorize(X, shop_lookup=None, item_lookup=None, month_lookup=None):
    """Turn a feature frame into a dense float32 design matrix.

    Column layout of the result, left to right:
      * one-hot block for ``shop_id``  (``len(shop_lookup)`` columns)
      * one-hot block for ``item_id``  (``len(item_lookup)`` columns)
      * the ``year`` value, followed by two unused (always zero) columns
      * one-hot block for ``month``    (``len(month_lookup)`` columns)

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``shop_id``, ``item_id``, ``year``, ``month``.
    shop_lookup, item_lookup, month_lookup : dict, optional
        Map a raw value to its integer code. When omitted, the notebook-level
        ``shop_labels``, ``item_labels`` and ``month_labels`` are used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), n_shops + n_items + 3 + n_months)``.

    Notes
    -----
    A value missing from its lookup table (for example an item that only
    occurs in the hold-out data) leaves its one-hot block all zero instead of
    raising ``KeyError``.
    """
    if shop_lookup is None:
        shop_lookup = shop_labels
    if item_lookup is None:
        item_lookup = item_labels
    if month_lookup is None:
        month_lookup = month_labels

    item_offset = len(shop_lookup)
    year_column = item_offset + len(item_lookup)
    # Three columns are reserved after the item block; only the first is used.
    month_offset = year_column + 3
    width = month_offset + len(month_lookup)

    matrix = np.zeros((len(X), width), dtype=np.float32)
    matrix[:, year_column] = X['year'].to_numpy(dtype=np.float32)

    row_index = np.arange(len(X))
    # TODO (from the original author): the month block can be dropped when
    # training on a single month.
    one_hot_blocks = (
        ('shop_id', shop_lookup, 0),
        ('item_id', item_lookup, item_offset),
        ('month', month_lookup, month_offset),
    )
    for column, lookup, offset in one_hot_blocks:
        # map() yields NaN for values absent from the lookup table.
        codes = X[column].map(lookup)
        known = codes.notna().to_numpy()
        columns = offset + codes.to_numpy()[known].astype(np.intp)
        matrix[row_index[known], columns] = 1
    return matrix
        
# Build the dense training matrix, then print its width and the first label.
matrix = vectorize(X_train)
print(matrix.shape[1])
# `y_train` is a pandas Series straight after the split but a 2-D NumPy array
# once the scaling cell has run; np.asarray(...).ravel() handles both, whereas
# `.iloc` exists only on the Series.
print(np.asarray(y_train).ravel()[0])


11276
2.0


In [63]:
# Summary of the training inputs. The last line reports the number of
# *training* labels (it was previously mislabelled as "Testing").
print("Features:", matrix.shape[1])
print("Training records:", len(matrix))
print("Training record labels:", len(y_train))

Features: 11276
Training records: 93685
Testing record labels: 93685


In [64]:
# Single hidden layer regression network on the one-hot design matrix.
model = Sequential([
    Dense(32, input_dim=matrix.shape[1], activation='relu'),
    # Linear output: the targets are standardized, so they are unbounded and
    # can be negative. A sigmoid (range 0..1) or relu (range >= 0) output
    # could not represent them.
    Dense(1, activation='linear'),
])
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
optimizer = RMSprop(learning_rate=0.005)
model.compile(loss="mean_squared_error", optimizer=optimizer)
model.fit(matrix, np.array(y_train), batch_size=128, epochs=13)

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<tensorflow.python.keras.callbacks.History at 0x2382fde1a00>

In [None]:
# The network was fitted on vectorize() output with a standardized `year`, so
# the hold-out frame has to go through the same two steps; feeding the raw
# four-column frame to predict() does not match the model's input width.
X_test_scaled = X_test.assign(
    year=year_scaler.transform(X_test['year'].to_numpy().reshape(-1, 1)).ravel()
)
# NOTE(review): vectorize() looks ids up in tables built from the training
# frame -- confirm it tolerates shop/item ids that occur only in X_test.
# Predictions are on the standardized target scale; use
# label_scaler.inverse_transform to convert them back to item counts.
predict_test = model.predict(vectorize(X_test_scaled))