In [46]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

from data_preprocessing.DataLoader import DataLoader

In [47]:
# Load the data set from the relative directory "data/data_new" and bind the
# returned loader object to `d` (its table attributes are read in a later cell).
d = DataLoader.load_data_from_path("data/data_new")

In [48]:
def unite_sets(deliveries, products, sessions, users, out_path='../out.csv'):
    """Join deliveries, sessions, users and products into one flat table.

    Steps:
      1. Add a ``deltas`` column: ``delivery_timestamp - purchase_timestamp``.
      2. Split ``products.category_path`` on ';' into one column per level;
         the first four levels are named primary/secondary/tertiary/
         quaternary ``_category`` and ``category_path`` itself is dropped.
      3. Inner-join deliveries -> sessions (on ``purchase_id``) -> users
         (on ``user_id``) -> products (on ``product_id``).

    Parameters
    ----------
    deliveries, products, sessions, users : pandas.DataFrame
        Input tables. None of them is modified by this function.
    out_path : str or None, default '../out.csv'
        Where the joined table is written as CSV. Pass ``None`` to skip
        writing the file.

    Returns
    -------
    pandas.DataFrame
        The joined table, including the ``deltas`` and category columns.
    """
    # Copy first so the new column does not leak into the caller's frame.
    deliveries = deliveries.copy()
    deliveries["deltas"] = deliveries["delivery_timestamp"] - deliveries["purchase_timestamp"]

    # Split "a;b;c" style category paths into integer-named level columns 0..k.
    categories = products["category_path"].str.split(';', expand=True)
    products = pd.concat([products, categories], axis=1)
    products = products.drop(columns=['category_path'])
    products = products.rename(
        columns={
            0: "primary_category",
            1: "secondary_category",
            2: "tertiary_category",
            3: "quaternary_category",
        }
    )

    # pd.merge defaults to an inner join, so rows without a match are dropped.
    deliveries_sessions = pd.merge(deliveries, sessions, on="purchase_id")
    deliveries_sessions_users = pd.merge(deliveries_sessions, users, on="user_id")
    deliveries_sessions_users_products = pd.merge(deliveries_sessions_users, products, on="product_id")

    if out_path is not None:
        deliveries_sessions_users_products.to_csv(out_path)
    return deliveries_sessions_users_products

In [49]:
def labelTimeOfDay(row):
    """Return a part-of-day label for the hour of ``row['purchase_timestamp']``.

    Hours 6-11 map to "Morning", 12-17 to "Afternoon", 18-23 to "Evening";
    anything else falls through to "Night".
    """
    purchase_hour = row['purchase_timestamp'].hour
    if 6 <= purchase_hour < 12:
        return "Morning"
    if 12 <= purchase_hour < 18:
        return "Afternoon"
    if 18 <= purchase_hour < 24:
        return "Evening"
    return "Night"

In [50]:
# Load the data set and pull out the four tables that get joined below.
d = DataLoader.load_data_from_path("data/data_new")

products, deliveries, sessions, users = (
    d.products,
    d.deliveries,
    d.sessions,
    d.users,
)

# Flatten the four tables into a single frame.
united = unite_sets(
    deliveries=deliveries,
    products=products,
    sessions=sessions,
    users=users,
)

In [51]:
# These columns carry no useful information for the model, so they are dropped.
uninformative_columns = [
    'event_type',
    'name',
    'street',
    'product_name',
    'delivery_timestamp',
    'timestamp',
    'purchase_id',
    'product_id',
    'user_id',
    'session_id',
    'offered_discount',
    'price',
    'primary_category',
    'secondary_category',
    'tertiary_category',
    'quaternary_category',
]
# errors='ignore' keeps the tolerance of the previous mask-based selection:
# a listed column that is absent is simply skipped. drop() returns a new
# frame, so the column assignments below do not write into a slice.
united = united.drop(columns=uninformative_columns, errors='ignore')

# Derive calendar features from the purchase time, then drop the raw timestamp.
united['time_of_day'] = united.apply(labelTimeOfDay, axis=1)
united['weekday'] = united['purchase_timestamp'].dt.day_name()
united = united.drop(columns='purchase_timestamp')

In [52]:
# One-hot encode the categorical features in a single call. With `columns=`,
# get_dummies uses each column's own name as the prefix (e.g. "city_<value>"),
# removes the source column, and appends the indicator columns in list order.
categorical_columns = ['city', 'delivery_company', 'time_of_day', 'weekday']
united = pd.get_dummies(united, columns=categorical_columns)

# Express the delivery time as a whole number of days, in the smallest
# integer dtype that fits.
united['deltas'] = pd.to_numeric(united['deltas'].dt.days, downcast='integer')

In [53]:
target_column = ['deltas']
# Every non-target column is a predictor. dict.fromkeys de-duplicates while
# keeping the frame's column order, so the feature order is identical on every
# run (iterating a set of strings is not stable across interpreter sessions).
predictors = list(dict.fromkeys(
    column for column in united.columns if column not in target_column
))

In [56]:
# Feature matrix and target as NumPy arrays; the target is selected with a
# list of column names, so it is a single-column 2-D array.
X = united[predictors].values
y = united[target_column].values

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=40,
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [57]:
# Three hidden layers, each as wide as the number of predictors.
sizePred = len(predictors)
mlpClassifier = MLPClassifier(
    hidden_layer_sizes=(sizePred, sizePred, sizePred),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=40,  # fixed seed so weight initialisation and shuffling are reproducible
)
# y_train is a single-column 2-D array; ravel() flattens it to the 1-D label
# vector passed to fit().
mlpClassifier.fit(X_train, y_train.ravel())

predict_train = mlpClassifier.predict(X_train)
predict_test = mlpClassifier.predict(X_test)

In [58]:
# Evaluate predictions on the training split: confusion matrix first, then the
# per-class precision / recall / f1 report.
print(confusion_matrix(y_train, predict_train))
print(classification_report(y_train, predict_train))


[[ 505  269   49    0    0]
 [ 172 1876  344   34    0]
 [  30  252 1141   62    2]
 [   1   38  122  297    4]
 [   0    0   15   10   22]]
              precision    recall  f1-score   support

           0       0.71      0.61      0.66       823
           1       0.77      0.77      0.77      2426
           2       0.68      0.77      0.72      1487
           3       0.74      0.64      0.69       462
           4       0.79      0.47      0.59        47

    accuracy                           0.73      5245
   macro avg       0.74      0.65      0.69      5245
weighted avg       0.73      0.73      0.73      5245



In [59]:
# Evaluate predictions on the held-out test split: confusion matrix first,
# then the per-class precision / recall / f1 report.
test_confusion = confusion_matrix(y_test, predict_test)
test_report = classification_report(y_test, predict_test)
print(test_confusion)
print(test_report)

[[191 115  20   1   0]
 [105 790 146  13   0]
 [ 14 140 460  31   2]
 [  0  23  69 102   3]
 [  0   0   5   4  14]]
              precision    recall  f1-score   support

           0       0.62      0.58      0.60       327
           1       0.74      0.75      0.74      1054
           2       0.66      0.71      0.68       647
           3       0.68      0.52      0.59       197
           4       0.74      0.61      0.67        23

    accuracy                           0.69      2248
   macro avg       0.69      0.63      0.66      2248
weighted avg       0.69      0.69      0.69      2248

