In [261]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import datetime as dt
import random 
import matplotlib.pyplot as plt
import numpy as np

from data_preprocessing.DataLoader import DataLoader

In [262]:
d = DataLoader.load_data_from_path("data/data_new")

In [263]:
def unite_sets(deliveries, products, sessions, users):
    deliveries["deltas"] = deliveries["delivery_timestamp"] - deliveries["purchase_timestamp"]

    # divide category
    categories = products.category_path.str.split(';', expand=True)
    products = pd.concat([products, categories], axis=1)
    products = products.drop(columns=['category_path'])
    products = products.rename(
        columns={0: "primary_category", 1: "secondary_category", 2: "tertiary_category", 3: "quaternary_category"})


    deliveries_sessions = pd.merge(deliveries, sessions, left_on="purchase_id", right_on="purchase_id")
    deliveries_sessions_users = pd.merge(deliveries_sessions, users, left_on="user_id", right_on="user_id")
    deliveries_sessions_users_products = pd.merge(deliveries_sessions_users, products, left_on="product_id", right_on="product_id")

    deliveries_sessions_users_products.to_csv('../out.csv')
    return deliveries_sessions_users_products

In [264]:
#Function for labeling rows
def labelTimeOfDay(row):
    hour = row['purchase_timestamp'].hour
    if(hour >= 6 and hour < 12):
        return "Morning"
    elif(hour >= 12 and hour < 18):
        return "Afternoon"
    elif(hour >=18 and hour < 24):
        return "Evening"
    else:
        return "Night"

In [265]:
d = DataLoader.load_data_from_path("data/data_new")

products = d.products
deliveries = d.deliveries
sessions = d.sessions
users = d.users

united = unite_sets(deliveries, products, sessions, users)

In [266]:
# nie wnosi zadnej informacji wiec wyrzucamy daną kolumnę
united = united.loc[:, united.columns != 'event_type']
united = united.loc[:, united.columns != 'name']
united = united.loc[:, united.columns != 'street']
united = united.loc[:, united.columns != 'product_name']
united = united.loc[:, united.columns != 'delivery_timestamp']
united = united.loc[:, united.columns != 'timestamp']
united = united.loc[:, united.columns != 'purchase_id']
united = united.loc[:, united.columns != 'product_id']
united = united.loc[:, united.columns != 'user_id']
united = united.loc[:, united.columns != 'session_id']
united = united.loc[:, united.columns != 'offered_discount']
united = united.loc[:, united.columns != 'price']
united = united.loc[:, united.columns != 'primary_category']
united = united.loc[:, united.columns != 'secondary_category']
united = united.loc[:, united.columns != 'tertiary_category']
united = united.loc[:, united.columns != 'quaternary_category']

united.loc[:,'time_of_day'] = united.apply(lambda row: labelTimeOfDay(row), axis=1)
united['weekday'] = united['purchase_timestamp'].dt.day_name()
united = united.loc[:, united.columns != 'purchase_timestamp']

In [267]:
y = pd.get_dummies(united.city, prefix='city')
united = united.join(other=y)
united = united.loc[:, united.columns != 'city']

y = pd.get_dummies(united.delivery_company, prefix='delivery_company')
united = united.join(other=y)
united = united.loc[:, united.columns != 'delivery_company']

y = pd.get_dummies(united.time_of_day, prefix='time_of_day')
united = united.join(other=y)
united = united.loc[:, united.columns != 'time_of_day']

y = pd.get_dummies(united.weekday, prefix='weekday')
united = united.join(other=y)
united = united.loc[:, united.columns != 'weekday']

united['deltas'] = pd.to_numeric(united['deltas'].dt.days, downcast='integer')

In [269]:
#balancing data 

#new group 
#united.loc[united['deltas'].isin([3,4]),'deltas'] = 3


united_0 = united.loc[united['deltas'] == 0]
united_1 = united.loc[united['deltas'] == 1]
united_2 = united.loc[united['deltas'] == 2]
united_3 = united.loc[united['deltas'] == 3]
united_4 = united.loc[united['deltas'] == 4]
united_3_4 = united.loc[united['deltas'].isin([3,4])]

sizes = [len(united_0), len(united_1), len(united_2),len(united_3_4)]
print(len(united_0))
print(len(united_1))
print(len(united_2))
print(len(united_3))
print(len(united_4))

column_names = united.columns
united_divided = [united_0, united_1, united_2, united_3, united_4]
new_united = []

for df in united_divided:
    new_size = random.randint(round(max(sizes) - 0.33 * max(sizes)), max(sizes))
    if(len(df) < new_size):
        new_united += df.values.tolist()
        new_united += df.sample(new_size - len(df), replace=True).values.tolist()
    else:
        new_united += df.sample(new_size).values.tolist()

new_united = pd.DataFrame.from_records(new_united, columns = column_names)

united_0 = new_united.loc[new_united['deltas'] == 0]
united_1 = new_united.loc[new_united['deltas'] == 1]
united_2 = new_united.loc[new_united['deltas'] == 2]
united_3 = new_united.loc[new_united['deltas'] == 3]
united_4 = new_united.loc[new_united['deltas'] == 4]

print("NEW:")
print(len(united_0))
print(len(united_1))
print(len(united_2))
print(len(united_3))
print(len(united_4))

united = new_united
united.describe().transpose()

1150
3480
2134
659
70
NEW:
3450
3129
3376
2694
2701


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
deltas,15350.0,1.874072,1.402314,0.0,1.0,2.0,3.0,4.0
city_Gdynia,15350.0,0.065016,0.246563,0.0,0.0,0.0,0.0,1.0
city_Konin,15350.0,0.117459,0.321977,0.0,0.0,0.0,0.0,1.0
city_Kutno,15350.0,0.056352,0.230607,0.0,0.0,0.0,0.0,1.0
city_Mielec,15350.0,0.312117,0.463372,0.0,0.0,0.0,1.0,1.0
city_Police,15350.0,0.108925,0.311555,0.0,0.0,0.0,0.0,1.0
city_Radom,15350.0,0.092834,0.290209,0.0,0.0,0.0,0.0,1.0
city_Szczecin,15350.0,0.150163,0.357243,0.0,0.0,0.0,0.0,1.0
city_Warszawa,15350.0,0.097134,0.296149,0.0,0.0,0.0,0.0,1.0
delivery_company_360,15350.0,0.340977,0.474053,0.0,0.0,0.0,1.0,1.0


In [270]:
target_column = ['deltas'] 
predictors = list(set(list(united.columns))-set(target_column))
# united[predictors] = united[predictors]/united[predictors].max()
# united.describe().transpose()

In [271]:
X = united[predictors].values
y = united[target_column].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

#Scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [272]:
sizePred = len(predictors)
mlpClassifier = MLPClassifier(hidden_layer_sizes=(sizePred,sizePred,sizePred), activation='relu', solver='adam', max_iter=500)
mlpClassifier.fit(X_train,y_train.ravel())

predict_train = mlpClassifier.predict(X_train)
predict_test = mlpClassifier.predict(X_test)

In [273]:
from sklearn.metrics import classification_report,confusion_matrix

#Evaluate predictions for train data
print(confusion_matrix(y_train,predict_train))
print(classification_report(y_train,predict_train))


[[2214  182   43    6    0]
 [ 485 1320  306   93    2]
 [ 140  296 1592  261   64]
 [  11   47  173 1416  201]
 [   0    0    0   22 1871]]
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      2445
           1       0.72      0.60      0.65      2206
           2       0.75      0.68      0.71      2353
           3       0.79      0.77      0.78      1848
           4       0.88      0.99      0.93      1893

    accuracy                           0.78     10745
   macro avg       0.78      0.79      0.78     10745
weighted avg       0.78      0.78      0.78     10745



In [274]:
#Evaluate predictions for test data
print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[882  86  33   4   0]
 [214 494 176  39   0]
 [ 79 140 628 138  38]
 [  5  15 100 637  89]
 [  0   0   0  14 794]]
              precision    recall  f1-score   support

           0       0.75      0.88      0.81      1005
           1       0.67      0.54      0.60       923
           2       0.67      0.61      0.64      1023
           3       0.77      0.75      0.76       846
           4       0.86      0.98      0.92       808

    accuracy                           0.75      4605
   macro avg       0.74      0.75      0.74      4605
weighted avg       0.74      0.75      0.74      4605

