In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib
import tensorflow as tf
import tensorflow.keras as keras
import pickle
import os
import time
import sys
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential, optimizers, layers, metrics, models, regularizers

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" may also hide pandas warnings (for example
# SettingWithCopyWarning) raised by later cells -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Load the order data; later cells read its `recipient`, `delivery_date` and
# `difference` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
df = pd.read_pickle("Data/df_final1.pkl")

In [3]:
# Create a list of recipients (unique values, in order of first appearance).
# Series.unique() is hash-based, so this avoids the quadratic cost of testing
# `value not in list` once per row.
recipients = df["recipient"].unique().tolist()

print(len(recipients))

4629


In [4]:
# Create a dictionary with the number of orders (rows) as the value for each
# customer as the key.
# value_counts() counts every recipient in one pass and does not depend on the
# frame having a default 0..n-1 index, unlike looking rows up by `range(len(df))`.
counter = df["recipient"].value_counts(sort=False).to_dict()

In [5]:
# Drop all customers from the dataframe if they have fewer than 20 orders.
# One boolean mask removes every rare customer at once instead of scanning and
# mutating the whole frame once per customer.
MIN_ORDERS = 20
rare_recipients = {
    recipient for recipient, n_orders in counter.items() if n_orders < MIN_ORDERS
}
df = df[~df["recipient"].isin(rare_recipients)]

In [6]:
# Create a new list of customers based on the new dataframe (unique values, in
# order of first appearance). Series.unique() avoids the quadratic
# `value not in list` scan.
recipients_new = df["recipient"].unique().tolist()

print(len(recipients_new))

3698


In [7]:
# Drop all but one row for orders with multiple products, so no customer has
# more than one row per date.
# Each customer's rows are de-duplicated on `delivery_date` and re-indexed from
# 0, so the index label is the position of the order within that customer.
# groupby(sort=False) yields customers in order of first appearance, and the
# pieces are concatenated once instead of growing a frame inside the loop
# (which copies all accumulated rows on every iteration).
per_customer_frames = [
    customer_orders.drop_duplicates(subset=["delivery_date"]).reset_index(drop=True)
    for _, customer_orders in df.groupby("recipient", sort=False)
]

# pd.concat raises on an empty list, so fall back to an empty frame.
if per_customer_frames:
    df_new = pd.concat(per_customer_frames)
else:
    df_new = pd.DataFrame(columns=df.columns)

In [8]:
# Add the previous 5 differences to each row in the dataframe.
# For every customer, column t_k holds the `difference` value of the order k
# positions earlier. Rows that do not have 5 earlier orders for the same
# customer keep NaN in all five columns, so the next cell can drop them via t_5.
#
# This works on a copy of df_new and uses positional group operations, so it
# neither writes into a slice of another frame nor depends on the index labels
# being 0..k-1 within each customer.
N_LAGS = 5

df_final = df_new.copy()
difference_by_customer = df_final.groupby("recipient", sort=False)["difference"]

# Position of each row within its customer's orders (0-based).
position_in_customer = df_final.groupby("recipient", sort=False).cumcount()
has_full_history = (position_in_customer >= N_LAGS).to_numpy()

for lag in range(N_LAGS, 0, -1):
    lagged = difference_by_customer.shift(lag)
    # Plain arrays are assigned on purpose: the index may contain repeated
    # labels, so label-based alignment must be avoided here.
    df_final[f"t_{lag}"] = lagged.where(has_full_history).to_numpy()

In [9]:
# Keep only rows that have a value for t_5 (rows without it have no history
# available for that order), then renumber the index from 0.
df_final = df_final.dropna(subset=["t_5"]).reset_index(drop=True)

In [10]:
# Persist the history-augmented frame under Data/, the same path the notebook
# reads it back from, so a fresh Restart & Run All reloads what was just written.
df_final.to_pickle("Data/df_with_history.pkl")

In [11]:
# Reload the history-augmented orders from disk; this rebinds `df`, replacing
# the frame used by the earlier cells.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle("Data/df_with_history.pkl")

In [12]:
# Number of rows in the reloaded frame.
print(df.shape[0])

249100


In [13]:
# Create a new list of customers from the reloaded dataframe (unique values, in
# order of first appearance). Series.unique() avoids the quadratic
# `value not in list` scan.
recipients_new = df["recipient"].unique().tolist()

print(len(recipients_new))

3698


In [14]:
# Create a new dataframe with only customers that have at least 10 rows left
# (i.e. at least 10 orders with a full 5-step history).
# The row count per customer is computed from `df` itself, so the cell does not
# rely on the `recipients` list built from the original, unfiltered data, and a
# single mask replaces a per-customer scan plus a concat in every iteration.
# Rows keep their order and index labels from `df`.
MIN_ORDERS_WITH_HISTORY = 10

rows_per_customer = df["recipient"].map(df["recipient"].value_counts())
df_final1 = df[rows_per_customer >= MIN_ORDERS_WITH_HISTORY]

In [15]:
# Number of rows that survived the minimum-order filter.
print(df_final1.shape[0])

248449


In [16]:
# Persist the filtered frame to the current working directory.
# NOTE(review): the file name contains '>', which is not a valid file-name
# character on Windows, and unlike the inputs it is not written under Data/.
# Confirm which notebooks read this exact path before renaming it.
df_final1.to_pickle("df_with_history_>10_orders.pkl")