In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../Common Datasets/train_df.csv')

In [3]:
df

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2019-01-01,2BE4ABE1D99,162.0,1.0,Premium,Delivery from Collection Point,10.0,2.2,23.98
1,2019-01-01,2BF0F5D1852,162.0,1.0,Premium,Delivery from Collection Point,10.0,2.2,23.98
2,2019-01-01,2CA3A78FEA9,162.0,1.0,Premium,Delivery from Collection Point,10.0,2.2,23.98
3,2019-01-01,2C2A89A5F22,162.0,1.0,Premium,Delivery from Collection Point,10.0,2.2,23.98
4,2019-01-01,2C586661A56,162.0,1.0,Premium,Delivery from Collection Point,10.0,2.2,23.98
...,...,...,...,...,...,...,...,...,...
1867929,2018-10-28,2BD992B5538,12.0,1.0,Premium,Home Delivery - CoD,8.0,1.5,12.98
1867930,2018-10-28,2C97CD72233,17.0,1.0,Premium,Home Delivery - CoD,10.0,1.8,12.98
1867931,2018-10-28,2C91C61D372,40.0,1.0,Premium,Home Delivery - CoD,12.0,1.8,19.98
1867932,2018-10-28,2CD70CFC4E3,51.0,1.0,Premium,Home Delivery - CoD,18.0,2.9,23.98


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1867934 entries, 0 to 1867933
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PURCHASE_DATE    object 
 1   MAGIC_KEY        object 
 2   BOX_ID           float64
 3   BOX_COUNT        float64
 4   QUALITY          object 
 5   DELIVERY_OPTION  object 
 6   MILK             float64
 7   MEAT             float64
 8   UNIT_PRICE       float64
dtypes: float64(5), object(4)
memory usage: 128.3+ MB


In [5]:
df['PURCHASE_DATE'] = pd.to_datetime(df['PURCHASE_DATE'])

# <font color="red">We need to create a new datasets with customer behaviour</font>

In [6]:
# Group by 'MAGIC_KEY' and count the number of purchases for each customer
df_customer = df.groupby('MAGIC_KEY')['PURCHASE_DATE'].count().reset_index()
df_customer.columns = ['MAGIC_KEY', 'PURCHASE_COUNT']

# Display the total number of purchases made by each customer
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT
0,249670911D8,1
1,249751FC4DD,1
2,24978027606,1
3,24979164422,1
4,2497B8B4FDA,2
...,...,...
1048941,2E6F53ECC13,1
1048942,2E6F6DAFFF3,4
1048943,2E6F72C6F1C,1
1048944,2E6F9C7B9B4,1


In [7]:
# Calculate the total amount spent for each purchase
df['TOTAL_AMOUNT'] = df['BOX_COUNT'] * df['UNIT_PRICE']

# Group by 'MAGIC_KEY' and sum the total amount spent by each customer
total_amount_spent = df.groupby('MAGIC_KEY')['TOTAL_AMOUNT'].sum().reset_index()
total_amount_spent.columns = ['MAGIC_KEY', 'TOTAL_AMOUNT_SPENT']

# Display the total amount spent by each customer
total_amount_spent

Unnamed: 0,MAGIC_KEY,TOTAL_AMOUNT_SPENT
0,249670911D8,13.96
1,249751FC4DD,17.98
2,24978027606,15.96
3,24979164422,13.96
4,2497B8B4FDA,33.94
...,...,...
1048941,2E6F53ECC13,19.98
1048942,2E6F6DAFFF3,71.92
1048943,2E6F72C6F1C,19.98
1048944,2E6F9C7B9B4,19.98


In [8]:
df_customer = pd.merge(df_customer, total_amount_spent, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT
0,249670911D8,1,13.96
1,249751FC4DD,1,17.98
2,24978027606,1,15.96
3,24979164422,1,13.96
4,2497B8B4FDA,2,33.94
...,...,...,...
1048941,2E6F53ECC13,1,19.98
1048942,2E6F6DAFFF3,4,71.92
1048943,2E6F72C6F1C,1,19.98
1048944,2E6F9C7B9B4,1,19.98


In [9]:
last_purchase_date = df.groupby('MAGIC_KEY')['PURCHASE_DATE'].max().reset_index()
last_purchase_date.columns = ['MAGIC_KEY', 'LAST_PURCHASE_DATE']

# Display the DataFrame with the last purchase date for each customer
last_purchase_date

Unnamed: 0,MAGIC_KEY,LAST_PURCHASE_DATE
0,249670911D8,2018-11-01
1,249751FC4DD,2018-10-12
2,24978027606,2018-10-19
3,24979164422,2018-11-30
4,2497B8B4FDA,2018-11-01
...,...,...
1048941,2E6F53ECC13,2018-11-25
1048942,2E6F6DAFFF3,2019-01-19
1048943,2E6F72C6F1C,2018-10-21
1048944,2E6F9C7B9B4,2019-01-28


In [10]:
last_purchase_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048946 entries, 0 to 1048945
Data columns (total 2 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   MAGIC_KEY           1048946 non-null  object        
 1   LAST_PURCHASE_DATE  1048946 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 16.0+ MB


In [11]:
last_purchase_date['DAY_SINCE_LAST_PURCHASE'] = pd.to_datetime('2019-02-01') - last_purchase_date['LAST_PURCHASE_DATE']
last_purchase_date

Unnamed: 0,MAGIC_KEY,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE
0,249670911D8,2018-11-01,92 days
1,249751FC4DD,2018-10-12,112 days
2,24978027606,2018-10-19,105 days
3,24979164422,2018-11-30,63 days
4,2497B8B4FDA,2018-11-01,92 days
...,...,...,...
1048941,2E6F53ECC13,2018-11-25,68 days
1048942,2E6F6DAFFF3,2019-01-19,13 days
1048943,2E6F72C6F1C,2018-10-21,103 days
1048944,2E6F9C7B9B4,2019-01-28,4 days


In [12]:
df_customer = pd.merge(df_customer, last_purchase_date, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE
0,249670911D8,1,13.96,2018-11-01,92 days
1,249751FC4DD,1,17.98,2018-10-12,112 days
2,24978027606,1,15.96,2018-10-19,105 days
3,24979164422,1,13.96,2018-11-30,63 days
4,2497B8B4FDA,2,33.94,2018-11-01,92 days
...,...,...,...,...,...
1048941,2E6F53ECC13,1,19.98,2018-11-25,68 days
1048942,2E6F6DAFFF3,4,71.92,2019-01-19,13 days
1048943,2E6F72C6F1C,1,19.98,2018-10-21,103 days
1048944,2E6F9C7B9B4,1,19.98,2019-01-28,4 days


In [13]:
df_sorted = df.sort_values(['MAGIC_KEY', 'PURCHASE_DATE'])

# Calculate the time difference between consecutive purchases for each customer
df_sorted['TIME_DIFF'] = df_sorted.groupby('MAGIC_KEY')['PURCHASE_DATE'].diff()

# Group by 'MAGIC_KEY' and calculate the average time difference between consecutive purchases
average_diff = df_sorted.groupby('MAGIC_KEY')['TIME_DIFF'].mean().reset_index()
average_diff.columns = ['MAGIC_KEY', 'AVERAGE_DIFF']

# Fill NaN values (customers with only one purchase) with 0
average_diff['AVERAGE_DIFF'] = average_diff['AVERAGE_DIFF'].fillna(0)
average_diff['AVERAGE_DIFF'] = pd.to_timedelta(average_diff['AVERAGE_DIFF'])
average_diff

Unnamed: 0,MAGIC_KEY,AVERAGE_DIFF
0,249670911D8,0 days 00:00:00
1,249751FC4DD,0 days 00:00:00
2,24978027606,0 days 00:00:00
3,24979164422,0 days 00:00:00
4,2497B8B4FDA,29 days 00:00:00
...,...,...
1048941,2E6F53ECC13,0 days 00:00:00
1048942,2E6F6DAFFF3,30 days 16:00:00
1048943,2E6F72C6F1C,0 days 00:00:00
1048944,2E6F9C7B9B4,0 days 00:00:00


In [14]:
df_customer = pd.merge(df_customer, average_diff, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF
0,249670911D8,1,13.96,2018-11-01,92 days,0 days 00:00:00
1,249751FC4DD,1,17.98,2018-10-12,112 days,0 days 00:00:00
2,24978027606,1,15.96,2018-10-19,105 days,0 days 00:00:00
3,24979164422,1,13.96,2018-11-30,63 days,0 days 00:00:00
4,2497B8B4FDA,2,33.94,2018-11-01,92 days,29 days 00:00:00
...,...,...,...,...,...,...
1048941,2E6F53ECC13,1,19.98,2018-11-25,68 days,0 days 00:00:00
1048942,2E6F6DAFFF3,4,71.92,2019-01-19,13 days,30 days 16:00:00
1048943,2E6F72C6F1C,1,19.98,2018-10-21,103 days,0 days 00:00:00
1048944,2E6F9C7B9B4,1,19.98,2019-01-28,4 days,0 days 00:00:00


In [15]:
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048946 entries, 0 to 1048945
Data columns (total 6 columns):
 #   Column                   Non-Null Count    Dtype          
---  ------                   --------------    -----          
 0   MAGIC_KEY                1048946 non-null  object         
 1   PURCHASE_COUNT           1048946 non-null  int64          
 2   TOTAL_AMOUNT_SPENT       1048946 non-null  float64        
 3   LAST_PURCHASE_DATE       1048946 non-null  datetime64[ns] 
 4   DAY_SINCE_LAST_PURCHASE  1048946 non-null  timedelta64[ns]
 5   AVERAGE_DIFF             1048946 non-null  timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1), timedelta64[ns](2)
memory usage: 48.0+ MB


In [16]:
df_customer.describe()

Unnamed: 0,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF
count,1048946.0,1048946.0,1048946,1048946,1048946
mean,1.780772,31.72906,2018-12-16 09:45:36.216354816,46 days 14:14:23.783645677,15 days 13:47:54.728888603
min,1.0,9.96,2018-10-01 00:00:00,1 days 00:00:00,0 days 00:00:00
25%,1.0,15.96,2018-11-18 00:00:00,17 days 00:00:00,0 days 00:00:00
50%,1.0,23.98,2018-12-28 00:00:00,35 days 00:00:00,0 days 00:00:00
75%,2.0,43.88,2019-01-15 00:00:00,75 days 00:00:00,30 days 00:00:00
max,40.0,487.2,2019-01-31 00:00:00,123 days 00:00:00,122 days 00:00:00
std,1.192321,22.74335,,35 days 18:30:03.134143502,21 days 20:52:43.690023691


In [17]:
threshold = pd.Timedelta(days=15)
filtered_df = df_customer.loc[(df_customer['PURCHASE_COUNT'] > 1) & (df_customer['AVERAGE_DIFF'] < threshold)]

In [18]:
filtered_df

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF
126,24B29B5F0CD,4,62.06,2018-12-03,60 days,13 days 16:00:00
179,24BF50B496C,5,76.22,2018-11-30,63 days,14 days 18:00:00
201,24C6F8AA96D,2,24.36,2019-01-26,6 days,6 days 00:00:00
264,24D28944EEE,12,121.68,2018-12-30,33 days,3 days 21:49:05.454545454
587,27B30365AE9,2,33.94,2018-10-22,102 days,10 days 00:00:00
...,...,...,...,...,...,...
1048510,2E2336C7F7F,14,225.12,2019-01-29,3 days,8 days 22:09:13.846153846
1048647,2E4196577FD,8,81.12,2018-11-12,81 days,5 days 17:08:34.285714285
1048691,2E4BC0BCAF8,10,127.80,2019-01-21,11 days,12 days 00:00:00
1048870,2E6AC2366D0,3,44.34,2018-10-29,95 days,7 days 00:00:00


In [19]:
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048946 entries, 0 to 1048945
Data columns (total 6 columns):
 #   Column                   Non-Null Count    Dtype          
---  ------                   --------------    -----          
 0   MAGIC_KEY                1048946 non-null  object         
 1   PURCHASE_COUNT           1048946 non-null  int64          
 2   TOTAL_AMOUNT_SPENT       1048946 non-null  float64        
 3   LAST_PURCHASE_DATE       1048946 non-null  datetime64[ns] 
 4   DAY_SINCE_LAST_PURCHASE  1048946 non-null  timedelta64[ns]
 5   AVERAGE_DIFF             1048946 non-null  timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1), timedelta64[ns](2)
memory usage: 48.0+ MB


In [20]:
# Filter the DataFrame to include only dates in the first 15 days of October 2018
october_first_15_df = df[(df['PURCHASE_DATE'].dt.year == 2018) & 
                          (df['PURCHASE_DATE'].dt.month == 10) &
                          (df['PURCHASE_DATE'].dt.day <= 15)]

november_first_15_df = df[(df['PURCHASE_DATE'].dt.year == 2018) & 
                          (df['PURCHASE_DATE'].dt.month == 11) &
                          (df['PURCHASE_DATE'].dt.day <= 15)]

december_first_15_df = df[(df['PURCHASE_DATE'].dt.year == 2018) & 
                          (df['PURCHASE_DATE'].dt.month == 12) &
                          (df['PURCHASE_DATE'].dt.day <= 15)]

january_first_15_df = df[(df['PURCHASE_DATE'].dt.year == 2019) & 
                          (df['PURCHASE_DATE'].dt.month == 1) &
                          (df['PURCHASE_DATE'].dt.day <= 15)]

In [21]:
# Define a function to check if there is a purchase in the specified date range
def check_purchase(df, month, year):
    column_name = f"{month}_{year}_FIRST_15"
    return df.groupby('MAGIC_KEY')['PURCHASE_DATE'].count().astype(bool).astype(int).rename(column_name)

In [22]:
# Apply the function to each sub dataset
october_2018_first_15 = check_purchase(october_first_15_df, '10', '2018')
november_2018_first_15 = check_purchase(november_first_15_df, '11', '2018')
december_2018_first_15 = check_purchase(december_first_15_df, '12', '2018')
january_2019_first_15 = check_purchase(january_first_15_df, '1', '2019')

In [23]:
df_customer = pd.merge(df_customer, october_2018_first_15, on='MAGIC_KEY', how='left')
df_customer = pd.merge(df_customer, november_2018_first_15, on='MAGIC_KEY', how='left')
df_customer = pd.merge(df_customer, december_2018_first_15, on='MAGIC_KEY', how='left')
df_customer = pd.merge(df_customer, january_2019_first_15, on='MAGIC_KEY', how='left')

df_customer['10_2018_FIRST_15'] = df_customer['10_2018_FIRST_15'].fillna(0)
df_customer['11_2018_FIRST_15'] = df_customer['11_2018_FIRST_15'].fillna(0)
df_customer['12_2018_FIRST_15'] = df_customer['12_2018_FIRST_15'].fillna(0)
df_customer['1_2019_FIRST_15'] = df_customer['1_2019_FIRST_15'].fillna(0)
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF,10_2018_FIRST_15,11_2018_FIRST_15,12_2018_FIRST_15,1_2019_FIRST_15
0,249670911D8,1,13.96,2018-11-01,92 days,0 days 00:00:00,0.0,1.0,0.0,0.0
1,249751FC4DD,1,17.98,2018-10-12,112 days,0 days 00:00:00,1.0,0.0,0.0,0.0
2,24978027606,1,15.96,2018-10-19,105 days,0 days 00:00:00,0.0,0.0,0.0,0.0
3,24979164422,1,13.96,2018-11-30,63 days,0 days 00:00:00,0.0,0.0,0.0,0.0
4,2497B8B4FDA,2,33.94,2018-11-01,92 days,29 days 00:00:00,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1048941,2E6F53ECC13,1,19.98,2018-11-25,68 days,0 days 00:00:00,0.0,0.0,0.0,0.0
1048942,2E6F6DAFFF3,4,71.92,2019-01-19,13 days,30 days 16:00:00,0.0,0.0,0.0,0.0
1048943,2E6F72C6F1C,1,19.98,2018-10-21,103 days,0 days 00:00:00,0.0,0.0,0.0,0.0
1048944,2E6F9C7B9B4,1,19.98,2019-01-28,4 days,0 days 00:00:00,0.0,0.0,0.0,0.0


In [24]:
df_customer.to_csv("df_customer_train.csv", index=False)  # Save test data