In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the merged purchase records from the project's Datasets folder.
source_path = 'Datasets/merged_df.csv'
df = pd.read_csv(source_path)

In [3]:
# Display the loaded DataFrame (rendered as the cell output).
df

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE,PURCHASE
0,2019-02-01,2CED678A247,12.0,1.0,Premium,Home Delivery - CoD,8.0,1.5,12.98,Y
1,2019-02-01,2BF58D91BA1,12.0,1.0,Premium,Home Delivery - CoD,8.0,1.5,12.98,Y
2,2019-02-01,2C15B86534E,99.0,1.0,Premium,Delivery from Collection Point,0.0,3.3,13.96,Y
3,2019-02-01,2C32D9A859A,6.0,1.0,Premium,Home Delivery - CoD,0.0,2.7,11.96,Y
4,2019-02-01,2C7A55404D1,4.0,1.0,Premium,Home Delivery - CoD,0.0,2.5,11.96,Y
...,...,...,...,...,...,...,...,...,...,...
2455718,2018-10-28,2BD992B5538,12.0,1.0,Premium,Home Delivery - CoD,8.0,1.5,12.98,N
2455719,2018-10-28,2C97CD72233,17.0,1.0,Premium,Home Delivery - CoD,10.0,1.8,12.98,N
2455720,2018-10-28,2C91C61D372,40.0,1.0,Premium,Home Delivery - CoD,12.0,1.8,19.98,N
2455721,2018-10-28,2CD70CFC4E3,51.0,1.0,Premium,Home Delivery - CoD,18.0,2.9,23.98,N


In [4]:
# Print the column names, dtypes and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2455723 entries, 0 to 2455722
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PURCHASE_DATE    object 
 1   MAGIC_KEY        object 
 2   BOX_ID           float64
 3   BOX_COUNT        float64
 4   QUALITY          object 
 5   DELIVERY_OPTION  object 
 6   MILK             float64
 7   MEAT             float64
 8   UNIT_PRICE       float64
 9   PURCHASE         object 
dtypes: float64(5), object(5)
memory usage: 187.4+ MB


In [5]:
# PURCHASE_DATE is read from the CSV as strings; convert it to datetime64 so the
# later max(), diff() and .dt accessors work on real timestamps.
parsed_purchase_dates = pd.to_datetime(df['PURCHASE_DATE'])
df['PURCHASE_DATE'] = parsed_purchase_dates

# <font color="red">We need to create a new dataset with customer behaviour</font>

In [6]:
# One row per customer (MAGIC_KEY) with the number of non-null PURCHASE entries.
# NOTE(review): count() tallies every non-null row, so rows with PURCHASE == 'N'
# are included as well — confirm that this is the intended definition.
df_customer = (
    df.groupby('MAGIC_KEY')['PURCHASE']
    .count()
    .reset_index(name='PURCHASE_COUNT')
)

# Show the per-customer counts
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT
0,249670911D8,2
1,249751FC4DD,1
2,24978027606,1
3,24979164422,1
4,2497B8B4FDA,2
...,...,...
1274082,2E6F72C6F1C,3
1274083,2E6F8194908,1
1274084,2E6F9C7B9B4,1
1274085,2E6FB0EBB32,6


In [7]:
# Row-level spend: number of boxes times the unit price.
# This adds a TOTAL_AMOUNT column to df itself.
df['TOTAL_AMOUNT'] = df['BOX_COUNT'].mul(df['UNIT_PRICE'])

# Sum the row-level spend per customer.
total_amount_spent = (
    df.groupby('MAGIC_KEY')['TOTAL_AMOUNT']
    .sum()
    .reset_index(name='TOTAL_AMOUNT_SPENT')
)

# Show the per-customer totals
total_amount_spent

Unnamed: 0,MAGIC_KEY,TOTAL_AMOUNT_SPENT
0,249670911D8,24.10
1,249751FC4DD,17.98
2,24978027606,15.96
3,24979164422,13.96
4,2497B8B4FDA,33.94
...,...,...
1274082,2E6F72C6F1C,51.94
1274083,2E6F8194908,11.96
1274084,2E6F9C7B9B4,19.98
1274085,2E6FB0EBB32,107.88


In [8]:
# Attach TOTAL_AMOUNT_SPENT to the customer table, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(total_amount_spent, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT
0,249670911D8,2,24.10
1,249751FC4DD,1,17.98
2,24978027606,1,15.96
3,24979164422,1,13.96
4,2497B8B4FDA,2,33.94
...,...,...,...
1274082,2E6F72C6F1C,3,51.94
1274083,2E6F8194908,1,11.96
1274084,2E6F9C7B9B4,1,19.98
1274085,2E6FB0EBB32,6,107.88


In [9]:
# Most recent PURCHASE_DATE seen for each customer.
last_purchase_date = (
    df.groupby('MAGIC_KEY')['PURCHASE_DATE']
    .max()
    .reset_index(name='LAST_PURCHASE_DATE')
)

# Show the last purchase date per customer
last_purchase_date

Unnamed: 0,MAGIC_KEY,LAST_PURCHASE_DATE
0,249670911D8,2019-02-02
1,249751FC4DD,2018-10-12
2,24978027606,2018-10-19
3,24979164422,2018-11-30
4,2497B8B4FDA,2018-11-01
...,...,...
1274082,2E6F72C6F1C,2019-02-27
1274083,2E6F8194908,2019-02-23
1274084,2E6F9C7B9B4,2019-01-28
1274085,2E6FB0EBB32,2019-02-11


In [10]:
# Check the dtypes and non-null counts of the per-customer last-purchase table.
last_purchase_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274087 entries, 0 to 1274086
Data columns (total 2 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   MAGIC_KEY           1274087 non-null  object        
 1   LAST_PURCHASE_DATE  1274087 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 19.4+ MB


In [11]:
# Recency: time elapsed between each customer's last purchase and the fixed
# reference date 2019-03-01. The result is a timedelta column.
reference_date = pd.to_datetime('2019-03-01')
last_purchase_date['DAY_SINCE_LAST_PURCHASE'] = (
    reference_date - last_purchase_date['LAST_PURCHASE_DATE']
)
last_purchase_date

Unnamed: 0,MAGIC_KEY,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE
0,249670911D8,2019-02-02,27 days
1,249751FC4DD,2018-10-12,140 days
2,24978027606,2018-10-19,133 days
3,24979164422,2018-11-30,91 days
4,2497B8B4FDA,2018-11-01,120 days
...,...,...,...
1274082,2E6F72C6F1C,2019-02-27,2 days
1274083,2E6F8194908,2019-02-23,6 days
1274084,2E6F9C7B9B4,2019-01-28,32 days
1274085,2E6FB0EBB32,2019-02-11,18 days


In [12]:
# Attach LAST_PURCHASE_DATE and DAY_SINCE_LAST_PURCHASE, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(last_purchase_date, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE
0,249670911D8,2,24.10,2019-02-02,27 days
1,249751FC4DD,1,17.98,2018-10-12,140 days
2,24978027606,1,15.96,2018-10-19,133 days
3,24979164422,1,13.96,2018-11-30,91 days
4,2497B8B4FDA,2,33.94,2018-11-01,120 days
...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days
1274083,2E6F8194908,1,11.96,2019-02-23,6 days
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days


In [13]:
# Order each customer's rows chronologically so diff() compares consecutive purchases.
df_sorted = df.sort_values(['MAGIC_KEY', 'PURCHASE_DATE'])

# Gap between each purchase and the same customer's previous one
# (NaT on the first row of every customer).
df_sorted['TIME_DIFF'] = df_sorted.groupby('MAGIC_KEY')['PURCHASE_DATE'].diff()

# Mean gap per customer. Customers with a single row have no gap at all, so their
# mean is NaT; those are filled with a zero-length Timedelta. Filling with a real
# Timedelta (rather than the integer 0) keeps the column as timedelta64, so no
# follow-up pd.to_timedelta() conversion is needed.
# NOTE(review): a zero gap is therefore ambiguous between "only one purchase" and
# "repeat purchases on the same day"; use PURCHASE_COUNT to tell them apart.
average_diff = (
    df_sorted.groupby('MAGIC_KEY')['TIME_DIFF']
    .mean()
    .fillna(pd.Timedelta(0))
    .reset_index(name='AVERAGE_DIFF')
)
average_diff

Unnamed: 0,MAGIC_KEY,AVERAGE_DIFF
0,249670911D8,93 days 00:00:00
1,249751FC4DD,0 days 00:00:00
2,24978027606,0 days 00:00:00
3,24979164422,0 days 00:00:00
4,2497B8B4FDA,29 days 00:00:00
...,...,...
1274082,2E6F72C6F1C,64 days 12:00:00
1274083,2E6F8194908,0 days 00:00:00
1274084,2E6F9C7B9B4,0 days 00:00:00
1274085,2E6FB0EBB32,26 days 14:24:00


In [14]:
# Attach AVERAGE_DIFF to the customer table, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(average_diff, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF
0,249670911D8,2,24.10,2019-02-02,27 days,93 days 00:00:00
1,249751FC4DD,1,17.98,2018-10-12,140 days,0 days 00:00:00
2,24978027606,1,15.96,2018-10-19,133 days,0 days 00:00:00
3,24979164422,1,13.96,2018-11-30,91 days,0 days 00:00:00
4,2497B8B4FDA,2,33.94,2018-11-01,120 days,29 days 00:00:00
...,...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days,64 days 12:00:00
1274083,2E6F8194908,1,11.96,2019-02-23,6 days,0 days 00:00:00
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days,0 days 00:00:00
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days,26 days 14:24:00


In [15]:
# Inspect the customer table's columns, dtypes and non-null counts after the merges so far.
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274087 entries, 0 to 1274086
Data columns (total 6 columns):
 #   Column                   Non-Null Count    Dtype          
---  ------                   --------------    -----          
 0   MAGIC_KEY                1274087 non-null  object         
 1   PURCHASE_COUNT           1274087 non-null  int64          
 2   TOTAL_AMOUNT_SPENT       1274087 non-null  float64        
 3   LAST_PURCHASE_DATE       1274087 non-null  datetime64[ns] 
 4   DAY_SINCE_LAST_PURCHASE  1274087 non-null  timedelta64[ns]
 5   AVERAGE_DIFF             1274087 non-null  timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1), timedelta64[ns](2)
memory usage: 58.3+ MB


In [16]:
# Summary statistics for the customer table's count, amount, date and timedelta columns.
df_customer.describe()

Unnamed: 0,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF
count,1274087.0,1274087.0,1274087,1274087,1274087
mean,1.927437,33.99961,2019-01-07 19:54:36.858488576,52 days 04:05:23.141512313,17 days 20:08:27.532807059
min,1.0,5.56,2018-10-01 00:00:00,1 days 00:00:00,0 days 00:00:00
25%,1.0,15.96,2018-12-06 00:00:00,15 days 00:00:00,0 days 00:00:00
50%,1.0,23.98,2019-01-23 00:00:00,37 days 00:00:00,0 days 00:00:00
75%,2.0,47.88,2019-02-14 00:00:00,85 days 00:00:00,31 days 00:00:00
max,40.0,574.56,2019-02-28 00:00:00,151 days 00:00:00,150 days 00:00:00
std,1.411799,26.95334,,44 days 07:48:00.875529724,25 days 02:40:52.977985959


In [17]:
# Select customers who bought more than once and whose average gap between
# purchases is shorter than 15 days. The PURCHASE_COUNT condition matters because
# single-purchase customers carry an AVERAGE_DIFF of zero.
threshold = pd.Timedelta(days=15)
has_repeat_purchase = df_customer['PURCHASE_COUNT'] > 1
has_short_gap = df_customer['AVERAGE_DIFF'] < threshold
filtered_df = df_customer.loc[has_repeat_purchase & has_short_gap]

In [18]:
# Display the customers with more than one purchase and an average gap under the 15-day threshold.
filtered_df

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF
11,2499B9E0C1C,3,37.14,2019-02-18,11 days,8 days 00:00:00
206,24BFE993FEB,2,22.32,2019-02-19,10 days,5 days 00:00:00
233,24C6F8AA96D,2,24.36,2019-01-26,34 days,6 days 00:00:00
289,24D1CF7CA25,2,20.28,2019-02-21,8 days,10 days 00:00:00
306,24D28944EEE,12,121.68,2018-12-30,61 days,3 days 21:49:05.454545454
...,...,...,...,...,...,...
1273588,2E2336C7F7F,21,342.78,2019-02-28,1 days,7 days 07:12:00
1273738,2E4196577FD,8,81.12,2018-11-12,109 days,5 days 17:08:34.285714285
1273775,2E4833A51B7,2,31.94,2019-02-20,9 days,12 days 00:00:00
1273790,2E4BC0BCAF8,12,153.36,2019-02-18,11 days,12 days 08:43:38.181818181


In [19]:
# NOTE(review): everything in this cell is disabled (commented-out) exploratory code.
# If re-enabled it would flag customers having any PURCHASE == 'Y', merge that flag
# into df_customer, label-encode MAGIC_KEY and draw a Pearson correlation heatmap.
#new_purchase_df = df.groupby('MAGIC_KEY')['PURCHASE'].apply(lambda x: 1 if (x == 'Y').any() else 0).reset_index()
#new_purchase_df

#df_customer = pd.merge(df_customer, new_purchase_df, on='MAGIC_KEY')
#df_customer

#le = LabelEncoder() #creating an instance of LabelEncoder

#df_encoded = df_customer.copy()
#df_encoded['MAGIC_KEY'] = le.fit_transform(df_encoded['MAGIC_KEY'])

#sns.set(font_scale = 2)
#plt.subplots(figsize = (10, 5))
#heat_plot = sns.heatmap(df_encoded.corr(method = 'pearson'), annot = True, cmap = 'RdYlGn', annot_kws={'size': 10})

#plt.yticks(fontsize = 10)
#plt.xticks(fontsize = 10)

#plt.show()

In [20]:
# Re-check the customer table's columns and dtypes (no columns were added since the previous info() call).
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274087 entries, 0 to 1274086
Data columns (total 6 columns):
 #   Column                   Non-Null Count    Dtype          
---  ------                   --------------    -----          
 0   MAGIC_KEY                1274087 non-null  object         
 1   PURCHASE_COUNT           1274087 non-null  int64          
 2   TOTAL_AMOUNT_SPENT       1274087 non-null  float64        
 3   LAST_PURCHASE_DATE       1274087 non-null  datetime64[ns] 
 4   DAY_SINCE_LAST_PURCHASE  1274087 non-null  timedelta64[ns]
 5   AVERAGE_DIFF             1274087 non-null  timedelta64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(1), timedelta64[ns](2)
memory usage: 58.3+ MB


In [21]:
# Flag (1/0) customers with at least one purchase dated 1-15 October 2018.
# The row-level mask is computed once on the whole PURCHASE_DATE column and then
# reduced per customer with any(), instead of running a Python lambda for every
# MAGIC_KEY group; the resulting table is the same.
purchase_dates = df['PURCHASE_DATE']
in_window = (
    (purchase_dates.dt.year == 2018)
    & (purchase_dates.dt.month == 10)
    & (purchase_dates.dt.day <= 15)
)
october_2018_first_15 = (
    in_window.groupby(df['MAGIC_KEY'])
    .any()
    .astype('int64')
    .reset_index(name='10_2018_FIRST_15')
)

# Display the result DataFrame
october_2018_first_15

Unnamed: 0,MAGIC_KEY,10_2018_FIRST_15
0,249670911D8,0
1,249751FC4DD,1
2,24978027606,0
3,24979164422,0
4,2497B8B4FDA,1
...,...,...
1274082,2E6F72C6F1C,0
1274083,2E6F8194908,0
1274084,2E6F9C7B9B4,0
1274085,2E6FB0EBB32,1


In [22]:
# Attach the 10_2018_FIRST_15 flag to the customer table, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(october_2018_first_15, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF,10_2018_FIRST_15
0,249670911D8,2,24.10,2019-02-02,27 days,93 days 00:00:00,0
1,249751FC4DD,1,17.98,2018-10-12,140 days,0 days 00:00:00,1
2,24978027606,1,15.96,2018-10-19,133 days,0 days 00:00:00,0
3,24979164422,1,13.96,2018-11-30,91 days,0 days 00:00:00,0
4,2497B8B4FDA,2,33.94,2018-11-01,120 days,29 days 00:00:00,1
...,...,...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days,64 days 12:00:00,0
1274083,2E6F8194908,1,11.96,2019-02-23,6 days,0 days 00:00:00,0
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days,0 days 00:00:00,0
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days,26 days 14:24:00,1


In [23]:
# Flag (1/0) customers with at least one purchase dated 1-15 November 2018.
# The row-level mask is computed once on the whole PURCHASE_DATE column and then
# reduced per customer with any(), instead of running a Python lambda for every
# MAGIC_KEY group; the resulting table is the same.
purchase_dates = df['PURCHASE_DATE']
in_window = (
    (purchase_dates.dt.year == 2018)
    & (purchase_dates.dt.month == 11)
    & (purchase_dates.dt.day <= 15)
)
november_2018_first_15 = (
    in_window.groupby(df['MAGIC_KEY'])
    .any()
    .astype('int64')
    .reset_index(name='11_2018_FIRST_15')
)

# Display the result DataFrame
november_2018_first_15

Unnamed: 0,MAGIC_KEY,11_2018_FIRST_15
0,249670911D8,1
1,249751FC4DD,0
2,24978027606,0
3,24979164422,0
4,2497B8B4FDA,1
...,...,...
1274082,2E6F72C6F1C,0
1274083,2E6F8194908,0
1274084,2E6F9C7B9B4,0
1274085,2E6FB0EBB32,0


In [24]:
# Attach the 11_2018_FIRST_15 flag to the customer table, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(november_2018_first_15, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF,10_2018_FIRST_15,11_2018_FIRST_15
0,249670911D8,2,24.10,2019-02-02,27 days,93 days 00:00:00,0,1
1,249751FC4DD,1,17.98,2018-10-12,140 days,0 days 00:00:00,1,0
2,24978027606,1,15.96,2018-10-19,133 days,0 days 00:00:00,0,0
3,24979164422,1,13.96,2018-11-30,91 days,0 days 00:00:00,0,0
4,2497B8B4FDA,2,33.94,2018-11-01,120 days,29 days 00:00:00,1,1
...,...,...,...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days,64 days 12:00:00,0,0
1274083,2E6F8194908,1,11.96,2019-02-23,6 days,0 days 00:00:00,0,0
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days,0 days 00:00:00,0,0
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days,26 days 14:24:00,1,0


In [25]:
# Flag (1/0) customers with at least one purchase dated 1-15 December 2018.
# The row-level mask is computed once on the whole PURCHASE_DATE column and then
# reduced per customer with any(), instead of running a Python lambda for every
# MAGIC_KEY group; the resulting table is the same.
purchase_dates = df['PURCHASE_DATE']
in_window = (
    (purchase_dates.dt.year == 2018)
    & (purchase_dates.dt.month == 12)
    & (purchase_dates.dt.day <= 15)
)
december_2018_first_15 = (
    in_window.groupby(df['MAGIC_KEY'])
    .any()
    .astype('int64')
    .reset_index(name='12_2018_FIRST_15')
)

# Display the result DataFrame
december_2018_first_15

Unnamed: 0,MAGIC_KEY,12_2018_FIRST_15
0,249670911D8,0
1,249751FC4DD,0
2,24978027606,0
3,24979164422,0
4,2497B8B4FDA,0
...,...,...
1274082,2E6F72C6F1C,0
1274083,2E6F8194908,0
1274084,2E6F9C7B9B4,0
1274085,2E6FB0EBB32,0


In [26]:
# Attach the 12_2018_FIRST_15 flag to the customer table, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(december_2018_first_15, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF,10_2018_FIRST_15,11_2018_FIRST_15,12_2018_FIRST_15
0,249670911D8,2,24.10,2019-02-02,27 days,93 days 00:00:00,0,1,0
1,249751FC4DD,1,17.98,2018-10-12,140 days,0 days 00:00:00,1,0,0
2,24978027606,1,15.96,2018-10-19,133 days,0 days 00:00:00,0,0,0
3,24979164422,1,13.96,2018-11-30,91 days,0 days 00:00:00,0,0,0
4,2497B8B4FDA,2,33.94,2018-11-01,120 days,29 days 00:00:00,1,1,0
...,...,...,...,...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days,64 days 12:00:00,0,0,0
1274083,2E6F8194908,1,11.96,2019-02-23,6 days,0 days 00:00:00,0,0,0
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days,0 days 00:00:00,0,0,0
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days,26 days 14:24:00,1,0,0


In [27]:
# Flag (1/0) customers with at least one purchase dated 1-15 January 2019.
# The row-level mask is computed once on the whole PURCHASE_DATE column and then
# reduced per customer with any(), instead of running a Python lambda for every
# MAGIC_KEY group; the resulting table is the same.
purchase_dates = df['PURCHASE_DATE']
in_window = (
    (purchase_dates.dt.year == 2019)
    & (purchase_dates.dt.month == 1)
    & (purchase_dates.dt.day <= 15)
)
january_2019_first_15 = (
    in_window.groupby(df['MAGIC_KEY'])
    .any()
    .astype('int64')
    .reset_index(name='1_2019_FIRST_15')
)

# Display the result DataFrame
january_2019_first_15

Unnamed: 0,MAGIC_KEY,1_2019_FIRST_15
0,249670911D8,0
1,249751FC4DD,0
2,24978027606,0
3,24979164422,0
4,2497B8B4FDA,0
...,...,...
1274082,2E6F72C6F1C,0
1274083,2E6F8194908,0
1274084,2E6F9C7B9B4,0
1274085,2E6FB0EBB32,0


In [28]:
# Attach the 1_2019_FIRST_15 flag to the customer table, matching rows on MAGIC_KEY.
df_customer = df_customer.merge(january_2019_first_15, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF,10_2018_FIRST_15,11_2018_FIRST_15,12_2018_FIRST_15,1_2019_FIRST_15
0,249670911D8,2,24.10,2019-02-02,27 days,93 days 00:00:00,0,1,0,0
1,249751FC4DD,1,17.98,2018-10-12,140 days,0 days 00:00:00,1,0,0,0
2,24978027606,1,15.96,2018-10-19,133 days,0 days 00:00:00,0,0,0,0
3,24979164422,1,13.96,2018-11-30,91 days,0 days 00:00:00,0,0,0,0
4,2497B8B4FDA,2,33.94,2018-11-01,120 days,29 days 00:00:00,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days,64 days 12:00:00,0,0,0,0
1274083,2E6F8194908,1,11.96,2019-02-23,6 days,0 days 00:00:00,0,0,0,0
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days,0 days 00:00:00,0,0,0,0
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days,26 days 14:24:00,1,0,0,0


In [29]:
# Flag (1/0) customers with at least one purchase dated 1-15 February 2019.
# The row-level mask is computed once on the whole PURCHASE_DATE column and then
# reduced per customer with any(), instead of running a Python lambda for every
# MAGIC_KEY group; the resulting table is the same.
purchase_dates = df['PURCHASE_DATE']
in_window = (
    (purchase_dates.dt.year == 2019)
    & (purchase_dates.dt.month == 2)
    & (purchase_dates.dt.day <= 15)
)
february_2019_first_15 = (
    in_window.groupby(df['MAGIC_KEY'])
    .any()
    .astype('int64')
    .reset_index(name='2_2019_FIRST_15')
)

# Attach the flag to the customer table (matching rows on MAGIC_KEY) and display it.
# A bare expression in the middle of a cell is not rendered, so only the merged
# table is shown here.
df_customer = pd.merge(df_customer, february_2019_first_15, on='MAGIC_KEY')
df_customer

Unnamed: 0,MAGIC_KEY,PURCHASE_COUNT,TOTAL_AMOUNT_SPENT,LAST_PURCHASE_DATE,DAY_SINCE_LAST_PURCHASE,AVERAGE_DIFF,10_2018_FIRST_15,11_2018_FIRST_15,12_2018_FIRST_15,1_2019_FIRST_15,2_2019_FIRST_15
0,249670911D8,2,24.10,2019-02-02,27 days,93 days 00:00:00,0,1,0,0,1
1,249751FC4DD,1,17.98,2018-10-12,140 days,0 days 00:00:00,1,0,0,0,0
2,24978027606,1,15.96,2018-10-19,133 days,0 days 00:00:00,0,0,0,0,0
3,24979164422,1,13.96,2018-11-30,91 days,0 days 00:00:00,0,0,0,0,0
4,2497B8B4FDA,2,33.94,2018-11-01,120 days,29 days 00:00:00,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1274082,2E6F72C6F1C,3,51.94,2019-02-27,2 days,64 days 12:00:00,0,0,0,0,1
1274083,2E6F8194908,1,11.96,2019-02-23,6 days,0 days 00:00:00,0,0,0,0,0
1274084,2E6F9C7B9B4,1,19.98,2019-01-28,32 days,0 days 00:00:00,0,0,0,0,0
1274085,2E6FB0EBB32,6,107.88,2019-02-11,18 days,26 days 14:24:00,1,0,0,0,1


In [30]:
# Persist the per-customer feature table to CSV; the row index is not written.
output_path = "df_customer_all.csv"
df_customer.to_csv(output_path, index=False)