In [1]:
import os
#import jovian
import matplotlib
import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [2]:
df_boxes = pd.read_csv('boxes.csv') #importing box details datasets
df_problem1 = pd.read_csv('problem 3.csv') #importing predict datasets
df_purchase = pd.read_csv('purchase.csv') #importing history datasets

In [3]:
merged_df = df_boxes.merge(df_purchase, on='BOX_ID')

In [4]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2455800 entries, 0 to 2455799
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   BOX_ID           int64  
 1   QUALITY          object 
 2   DELIVERY_OPTION  object 
 3   MILK             float64
 4   MEAT             float64
 5   UNIT_PRICE       float64
 6   PURCHASE_DATE    object 
 7   MAGIC_KEY        object 
 8   BOX_COUNT        float64
dtypes: float64(4), int64(1), object(4)
memory usage: 168.6+ MB


In [5]:
total_unique_box_id = merged_df['BOX_ID'].nunique()

print("Total unique BOX_ID:", total_unique_box_id)

Total unique BOX_ID: 290


In [6]:
total_unique_magic_key = merged_df['MAGIC_KEY'].nunique()

print("Total unique BOX_ID:", total_unique_magic_key)

Total unique BOX_ID: 1274087


## data cleaning

In [7]:
round(df_purchase.describe())

Unnamed: 0,BOX_ID,BOX_COUNT
count,2455817.0,2455817.0
mean,231.0,1.0
std,29233.0,0.0
min,1.0,-1.0
25%,106.0,1.0
50%,143.0,1.0
75%,215.0,1.0
max,11111111.0,19.0


In [8]:
round(merged_df.describe())

Unnamed: 0,BOX_ID,MILK,MEAT,UNIT_PRICE,BOX_COUNT
count,2455800.0,2455800.0,2455800.0,2455800.0,2455800.0
mean,154.0,8.0,2.0,18.0,1.0
std,70.0,6.0,1.0,4.0,0.0
min,1.0,0.0,0.0,6.0,1.0
25%,106.0,0.0,2.0,14.0,1.0
50%,143.0,10.0,2.0,18.0,1.0
75%,215.0,11.0,2.0,20.0,1.0
max,290.0,24.0,6.0,24.0,19.0


In [9]:
merged_df.duplicated().sum()

77

In [10]:
# drop duplicates rows
merged_df= merged_df.drop_duplicates()

In [11]:
from scipy import stats

z_scores = np.abs(stats.zscore(df_purchase['BOX_COUNT']))
threshold = 3
outliers = np.where(z_scores > threshold)[0]
print("Outliers detected using Z-score method:", outliers)

In [12]:
merged_df.head()

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE,PURCHASE_DATE,MAGIC_KEY,BOX_COUNT
0,1,Premium,Home Delivery - CoD,0.0,2.7,9.96,4/2/2019,2C88D36D1FC,1.0
1,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2/1/2019,2BF011BDB38,1.0
2,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2/1/2019,2CA0EE8F2B3,1.0
3,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2/1/2019,2C623730B09,1.0
4,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2/1/2019,2CA6CE1054F,1.0


In [13]:
merged_df['PURCHASE_DATE'] = pd.to_datetime(merged_df['PURCHASE_DATE'], format="%d/%m/%Y")

In [14]:
#merged_df.PURCHASE_DATE.min(), merged_df.PURCHASE_DATE.max()

(Timestamp('2018-10-01 00:00:00'), Timestamp('2019-02-28 00:00:00'))

In [33]:
merged_df.PURCHASE_DATE.info()

<class 'pandas.core.series.Series'>
RangeIndex: 2455723 entries, 0 to 2455722
Series name: PURCHASE_DATE
Non-Null Count    Dtype         
--------------    -----         
2455723 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 18.7 MB


# feature engineering

In [15]:
# Group the data by MAGIC_KEY and calculate the sum of MEAT purchased by each Magic Key
magic_key_meat = merged_df.groupby('MAGIC_KEY')['MEAT'].sum().reset_index()

# Count the frequency of purchases made by each Magic Key
magic_key_purchase_frequency = merged_df.groupby('MAGIC_KEY').size().reset_index(name='PURCHASE_FREEQUENCY')

# Merge the two DataFrames on MAGIC_KEY
magic_key_data = pd.merge(magic_key_meat, magic_key_purchase_frequency, on='MAGIC_KEY')

# Display the resulting DataFrame
print(magic_key_data)

           MAGIC_KEY  MEAT  PURCHASE_FREEQUENCY
0        249670911D8   2.4                    2
1        249751FC4DD   1.8                    1
2        24978027606   2.9                    1
3        24979164422   2.5                    1
4        2497B8B4FDA   5.4                    2
...              ...   ...                  ...
1274082  2E6F72C6F1C   4.8                    3
1274083  2E6F8194908   2.4                    1
1274084  2E6F9C7B9B4   2.2                    1
1274085  2E6FB0EBB32  12.8                    6
1274086  2E6FBE224FA   2.7                    1

[1274087 rows x 3 columns]


In [16]:
merged_df

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE,PURCHASE_DATE,MAGIC_KEY,BOX_COUNT
0,1,Premium,Home Delivery - CoD,0.0,2.7,9.96,2019-02-04,2C88D36D1FC,1.0
1,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2BF011BDB38,1.0
2,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2CA0EE8F2B3,1.0
3,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2C623730B09,1.0
4,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2CA6CE1054F,1.0
...,...,...,...,...,...,...,...,...,...
2455795,289,Standard,Home Delivery - Digital Payment,0.0,4.7,15.96,2019-02-18,29149CE828C,1.0
2455796,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-02-20,28FF7857D43,1.0
2455797,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-01-05,28F675D813E,1.0
2455798,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-01-11,29D99F43873,1.0


## Let's also parse the date column

In [17]:
# Calculate the last purchase date for each Magic Key
#merged_df['PURCHASE_DATE'] = pd.to_datetime(merged_df['PURCHASE_DATE']), format='%d/%m/%Y')
last_purchase_date = merged_df.groupby('MAGIC_KEY')['PURCHASE_DATE'].max().reset_index()
last_purchase_date.columns = ['MAGIC_KEY', 'LAST_PURCHASE_DATE']

# Merge the last purchase date DataFrame with the merged_df DataFrame
merged_df = pd.merge(merged_df, last_purchase_date, on='MAGIC_KEY', how='left')

# Convert PURCHASE_DATE and LAST_PURCHASE_DATE to datetime objects

merged_df['LAST_PURCHASE_DATE'] = pd.to_datetime(merged_df['LAST_PURCHASE_DATE'])

# Calculate the difference in days between the purchase date and the last purchase date
merged_df['DAYS_FROM_LAST_PURCHASE'] = (merged_df['PURCHASE_DATE'] - merged_df['LAST_PURCHASE_DATE']).dt.days

# Ensure that the difference is always positive
merged_df['DAYS_FROM_LAST_PURCHASE'] = merged_df['DAYS_FROM_LAST_PURCHASE'].abs()


## extracting Day, Month, Year from the dataset and making new columns of them

In [18]:
merged_df['DAY'] = merged_df['PURCHASE_DATE'].dt.day
merged_df['MONTH'] = merged_df['PURCHASE_DATE'].dt.month
merged_df['YEAR'] = merged_df['PURCHASE_DATE'].dt.year

## adding month wise purchase 

In [19]:
#---------------------------------------------------OCOTBER------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in October 2018
october_2018_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2018) & 
                             (merged_df['PURCHASE_DATE'].dt.month == 10)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of October 2018
october_2018_first_15_days_df = october_2018_df[october_2018_df['PURCHASE_DATE'].dt.day <= 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
october_2018_first_15_days_purchases = october_2018_first_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "10_FIRST_PURCHASE_2018" based on whether a purchase is made by each Magic Key
merged_df['10_FIRST_PURCHASE_2018'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(october_2018_first_15_days_purchases.index), '10_FIRST_PURCHASE_2018'] = 1


#---------------------------------------------------------------------------------------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in October 2018
october_2018_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2018) & 
                             (merged_df['PURCHASE_DATE'].dt.month == 10)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of October 2018
october_2018_last_15_days_df = october_2018_df[october_2018_df['PURCHASE_DATE'].dt.day > 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
october_2018_last_15_days_purchases = october_2018_last_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "10_FIRST_PURCHASE_2018" based on whether a purchase is made by each Magic Key
merged_df['10_LAST_PURCHASE_2018'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(october_2018_last_15_days_purchases.index), '10_LAST_PURCHASE_2018'] = 1


#_____________________________________________________________ NOVEMBER------------------------------

# Filter the DataFrame to include only the rows with purchase dates in November 2018
november_2018_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2018) & 
                             (merged_df['PURCHASE_DATE'].dt.month == 11)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of November 2018
november_2018_first_15_days_df = november_2018_df[november_2018_df['PURCHASE_DATE'].dt.day <= 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
november_2018_first_15_days_purchases = november_2018_first_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "11_FIRST_PURCHASE_2018" based on whether a purchase is made by each Magic Key
merged_df['11_FIRST_PURCHASE_2018'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(november_2018_first_15_days_purchases.index), '11_FIRST_PURCHASE_2018'] = 1


#---------------------------------------------------------------------------------------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in November 2018
november_2018_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2018) & 
                             (merged_df['PURCHASE_DATE'].dt.month == 11)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of November 2018
november_2018_first_15_days_df = november_2018_df[november_2018_df['PURCHASE_DATE'].dt.day > 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
november_2018_last_15_days_purchases = november_2018_first_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "11_FIRST_PURCHASE_2018" based on whether a purchase is made by each Magic Key
merged_df['11_LAST_PURCHASE_2018'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(november_2018_first_15_days_purchases.index), '11_LAST_PURCHASE_2018'] = 1


#------------------------------------------------------- DECEMBER---------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in December 2018
december_2018_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2018) & 
                             (merged_df['PURCHASE_DATE'].dt.month == 11)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of december 2018
december_2018_first_15_days_df = december_2018_df[december_2018_df['PURCHASE_DATE'].dt.day <= 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
december_2018_first_15_days_purchases = december_2018_first_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "11_FIRST_PURCHASE_2018" based on whether a purchase is made by each Magic Key
merged_df['12_FIRST_PURCHASE_2018'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(december_2018_first_15_days_purchases.index), '12_FIRST_PURCHASE_2018'] = 1

#---------------------------------------------------------------------------------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in December 2018
december_2018_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2018) & 
                             (merged_df['PURCHASE_DATE'].dt.month == 11)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of december 2018
december_2018_last_15_days_df = december_2018_df[december_2018_df['PURCHASE_DATE'].dt.day > 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
december_2018_last_15_days_purchases = december_2018_last_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "11_FIRST_PURCHASE_2018" based on whether a purchase is made by each Magic Key
merged_df['12_LAST_PURCHASE_2018'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(december_2018_last_15_days_purchases.index), '12_LAST_PURCHASE_2018'] = 1



#---------------------------------------------------JANUARY 2019------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in January 2019
january_2019_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2019) & 
                          (merged_df['PURCHASE_DATE'].dt.month == 1)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of January 2019
january_2019_first_15_days_df = january_2019_df[january_2019_df['PURCHASE_DATE'].dt.day <= 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
january_2019_first_15_days_purchases = january_2019_first_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "01_FIRST_PURCHASE_2019" based on whether a purchase is made by each Magic Key
merged_df['01_FIRST_PURCHASE_2019'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(january_2019_first_15_days_purchases.index), '01_FIRST_PURCHASE_2019'] = 1


#---------------------------------------------------------------------------------------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in January 2019
january_2019_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2019) & 
                          (merged_df['PURCHASE_DATE'].dt.month == 1)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of January 2019
january_2019_last_15_days_df = january_2019_df[january_2019_df['PURCHASE_DATE'].dt.day > 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
january_2019_last_15_days_purchases = january_2019_last_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "01_LAST_PURCHASE_2019" based on whether a purchase is made by each Magic Key
merged_df['01_LAST_PURCHASE_2019'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(january_2019_last_15_days_purchases.index), '01_LAST_PURCHASE_2019'] = 1


#_____________________________________________________________ FEBRUARY 2019----------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in February 2019
february_2019_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2019) & 
                            (merged_df['PURCHASE_DATE'].dt.month == 2)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of February 2019
february_2019_first_15_days_df = february_2019_df[february_2019_df['PURCHASE_DATE'].dt.day <= 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
february_2019_first_15_days_purchases = february_2019_first_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "02_FIRST_PURCHASE_2019" based on whether a purchase is made by each Magic Key
merged_df['02_FIRST_PURCHASE_2019'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(february_2019_first_15_days_purchases.index), '02_FIRST_PURCHASE_2019'] = 1

#---------------------------------------------------------------------------------------------------------------------

# Filter the DataFrame to include only the rows with purchase dates in February 2019
february_2019_df = merged_df[(merged_df['PURCHASE_DATE'].dt.year == 2019) & 
                            (merged_df['PURCHASE_DATE'].dt.month == 2)]

# Further filter the DataFrame to include only the rows with purchase dates within the first 15 days of February 2019
february_2019_last_15_days_df = february_2019_df[february_2019_df['PURCHASE_DATE'].dt.day > 15]

# Group the filtered DataFrame by "MAGIC_KEY" and check if any purchase of MILK or MEAT is made by each Magic Key
february_2019_last_15_days_purchases = february_2019_last_15_days_df.groupby('MAGIC_KEY')[['MILK', 'MEAT']].sum()

# Create the new column "02_FIRST_PURCHASE_2019" based on whether a purchase is made by each Magic Key
merged_df['02_LAST_PURCHASE_2019'] = 0
merged_df.loc[merged_df['MAGIC_KEY'].isin(february_2019_last_15_days_purchases.index), '02_LAST_PURCHASE_2019'] = 1


In [20]:
merged_df

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE,PURCHASE_DATE,MAGIC_KEY,BOX_COUNT,LAST_PURCHASE_DATE,...,10_FIRST_PURCHASE_2018,10_LAST_PURCHASE_2018,11_FIRST_PURCHASE_2018,11_LAST_PURCHASE_2018,12_FIRST_PURCHASE_2018,12_LAST_PURCHASE_2018,01_FIRST_PURCHASE_2019,01_LAST_PURCHASE_2019,02_FIRST_PURCHASE_2019,02_LAST_PURCHASE_2019
0,1,Premium,Home Delivery - CoD,0.0,2.7,9.96,2019-02-04,2C88D36D1FC,1.0,2019-02-04,...,0,0,0,0,0,0,0,0,1,0
1,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2BF011BDB38,1.0,2019-02-04,...,0,0,0,0,0,0,1,0,1,0
2,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2CA0EE8F2B3,1.0,2019-01-02,...,0,0,0,0,0,0,1,0,0,0
3,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2C623730B09,1.0,2019-02-02,...,0,0,0,0,0,0,1,0,1,0
4,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2CA6CE1054F,1.0,2019-01-02,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455718,289,Standard,Home Delivery - Digital Payment,0.0,4.7,15.96,2019-02-18,29149CE828C,1.0,2019-02-19,...,1,1,0,0,0,0,0,0,1,1
2455719,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-02-20,28FF7857D43,1.0,2019-02-20,...,0,0,0,0,0,0,0,0,0,1
2455720,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-01-05,28F675D813E,1.0,2019-01-05,...,0,0,0,0,0,0,1,0,0,0
2455721,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-01-11,29D99F43873,1.0,2019-02-26,...,0,1,1,1,1,1,1,0,1,1


# Adding extra columns 

In [21]:
# Calculate the sum of MEAT purchased by each Magic Key
merged_df['TOTAL_MEAT'] = merged_df.groupby('MAGIC_KEY')['MEAT'].transform('sum')

# Calculate the sum of MEAT purchased by each Magic Key
merged_df['TOTAL_MILK'] = merged_df.groupby('MAGIC_KEY')['MILK'].transform('sum')


# Calculate the purchase frequency for each Magic Key
merged_df['MEAT_PURCHASE_FREQUENCY'] = merged_df.groupby(['MAGIC_KEY', 'MEAT'])['MEAT'].transform('count')

# Calculate the purchase frequency for each Magic Key
merged_df['MILK_PURCHASE_FREQUENCY'] = merged_df.groupby(['MAGIC_KEY', 'MILK'])['MILK'].transform('count')

# Calculate the sum of BOX_COUNT purchased by each Magic Key
merged_df['TOTAL_BOX_COUNT'] = merged_df.groupby('MAGIC_KEY')['BOX_COUNT'].transform('sum')

# Calculate the total amount spent by each Magic Key holder and assign it directly to the 'TOTAL_AMOUNT' column
merged_df['TOTAL_AMOUNT'] = merged_df.groupby('MAGIC_KEY')['BOX_COUNT'].transform('sum') * merged_df['UNIT_PRICE']



In [22]:
merged_df

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE,PURCHASE_DATE,MAGIC_KEY,BOX_COUNT,LAST_PURCHASE_DATE,...,01_FIRST_PURCHASE_2019,01_LAST_PURCHASE_2019,02_FIRST_PURCHASE_2019,02_LAST_PURCHASE_2019,TOTAL_MEAT,TOTAL_MILK,MEAT_PURCHASE_FREQUENCY,MILK_PURCHASE_FREQUENCY,TOTAL_BOX_COUNT,TOTAL_AMOUNT
0,1,Premium,Home Delivery - CoD,0.0,2.7,9.96,2019-02-04,2C88D36D1FC,1.0,2019-02-04,...,0,0,1,0,2.7,0.0,1,1,1.0,9.96
1,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2BF011BDB38,1.0,2019-02-04,...,1,0,1,0,4.8,0.0,1,2,2.0,23.92
2,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2CA0EE8F2B3,1.0,2019-01-02,...,1,0,0,0,2.3,0.0,1,1,1.0,11.96
3,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2C623730B09,1.0,2019-02-02,...,1,0,1,0,4.7,0.0,1,2,2.0,23.92
4,2,Premium,Home Delivery - CoD,0.0,2.3,11.96,2019-01-02,2CA6CE1054F,1.0,2019-01-02,...,1,0,0,0,2.3,0.0,1,1,1.0,11.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455718,289,Standard,Home Delivery - Digital Payment,0.0,4.7,15.96,2019-02-18,29149CE828C,1.0,2019-02-19,...,0,0,1,1,33.9,60.0,1,8,16.0,255.36
2455719,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-02-20,28FF7857D43,1.0,2019-02-20,...,0,0,0,1,1.8,12.0,1,1,1.0,19.98
2455720,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-01-05,28F675D813E,1.0,2019-01-05,...,1,0,0,0,1.8,12.0,1,1,1.0,19.98
2455721,290,Standard,Home Delivery - Digital Payment,12.0,1.8,19.98,2019-01-11,29D99F43873,1.0,2019-02-26,...,1,0,1,1,29.9,12.0,1,1,9.0,179.82


In [23]:
#merged_df.PURCHASE_DATE.min(), merged_df.PURCHASE_DATE.max()

In [24]:
#merged_df.BOX_COUNT.value_counts()

In [25]:
#temp_df = merged_df.copy()

In [28]:
# Assuming the Magic Key you want to search for is '290D33249B7'
merged_df.loc[merged_df['MAGIC_KEY'] == '290D33249B7']

# Display the specific row



Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE,PURCHASE_DATE,MAGIC_KEY,BOX_COUNT,LAST_PURCHASE_DATE,...,01_FIRST_PURCHASE_2019,01_LAST_PURCHASE_2019,02_FIRST_PURCHASE_2019,02_LAST_PURCHASE_2019,TOTAL_MEAT,TOTAL_MILK,MEAT_PURCHASE_FREQUENCY,MILK_PURCHASE_FREQUENCY,TOTAL_BOX_COUNT,TOTAL_AMOUNT


In [34]:
merged_df.PURCHASE_DATE.info()

<class 'pandas.core.series.Series'>
RangeIndex: 2455723 entries, 0 to 2455722
Series name: PURCHASE_DATE
Non-Null Count    Dtype         
--------------    -----         
2455723 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 18.7 MB


In [29]:
merged_df.to_csv('merged_data.csv', index=False)