In [1]:
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the article table from the shared data folder
data_dir = "../data"
articles_csv = data_dir + '/articles.csv'
articles = pd.read_csv(articles_csv)

In [3]:
# Read the customer table from the shared data folder
data_dir = "../data"
customers_csv = data_dir + '/customers.csv'
customers = pd.read_csv(customers_csv)

In [4]:
# Read the training transactions from the shared data folder
data_dir = "../data"
transactions_csv = data_dir + '/transactions_train.csv'
transactions = pd.read_csv(transactions_csv)

In [5]:
###AGE###

# Replace NaN ages with the mean age, rounded down to a whole year.
# The missing count is taken before filling so it can be reported below.
missing_values = customers['age'].isna().sum()
mean_age = math.floor(customers['age'].mean())

# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: the inplace form operates on an intermediate object,
# triggers a FutureWarning in pandas 2.x and does not update `customers`
# once Copy-on-Write is enabled.
customers['age'] = customers['age'].fillna(mean_age)

# Report how many values were imputed and with which value
print(f"{missing_values} Missing Values Count in 'age' replaced with mean of {mean_age}")

15861 Missing Values Count in 'age' replaced with mean of 36


In [6]:
###Splitting t_dat into Year, Month, Day

# Parse the transaction date column into real datetimes
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Add one column per calendar component, in the order Day, Month, Year
purchase_date = transactions['t_dat'].dt
for column_name, component in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
    transactions[column_name] = getattr(purchase_date, component)

transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Day,Month,Year
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,20,9,2018
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,20,9,2018
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,20,9,2018
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,20,9,2018
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,20,9,2018


In [7]:
# Ensure 't_dat' is in datetime format (no effect when it already is)
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Total number of purchases per customer (one row per customer_id)
total_purchases_per_customer = transactions.groupby('customer_id').size().reset_index(name='total_purchases')

# Broadcast the per-customer count onto every transaction row by looking up
# customer_id. Writing the column directly (instead of merging) keeps this
# cell idempotent: re-running it overwrites 'total_purchases' rather than
# producing 'total_purchases_x' / 'total_purchases_y' from a second merge.
transactions['total_purchases'] = transactions['customer_id'].map(
    total_purchases_per_customer.set_index('customer_id')['total_purchases']
)

# Display the DataFrame
print(transactions.head())


       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
3 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
4 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   

      price  sales_channel_id  Day  Month  Year  total_purchases  
0  0.050831                 2   20      9  2018               18  
1  0.030492                 2   20      9  2018               18  
2  0.015237                 2   20      9  2018              120  
3  0.016932                 2   20      9  2018              120  
4  0.016932                 2   20      9  2018              120  


In [8]:
# Reference date for recency: the day after the last transaction in the data,
# so a customer who bought on the final day has a recency of 1
now = transactions['t_dat'].max() + pd.Timedelta(days=1)

# Most recent purchase date for each customer (one row per customer_id)
most_recent_purchase_per_customer = transactions.groupby('customer_id')['t_dat'].max().reset_index(name='most_recent_purchase')

# Recency = whole days between the reference date and the last purchase
most_recent_purchase_per_customer['recency'] = (now - most_recent_purchase_per_customer['most_recent_purchase']).dt.days

# Broadcast both per-customer values onto every transaction row by looking up
# customer_id. Assigning the columns directly (instead of merging) keeps this
# cell idempotent: a re-run overwrites them rather than creating
# '_x' / '_y' suffixed duplicates.
per_customer = most_recent_purchase_per_customer.set_index('customer_id')
for column in ('most_recent_purchase', 'recency'):
    transactions[column] = transactions['customer_id'].map(per_customer[column])

print(transactions.head())

       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
3 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
4 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   

      price  sales_channel_id  Day  Month  Year  total_purchases  \
0  0.050831                 2   20      9  2018               18   
1  0.030492                 2   20      9  2018               18   
2  0.015237                 2   20      9  2018              120   
3  0.016932                 2   20      9  2018              120   
4  0.016932                 2   20      9  2018              120   

  most_recent_purchase  recency  
0           2020-09-15        8  
1           2020-09-15        8  
2           20

In [11]:
# Parse the purchase dates (no effect when 't_dat' is already datetime)
purchase_dates = pd.to_datetime(transactions['t_dat'])

# Start (Sunday) of the calendar week each purchase falls in.
# dayofweek is Monday=0 ... Sunday=6, so (dayofweek + 1) % 7 is the number of
# days elapsed since the most recent Sunday.
days_since_sunday = (purchase_dates.dt.dayofweek + 1) % 7
week_start = purchase_dates.dt.normalize() - pd.to_timedelta(days_since_sunday, unit='D')

# Custom week number: whole weeks elapsed since the first week in the data,
# starting from 0. Deriving it from the week start (rather than from
# '%Y-%U' labels) keeps the week that spans 1 January as a single week;
# '%Y-%U' restarts at week 00 on 1 January and would split it in two.
# NOTE(review): a calendar week with no transactions now leaves a gap in the
# numbering instead of being skipped - confirm this is what consumers expect.
week_number = (week_start - week_start.min()).dt.days // 7

# Build a new frame with the extra column. pd.DataFrame(transactions) would not
# copy the data, so it is not an independent working copy of `transactions`.
transactions2 = transactions.assign(t_dat=purchase_dates, week=week_number)

In [12]:
# Preview the first five rows, including the new 'week' column
transactions2.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,20,9,2018,18,2020-09-15,8,0
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,20,9,2018,18,2020-09-15,8,0
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,20,9,2018,120,2020-05-13,133,0
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,20,9,2018,120,2020-05-13,133,0
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,20,9,2018,120,2020-05-13,133,0


In [10]:
# Write the cleaned customers and the enriched transactions to CSV,
# leaving out the index column
for frame, csv_name in (
    (customers, "customer_e.csv"),
    (transactions2, "transactions_train_e.csv"),
):
    frame.to_csv(csv_name, index=False)

In [15]:
# Read the CSV file
trans = pd.read_csv('transactions_train_e.csv')
cust = pd.read_csv('customer_e.csv')

# Save DataFrame to a Parquet file
trans.to_parquet('transactions_train.parquet', engine='pyarrow')
cust.to_parquet('customers.parquet', engine='pyarrow')

In [16]:
# Store the article table as Parquet as well
articles.to_parquet(path='articles.parquet', engine='pyarrow')