In [1]:
import pandas as pd
import numpy as np

# Read the three raw CSV tables (relative paths) into DataFrames in one step
customers_data, products_data, sales_data = map(
    pd.read_csv,
    ("../data/customers.csv", "../data/products.csv", "../data/sales.csv"),
)


In [2]:
# Print the column names, non-null counts, dtypes and memory usage of customers_data
customers_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  500 non-null    int64 
 1   location     500 non-null    object
 2   gender       500 non-null    object
 3   signup_date  500 non-null    object
dtypes: int64(1), object(3)
memory usage: 15.8+ KB


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of products_data
products_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  100 non-null    int64 
 1   name        100 non-null    object
 2   category    100 non-null    object
 3   supplier    100 non-null    object
 4   price       100 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 4.0+ KB


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of sales_data
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sale_id      5000 non-null   int64 
 1   product_id   5000 non-null   int64 
 2   customer_id  5000 non-null   int64 
 3   timestamp    5000 non-null   object
 4   quantity     5000 non-null   int64 
 5   price        5000 non-null   int64 
 6   revenue      5000 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 273.6+ KB


In [5]:
# Count rows in customers_data that exactly repeat an earlier row (all columns compared)
customers_data.duplicated().sum()


np.int64(0)

In [6]:
# Count rows in products_data that exactly repeat an earlier row (all columns compared)
products_data.duplicated().sum()


np.int64(0)

In [7]:
# Count rows in sales_data that exactly repeat an earlier row (all columns compared)
sales_data.duplicated().sum()

np.int64(0)

In [8]:
# Count missing values in each column of customers_data
customers_data.isna().sum()

customer_id    0
location       0
gender         0
signup_date    0
dtype: int64

In [9]:
# Count missing values in each column of products_data
products_data.isna().sum()

product_id    0
name          0
category      0
supplier      0
price         0
dtype: int64

In [10]:
# Count missing values in each column of sales_data
sales_data.isna().sum()

sale_id        0
product_id     0
customer_id    0
timestamp      0
quantity       0
price          0
revenue        0
dtype: int64

In [11]:
# Parse the date strings (loaded as object dtype) into pandas datetimes
customers_data['signup_date'] = customers_data['signup_date'].pipe(pd.to_datetime)
sales_data['timestamp'] = sales_data['timestamp'].pipe(pd.to_datetime)

In [12]:
def _title_case_stripped(text_column):
    """Title-case each string in the column, then trim leading/trailing whitespace."""
    return text_column.str.title().str.strip()


# Normalise the text columns so casing/whitespace variants of the same value match
customers_data['location'] = _title_case_stripped(customers_data['location'])
customers_data['gender'] = _title_case_stripped(customers_data['gender'])
products_data['category'] = _title_case_stripped(products_data['category'])


In [13]:
# Merging datasets
# Join keys are spelled out instead of relying on pandas' default of "every
# column the two frames share". sales_data and products_data share both
# product_id and price, so those are the keys the implicit join was using.
# NOTE(review): if a sale's price can differ from the catalogue price, keying on
# price leaves name/category/supplier empty for that sale — confirm this is intended.
# `validate` makes the merge raise if a lookup key is duplicated, which would
# otherwise silently multiply sales rows.
merged_data = pd.merge(
    sales_data,
    products_data,
    how="left",
    on=["product_id", "price"],
    validate="many_to_one",
)
# customers_data is the left table: every customer is kept (with empty sale
# columns if they have no sales) and sales whose customer_id is not in
# customers_data are dropped.
merged_data = pd.merge(
    customers_data,
    merged_data,
    how="left",
    on="customer_id",
    validate="one_to_many",
)
merged_data

Unnamed: 0,customer_id,location,gender,signup_date,sale_id,product_id,timestamp,quantity,price,revenue,name,category,supplier
0,1,Ibadan,Male,2020-12-25,164,56,2023-09-08,4,8313,33252,Product_56,Groceries,Supplier A
1,1,Ibadan,Male,2020-12-25,424,28,2022-11-19,4,27160,108640,Product_28,Groceries,Supplier D
2,1,Ibadan,Male,2020-12-25,498,44,2021-11-30,2,24321,48642,Product_44,Books,Supplier B
3,1,Ibadan,Male,2020-12-25,586,41,2021-11-22,4,11883,47532,Product_41,Groceries,Supplier D
4,1,Ibadan,Male,2020-12-25,841,68,2021-10-08,1,14788,14788,Product_68,Electronics,Supplier A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,500,Ibadan,Female,2022-03-24,354,80,2023-01-08,1,13761,13761,Product_80,Electronics,Supplier A
4996,500,Ibadan,Female,2022-03-24,3618,82,2021-01-05,2,12222,24444,Product_82,Home,Supplier A
4997,500,Ibadan,Female,2022-03-24,4163,56,2021-08-15,2,8313,16626,Product_56,Groceries,Supplier A
4998,500,Ibadan,Female,2022-03-24,4408,97,2021-03-27,4,13357,53428,Product_97,Toys,Supplier D


In [14]:
# Recompute revenue as price x quantity; this overwrites the revenue column carried over from sales_data
merged_data['revenue'] = merged_data['price'].mul(merged_data['quantity'])


In [15]:
# Customer lifetime value: each customer's total revenue, repeated on every one of their rows
merged_data['customer_lifetime_value'] = (
    merged_data['revenue']
    .groupby(merged_data['customer_id'])
    .transform('sum')
)


In [16]:
# Calculate days since last purchase
# Most recent sale timestamp per customer (Series indexed by customer_id)
last_purchase = merged_data.groupby('customer_id')['timestamp'].max()

# Recency is measured against the latest sale in the data, not today's date
reference = merged_data['timestamp'].max()

# Look up each row's customer in last_purchase. Plain column assignment (rather
# than a join with a suffix) keeps this cell safe to re-run: it overwrites
# timestamp_last instead of adding a duplicate column.
merged_data['timestamp_last'] = merged_data['customer_id'].map(last_purchase)
merged_data['days_since_last_purchase'] = (reference - merged_data['timestamp_last']).dt.days



In [17]:
# Preview the first five rows, including the newly derived columns
merged_data.head()

Unnamed: 0,customer_id,location,gender,signup_date,sale_id,product_id,timestamp,quantity,price,revenue,name,category,supplier,customer_lifetime_value,timestamp_last,days_since_last_purchase
0,1,Ibadan,Male,2020-12-25,164,56,2023-09-08,4,8313,33252,Product_56,Groceries,Supplier A,692985,2023-09-08,114
1,1,Ibadan,Male,2020-12-25,424,28,2022-11-19,4,27160,108640,Product_28,Groceries,Supplier D,692985,2023-09-08,114
2,1,Ibadan,Male,2020-12-25,498,44,2021-11-30,2,24321,48642,Product_44,Books,Supplier B,692985,2023-09-08,114
3,1,Ibadan,Male,2020-12-25,586,41,2021-11-22,4,11883,47532,Product_41,Groceries,Supplier D,692985,2023-09-08,114
4,1,Ibadan,Male,2020-12-25,841,68,2021-10-08,1,14788,14788,Product_68,Electronics,Supplier A,692985,2023-09-08,114


In [18]:
# Export cleaned data
# index=False leaves the DataFrame's row index out of the CSV
merged_data.to_csv('../data/cleaned_data.csv', index=False)
