In [None]:
import pandas as pd
import numpy as np

# Read the three raw CSV files from ../data into DataFrames

customers_data, products_data, sales_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/customers.csv",
        "../data/products.csv",
        "../data/sales.csv",
    )
)


In [None]:
# Display basic information about the datasets, starting with customers_data:
# info() prints each column's dtype and non-null count.
customers_data.info()

In [None]:
# Structural summary of products_data: column dtypes and non-null counts
products_data.info()

In [None]:
# Structural summary of sales_data: column dtypes and non-null counts
sales_data.info()

In [None]:
# Check for duplicates in customers_data:
# duplicated() flags rows that repeat an earlier row in every column; sum() counts them.
customers_data.duplicated().sum()


In [None]:
# Check for duplicates in products_data (count of fully repeated rows)
products_data.duplicated().sum()


In [None]:
# Check for duplicates in sales_data (count of fully repeated rows)
sales_data.duplicated().sum()

In [None]:
# Check for missing values in customers_data: number of missing entries per column
customers_data.isna().sum()

In [None]:
# Check for missing values in products_data: number of missing entries per column
products_data.isna().sum()

In [None]:
# Check for missing values in sales_data: number of missing entries per column
sales_data.isna().sum()

In [None]:
# Parse the date-like columns into pandas datetimes so later cells can do
# date arithmetic on them (the two conversions are independent of each other).
sales_data['timestamp'] = sales_data['timestamp'].pipe(pd.to_datetime)
customers_data['signup_date'] = customers_data['signup_date'].pipe(pd.to_datetime)

In [None]:
# Standardize text data: title-case every value and trim surrounding whitespace
def _tidy_text(series):
    """Return `series` title-cased with leading/trailing whitespace removed."""
    return series.str.title().str.strip()


customers_data['location'] = _tidy_text(customers_data['location'])
customers_data['gender'] = _tidy_text(customers_data['gender'])
products_data['category'] = _tidy_text(products_data['category'])
products_data['supplier'] = _tidy_text(products_data['supplier'])

In [None]:
# Merging datasets
# No `on=` is passed, so each merge joins on every column name the two frames
# have in common.
# NOTE(review): presumably only the product / customer id columns are shared — confirm,
# since any other shared column name would silently become part of the join key.
merged_data = customers_data.merge(
    sales_data.merge(products_data, how="left"),  # every sale, enriched with its product row
    how="left",  # every customer is kept, with or without matching sales
)
merged_data

In [None]:
def check_outliers(col, data=None, whisker=1.5):
    """Return the rows whose value in `col` falls outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``data[col]``.

    Parameters
    ----------
    col : str
        Name of the numeric column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``merged_data`` so
        existing calls such as ``check_outliers('price')`` keep working.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of `data` (original index preserved) lying strictly below
        the lower fence or strictly above the upper fence. Rows where `col`
        is missing are never reported, because comparisons with NaN are False.
    """
    if data is None:
        # Looked up at call time, so the function follows whatever the
        # notebook currently binds to `merged_data`.
        data = merged_data

    values = data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr

    return data[(values < lower) | (values > upper)]

In [None]:
# Rows of merged_data whose 'price' lies outside the IQR-based fences
check_outliers('price')

In [None]:
# Rows of merged_data whose 'quantity' lies outside the IQR-based fences
check_outliers('quantity')

In [None]:
# Calculate revenue per row as price * quantity.
# A row where either value is missing (e.g. a customer kept by the left merge
# without any matching sale) gets NaN revenue.
merged_data['revenue'] = merged_data['price'] * merged_data['quantity']


In [None]:
# Calculate customer lifetime value: total revenue per customer_id, broadcast
# back onto every row of that customer by transform (the row count is unchanged).
# sum skips NaN revenues, so a customer with no revenue at all ends up with 0, not NaN.
merged_data['customer_lifetime_value'] = merged_data.groupby('customer_id')['revenue'].transform('sum')


In [None]:
# Calculate days since last purchase
# Latest purchase timestamp per customer (NaT when a customer has no timestamps at all).
last_purchase = merged_data.groupby('customer_id')['timestamp'].max()

# Reference point: the most recent purchase in the whole dataset.
reference = merged_data['timestamp'].max()

# Look up each row's customer in `last_purchase`. Assigning the column directly
# (rather than join(..., rsuffix='_last') and rebinding merged_data) keeps this
# cell idempotent: a re-run overwrites 'timestamp_last' instead of colliding
# with the copy added by the previous run.
merged_data['timestamp_last'] = merged_data['customer_id'].map(last_purchase)
merged_data['days_since_last_purchase'] = (reference - merged_data['timestamp_last']).dt.days



In [None]:
# Preview the first rows to sanity-check the derived columns
merged_data.head()

In [None]:
# Export cleaned data; index=False leaves the DataFrame's row index out of the CSV
merged_data.to_csv('../data/cleaned_data.csv', index=False)
