In [None]:
#imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display preferences (optional; analysis results do not depend on them)
pd.options.display.max_columns = None  # never truncate columns when a frame is rendered
sns.set(style="whitegrid")


In [None]:
# Read the raw workbook into `df` and preview its first rows
raw_data_path = "../data/online_retail.xlsx"
df = pd.read_excel(raw_data_path)
df.head()


In [None]:
# Shape and columns
# A notebook cell only displays its *last* bare expression, so the shape has
# to be printed explicitly; otherwise it is computed and silently discarded.
print("Shape (rows, columns):", df.shape)
df.columns


In [None]:
# Structural overview: info() prints its own report as a side effect
df.info()
# Summary statistics are kept in a name and shown as the cell's result
summary_stats = df.describe()
summary_stats

In [None]:
# Keep only the rows that carry a CustomerID
has_customer_id = df['CustomerID'].notna()
df = df.loc[has_customer_id]
print("After dropping missing CustomerIDs:", df.shape)

In [None]:
# Canceled orders: invoice numbers whose text form begins with 'C' are filtered out
invoice_codes = df['InvoiceNo'].astype(str)
is_cancellation = invoice_codes.str.startswith('C')
df = df.loc[~is_cancellation]
print("After removing canceled transactions:", df.shape)


In [None]:
# Keep strictly positive quantities (this drops zero as well as negative values)
positive_quantity = df['Quantity'].gt(0)
df = df.loc[positive_quantity]
print("After removing negative quantities:", df.shape)

In [None]:
# A unit price is treated as valid only when it is strictly greater than zero
valid_price = df['UnitPrice'].gt(0)
df = df.loc[valid_price]
print("After removing invalid unit prices:", df.shape)

In [None]:
# Discard rows that are exact repeats of an earlier row (first occurrence is kept)
is_repeat = df.duplicated()
df = df.loc[~is_repeat]
print("After dropping duplicates:", df.shape)

In [None]:
# Feature engineering: TotalPrice = Quantity x UnitPrice for each row
line_revenue = df['Quantity'].mul(df['UnitPrice'])
df = df.assign(TotalPrice=line_revenue)
df.loc[:, ['Quantity', 'UnitPrice', 'TotalPrice']].head()

In [None]:
# Feature engineering: monthly period derived from the invoice timestamp
# to_datetime is applied in case the column was not parsed as datetimes on load
invoice_timestamps = pd.to_datetime(df['InvoiceDate'])
df = df.assign(
    InvoiceDate=invoice_timestamps,
    InvoiceMonth=invoice_timestamps.dt.to_period('M'),
)
df.loc[:, ['InvoiceDate', 'InvoiceMonth']].head()

In [None]:
# Convert customer ID to string.
# NOTE(review): the column presumably loads as float because it contained
# missing values — confirm with df.dtypes. A float column stringifies as
# "17850.0", so whole-number floats are cast to int64 first to get "17850".
customer_ids = df['CustomerID']
if (
    pd.api.types.is_float_dtype(customer_ids)
    and customer_ids.notna().all()          # int64 cannot hold NaN
    and (customer_ids % 1 == 0).all()       # never truncate a fractional ID
):
    customer_ids = customer_ids.astype('int64')
df['CustomerID'] = customer_ids.astype(str)

In [None]:
# Persist the cleaned frame as CSV, without writing the index column
cleaned_csv_path = '../data/online_retail_cleaned.csv'
print("Saving cleaned dataset...")
df.to_csv(cleaned_csv_path, index=False)


In [None]:
# Final sanity summary of the cleaned frame: size, customer count, time span
customer_count = df['CustomerID'].nunique()
first_invoice = df['InvoiceDate'].min()
last_invoice = df['InvoiceDate'].max()
print("✅ Cleaned dataset shape:", df.shape)
print("🧍 Unique customers:", customer_count)
print("📅 Date range:", first_invoice, "to", last_invoice)

In [None]:
# Revenue by month: sum TotalPrice per InvoiceMonth and draw it as a line
monthly_revenue = df.groupby('InvoiceMonth')['TotalPrice'].sum().reset_index()
# InvoiceMonth holds pandas Period values, which seaborn/matplotlib can fail to
# place on an axis (TypeError). Plot from a copy whose months are plain
# 'YYYY-MM' strings; those sort chronologically, and `monthly_revenue` itself
# keeps its Period column for any later use.
monthly_revenue_plot = monthly_revenue.assign(
    InvoiceMonth=monthly_revenue['InvoiceMonth'].astype(str)
)
plt.figure(figsize=(12, 6))
sns.lineplot(x='InvoiceMonth', y='TotalPrice', data=monthly_revenue_plot, marker='o')
plt.title('Monthly Revenue')
plt.xlabel('Invoice Month')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)  # month labels overlap when drawn horizontally
plt.tight_layout()
plt.show()

In [None]:
# Top products by revenue: total TotalPrice per Description, ten largest
top_products = (
    df.groupby('Description')['TotalPrice']
    .sum()
    .reset_index()
    .sort_values(by='TotalPrice', ascending=False)
    .head(10)
)
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13. Mapping hue to
# the category keeps one colour per bar; dodge=False keeps full-width bars on
# older seaborn versions, and any legend they add is removed below.
revenue_ax = sns.barplot(
    x='TotalPrice', y='Description', hue='Description',
    data=top_products, palette='viridis', dodge=False,
)
revenue_legend = revenue_ax.get_legend()
if revenue_legend is not None:
    revenue_legend.remove()  # redundant: the y-axis already names each bar
plt.title('Top 10 Products by Revenue')
plt.xlabel('Total Revenue')
plt.ylabel('Product Description')
plt.tight_layout()
plt.show()

In [None]:
# Top 10 products by quantity sold: total Quantity per Description
top_products_quantity = (
    df.groupby('Description')['Quantity']
    .sum()
    .reset_index()
    .sort_values(by='Quantity', ascending=False)
    .head(10)
)
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13. Mapping hue to
# the category keeps one colour per bar; dodge=False keeps full-width bars on
# older seaborn versions, and any legend they add is removed below.
quantity_ax = sns.barplot(
    x='Quantity', y='Description', hue='Description',
    data=top_products_quantity, palette='magma', dodge=False,
)
quantity_legend = quantity_ax.get_legend()
if quantity_legend is not None:
    quantity_legend.remove()  # redundant: the y-axis already names each bar
plt.title('Top 10 Products by Quantity Sold')
plt.xlabel('Total Quantity Sold')
plt.ylabel('Product Description')
plt.tight_layout()
plt.show()

In [None]:
# Revenue by country: total TotalPrice per Country, ten largest
country_revenue = (
    df.groupby('Country')['TotalPrice']
    .sum()
    .reset_index()
    .sort_values(by='TotalPrice', ascending=False)
    .head(10)
)
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13. Mapping hue to
# the category keeps one colour per bar; dodge=False keeps full-width bars on
# older seaborn versions, and any legend they add is removed below.
country_ax = sns.barplot(
    x='TotalPrice', y='Country', hue='Country',
    data=country_revenue, palette='coolwarm', dodge=False,
)
country_legend = country_ax.get_legend()
if country_legend is not None:
    country_legend.remove()  # redundant: the y-axis already names each bar
plt.title('Top 10 Countries by Revenue')
plt.xlabel('Total Revenue')
plt.ylabel('Country')
plt.tight_layout()
plt.show()