In [60]:
import os
import warnings
warnings.filterwarnings("ignore")

In [61]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.neighbors import NearestNeighbors

In [62]:
sns.set(style="whitegrid", context="notebook", font_scale=1.05)
plt.rcParams["figure.figsize"] = (9,6)

In [63]:
DATA_PATH = "OnlineRetail.csv"          # set to your CSV path
OUTPUT_DIR = "rfm_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [64]:
df = pd.read_csv(DATA_PATH, encoding="latin1", low_memory=False)

In [77]:
df.shape

(168631, 8)

In [65]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


In [66]:
df = df.dropna(subset=["CustomerID"])
df["CustomerID"] = df["CustomerID"].astype(int)

In [67]:
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
df = df.dropna(subset=["InvoiceDate"])

In [74]:
df['InvoiceNo'].info()

<class 'pandas.core.series.Series'>
Index: 172782 entries, 0 to 541908
Series name: InvoiceNo
Non-Null Count   Dtype 
--------------   ----- 
172782 non-null  object
dtypes: object(1)
memory usage: 2.6+ MB


In [75]:
df = df[~df["InvoiceNo"].astype(str).str.startswith("C")]
df = df[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)]

In [76]:
df.shape

(168631, 8)

In [78]:
df["Amount"] = df["Quantity"] * df["UnitPrice"]

In [79]:
print("Raw transactions after cleaning:", len(df))
print("Unique customers:", df["CustomerID"].nunique())

Raw transactions after cleaning: 168631
Unique customers: 2997


In [88]:
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
Amount         0
dtype: int64

In [83]:
print(df[["Quantity","UnitPrice","Amount"]].describe())

            Quantity      UnitPrice         Amount
count  168631.000000  168631.000000  168631.000000
mean       13.079546       3.147421      22.982061
std       201.856371      26.125811     427.780198
min         1.000000       0.040000       0.100000
25%         2.000000       1.250000       4.680000
50%         6.000000       1.950000      11.700000
75%        12.000000       3.750000      19.800000
max     80995.000000    8142.750000  168469.600000


In [90]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-01-12 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-01-12 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-01-12 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-01-12 08:26:00,3.39,17850,United Kingdom,20.34
