# In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# -----------------------------
# Load dataset (update path if needed)
# -----------------------------
# The file is decoded as latin1 rather than pandas' default utf-8.
df = pd.read_csv(
    "online_retail.csv",
    encoding="latin1",
)

# -----------------------------
# Preprocessing
# -----------------------------
# Keep only rows that can be attributed to a customer and describe a real
# sale. Each mask is computed on the raw frame; combining them with & keeps
# exactly the rows that survive all three filters.
has_customer = df["CustomerID"].notna()                          # CustomerID present
is_cancelled = df["InvoiceNo"].astype(str).str.startswith("C")   # "C"-prefixed invoices
is_valid_sale = (df["Quantity"] > 0) & (df["UnitPrice"] > 0)     # strictly positive values

# .copy() detaches the filtered rows from the raw frame so that the column
# assignment below writes to an independent DataFrame instead of a slice
# (which is what makes pandas raise SettingWithCopyWarning).
df = df[has_customer & ~is_cancelled & is_valid_sale].copy()

# Convert InvoiceDate to datetime
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

# -----------------------------
# RFM Table
# -----------------------------
# Reference date is one day after the latest invoice, so the customer with
# the most recent purchase gets a Recency of 1 rather than 0.
ref_date = df["InvoiceDate"].max() + pd.Timedelta(days=1)

# Revenue of each invoice line, computed once for the whole frame instead of
# looking UnitPrice up again inside every customer group.
line_total = df["Quantity"] * df["UnitPrice"]

by_customer = df.groupby("CustomerID")

# All three reductions are indexed by CustomerID, so they align into one
# frame; reset_index() turns that index back into a regular column, giving
# the column order CustomerID, Recency, Frequency, Monetary.
rfm = pd.DataFrame({
    # Recency: whole days between the reference date and the last purchase
    "Recency": (ref_date - by_customer["InvoiceDate"].max()).dt.days,
    # Frequency: number of distinct invoices
    "Frequency": by_customer["InvoiceNo"].nunique(),
    # Monetary: total revenue (Quantity * UnitPrice summed over all lines)
    "Monetary": line_total.groupby(df["CustomerID"]).sum(),
}).reset_index()

# -----------------------------
# Scaling & Clustering
# -----------------------------
# Standardize the three RFM features before clustering so that k-means
# distances are not dominated by the feature with the largest raw scale.
feature_cols = ["Recency", "Frequency", "Monetary"]
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[feature_cols])

# n_init is set explicitly: scikit-learn's default changed from 10 to "auto"
# in release 1.4, so leaving it implicit makes the cluster labels depend on
# the installed version (and raises a FutureWarning on 1.2/1.3).
# random_state fixes the centroid initialisation for reproducible labels.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)

# -----------------------------
# Save RFM Table
# -----------------------------
# index=False leaves the positional row index out of the written CSV.
rfm.to_csv("rfm_table.csv", index=False)
print("✅ rfm_table.csv created successfully and saved in your folder!")

# Trailing expression: a notebook renders it as a preview of the first rows.
rfm.head()
