# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')


# Load dataset
# NOTE: hard-coded Kaggle-style input path; adjust when running elsewhere.
file_path = "/kaggle/input/online-retail-dataset/online_retail.csv"
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Display basic information
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly instead of being wrapped in print().
df.info()
print("Missing values:\n", df.isnull().sum())
print("Unique product categories:", df["Description"].nunique())
print("Number of unique customers:", df["CustomerID"].nunique())
# InvoiceDate has not been parsed yet at this point; convert it first so that
# min/max are chronological rather than string comparisons.
invoice_dates = pd.to_datetime(df["InvoiceDate"])
print("Date range:", invoice_dates.min(), "-", invoice_dates.max())

# Data Cleaning
print("Cleaning data...")
# Drop rows that lack a product description or a customer id. dropna()
# already returns a new frame, so the raw frame does not need to be copied
# beforehand; the explicit .copy() afterwards makes df_cleaned independent of
# df so the column assignment below is unambiguous.
df_cleaned = df.dropna(subset=["Description", "CustomerID"]).copy()
df_cleaned["InvoiceDate"] = pd.to_datetime(df_cleaned["InvoiceDate"])

# Keep only rows with a positive quantity and a positive unit price.
# .copy() detaches the filtered result so that columns added to df_cleaned
# later on do not trigger pandas' SettingWithCopyWarning.
positive_rows = (df_cleaned["Quantity"] > 0) & (df_cleaned["UnitPrice"] > 0)
df_cleaned = df_cleaned[positive_rows].copy()

# Visualization: Monthly Sales
# Total quantity per month (month-end resampling on the invoice timestamp),
# drawn as a line chart.
monthly_quantity = (
    df_cleaned.set_index("InvoiceDate")["Quantity"]
    .resample("M")
    .sum()
)
plt.figure(figsize=(12, 6))
monthly_quantity.plot()
plt.title("Monthly Sales Volume")
plt.xlabel("Date")
plt.ylabel("Quantity Sold")
plt.show()

# Visualization: Product Price Distribution
# Histogram with a KDE overlay of all unit prices; the x-axis is limited to
# the 0-100 range.
unit_prices = df_cleaned["UnitPrice"]
plt.figure(figsize=(10, 5))
sns.histplot(unit_prices, bins=100, kde=True)
plt.xlim(0, 100)
plt.title("Product Price Distribution")
plt.xlabel("Price")
plt.ylabel("Count")
plt.show()

# Top 10 Products by Sales
# Sum the quantity per product description, then keep the ten largest totals.
quantity_by_product = df_cleaned.groupby("Description")["Quantity"].sum()
top_products = quantity_by_product.sort_values(ascending=False).head(10)

# Horizontal bar chart: product names on the y-axis, quantities on the x-axis.
plt.figure(figsize=(10, 5))
sns.barplot(
    y=top_products.index,
    x=top_products.values,
    palette="Blues_r",
)
plt.xlabel("Quantity Sold")
plt.ylabel("Product")
plt.title("Top 10 Best-Selling Products")
plt.show()

# Customer Spending Analysis
# Line-level revenue = quantity * unit price; it is stored on df_cleaned
# because later sections read the "TotalPrice" column.
line_revenue = df_cleaned["Quantity"] * df_cleaned["UnitPrice"]
df_cleaned["TotalPrice"] = line_revenue
spending_groups = df_cleaned.groupby("CustomerID")
customer_spending = spending_groups["TotalPrice"].sum()

# Distribution of total spending per customer, x-axis limited to 0-2000.
plt.figure(figsize=(10, 5))
sns.histplot(customer_spending, bins=100, kde=True)
plt.xlim(0, 2000)
plt.title("Customer Spending Distribution")
plt.xlabel("Total Spending")
plt.ylabel("Number of Customers")
plt.show()

# Outlier Handling
# Use the 99th percentile of unit prices as the cut-off and plot only the
# prices strictly below it. df_cleaned itself is not filtered here.
price_limit = df_cleaned["UnitPrice"].quantile(0.99)
below_limit = df_cleaned["UnitPrice"] < price_limit
trimmed_prices = df_cleaned.loc[below_limit, "UnitPrice"]

plt.figure(figsize=(10, 5))
sns.histplot(trimmed_prices, bins=50, kde=True)
plt.title("Price Distribution (Without Outliers)")
plt.xlabel("Price")
plt.ylabel("Count")
plt.show()

# RFM Analysis
# The reference date is the latest invoice timestamp in the cleaned data, so
# the most recently active customer gets a Recency of 0 days.
current_date = df_cleaned["InvoiceDate"].max()


def _days_since_last_invoice(invoice_dates):
    """Return whole days between the reference date and the latest invoice."""
    return (current_date - invoice_dates.max()).days


# One row per customer:
#   Recency   - days since the customer's latest invoice
#   Frequency - number of invoice rows (line items, not distinct invoices)
#   Monetary  - sum of TotalPrice
rfm = (
    df_cleaned.groupby("CustomerID")
    .agg(
        Recency=("InvoiceDate", _days_since_last_invoice),
        Frequency=("InvoiceNo", "count"),
        Monetary=("TotalPrice", "sum"),
    )
    .reset_index()
)
print(rfm.describe())

# Scaling for Clustering
# Standardise the three RFM columns; the fitted scaler is kept in `scaler`.
rfm_feature_columns = ["Recency", "Frequency", "Monetary"]
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[rfm_feature_columns])

# Elbow Method to Find Optimal k
# Fit KMeans for k = 1..10 on the scaled RFM matrix and record each model's
# inertia (within-cluster sum of squares) for the elbow plot.
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(rfm_scaled)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 5))
plt.plot(k_values, wcss, marker="o", linestyle="--")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS (Inertia)")
plt.title("Elbow Method for Optimal k")
plt.show()

# Clustering
# Fit the final model with a fixed k and store each customer's cluster id in
# the "Cluster" column of rfm.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(rfm_scaled)
rfm["Cluster"] = cluster_labels

# Number of customers per cluster.
cluster_sizes = rfm["Cluster"].value_counts()
print(cluster_sizes)

# Visualizing Clusters
# Two scatter plots against Monetary, coloured by cluster. Each spec is
# (x column, seaborn palette, figure title, x-axis label).
scatter_specs = [
    (
        "Recency",
        "Set2",
        "Customer Segmentation (Recency vs Monetary)",
        "Recency (days since last purchase)",
    ),
    (
        "Frequency",
        "Set1",
        "Customer Segmentation (Frequency vs Monetary)",
        "Frequency (Number of Purchases)",
    ),
]

for x_column, palette_name, chart_title, x_label in scatter_specs:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x=rfm[x_column],
        y=rfm["Monetary"],
        hue=rfm["Cluster"],
        palette=palette_name,
        alpha=0.7,
    )
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel("Monetary (Total Spending)")
    plt.legend(title="Cluster")
    plt.show()

# Cluster Summary
# Mean Recency / Frequency / Monetary per cluster, shown as an annotated
# heatmap.
summary_columns = ["Recency", "Frequency", "Monetary"]
rfm_by_cluster = rfm.groupby("Cluster")
rfm_cluster_summary = rfm_by_cluster[summary_columns].mean()

plt.figure(figsize=(12, 5))
sns.heatmap(rfm_cluster_summary, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Average RFM Values by Cluster")
plt.show()


# Check available columns in rfm
print("List of columns in rfm:", rfm.columns.tolist())

# Normalise a suffixed 'TotalOrders_y' / 'TotalOrders_x' column (presumably
# left over from a DataFrame merge) to plain 'TotalOrders'. '_y' takes
# precedence; only one of the two is renamed.
for suffixed_name in ("TotalOrders_y", "TotalOrders_x"):
    if suffixed_name in rfm.columns:
        rfm.rename(columns={suffixed_name: "TotalOrders"}, inplace=True)
        break

# Create 'TotalOrders' column if missing
# A constant placeholder would make every order-count threshold further down
# unsatisfiable, so the column is derived from the data instead: the number
# of distinct invoices per customer in df_cleaned.
if "TotalOrders" not in rfm.columns:
    print("⚠️ Warning! 'TotalOrders' column is missing. Deriving it from distinct invoices per customer.")
    orders_per_customer = df_cleaned.groupby("CustomerID")["InvoiceNo"].nunique()
    rfm["TotalOrders"] = (
        rfm["CustomerID"]
        .map(orders_per_customer)
        .fillna(0)  # customers absent from df_cleaned get 0 orders
        .astype(int)
    )

# Special Customer Analysis
# Restrict the cleaned transactions to a fixed list of customer ids.
special_customers = [12748, 14911]
is_special_customer = df_cleaned["CustomerID"].isin(special_customers)
special_customers_df = df_cleaned[is_special_customer]

# Calculate key metrics
# Output column name -> (source column, aggregation).
special_metric_specs = {
    "Total_Spending": ("TotalPrice", "sum"),
    "Total_Orders": ("InvoiceNo", "nunique"),
    "Unique_Products": ("Description", "nunique"),
    "Total_Quantity": ("Quantity", "sum"),
}
special_customers_analysis = (
    special_customers_df.groupby("CustomerID").agg(**special_metric_specs)
)

# Save analysis to a CSV file
special_customers_analysis.to_csv("special_customers_analysis.csv")

# Identify Enterprise Customers
# Customers with more than 200 orders and more than 30000 total spending.
is_enterprise_row = (rfm["TotalOrders"] > 200) & (rfm["Monetary"] > 30000)
enterprise_customers = rfm.loc[is_enterprise_row, "CustomerID"]
print("Enterprise Customers:", enterprise_customers.tolist())

# Assign Enterprise label to identified customers
# The column is cast to object so integer cluster ids and string labels can
# coexist. Vectorised isin/.loc replaces a row-wise apply, which rebuilt the
# id list for every row and turned the integer cluster ids into floats.
rfm["Cluster"] = rfm["Cluster"].astype(object)
rfm.loc[rfm["CustomerID"].isin(enterprise_customers), "Cluster"] = "Enterprise"

# Identify Wholesale Customers
# Hand-picked ids; applied after the Enterprise label, so a customer that is
# in both groups ends up as "Wholesale".
wholesale_customers = [14646, 18102, 17450, 16446, 14911, 12415, 14156, 17511, 16029, 15098]
rfm.loc[rfm["CustomerID"].isin(wholesale_customers), "Cluster"] = "Wholesale"

# Merge small clusters into one called "Special"
# NOTE(review): folding cluster 3 into 2 hard-codes ids from one particular
# KMeans run - confirm they still refer to the intended segments.
final_df = rfm.drop(columns=["CustomerID"], errors="ignore")
final_df["Cluster"] = final_df["Cluster"].replace({"Enterprise": "Special", "Wholesale": "Special", 3: 2})

# Convert 'Special' to numerical representation
final_df["Cluster"] = final_df["Cluster"].replace("Special", 2).infer_objects(copy=False).astype(int)
print("Cluster distribution after merging minor classes:")
print(final_df["Cluster"].value_counts())

# Encode Clusters
# Re-map the cluster ids with LabelEncoder; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
encoded_clusters = le.fit_transform(final_df["Cluster"])
final_df["Cluster"] = encoded_clusters
print("Unique cluster values after encoding:", final_df["Cluster"].unique())

# Split data into train and test sets
# Target is the encoded cluster id; features are every other column of
# final_df. 80/20 split, stratified on the target, with a fixed seed.
y = final_df["Cluster"]
X = final_df.drop(columns=["Cluster"], errors="ignore")
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Display dataset shapes
print("Shapes of X_train, X_test, y_train, y_test:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Class distribution
label_reports = (
    ("Class distribution in y_train:", y_train),
    ("Class distribution in y_test:", y_test),
)
for heading, labels in label_reports:
    print(heading)
    print(pd.Series(labels).value_counts())

