# Deliverable 3: Classification, Clustering, and Pattern Mining

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the raw transactions and apply the same cleaning as previous deliverables
# in a single chain, so re-running the cell always rebuilds `df` from the file.
df = (
    pd.read_excel('Online Retail.xlsx')
    # Rows without a customer cannot be attributed to anyone.
    .dropna(subset=['CustomerID'])
    # Keep the first row for each (invoice, product) pair.
    .drop_duplicates(subset=['InvoiceNo', 'StockCode'])
    # Keep only rows with a positive quantity and a positive unit price.
    .loc[lambda rows: (rows['Quantity'] > 0) & (rows['UnitPrice'] > 0)]
    # Line-level revenue.
    .assign(TotalPrice=lambda rows: rows['Quantity'] * rows['UnitPrice'])
)


## Feature Engineering for Classification
Aggregate the transactions per customer to build the features and the high/low-spender target used by the classification models.

In [None]:
# Aggregate transactions to one row per customer.
# Named aggregation produces the final column names directly, so no in-place
# rename is needed and the cell is safe to re-run.
customer_df = (
    df.groupby('CustomerID')
    .agg(
        TotalQuantity=('Quantity', 'sum'),
        AvgUnitPrice=('UnitPrice', 'mean'),
        TotalSpent=('TotalPrice', 'sum'),
        NumPurchases=('InvoiceNo', 'nunique'),
    )
    .reset_index()
)

# Create a binary classification target: High spender vs Low spender.
# A customer is a high spender when their total spend is strictly above the median.
threshold = customer_df['TotalSpent'].median()
customer_df['HighSpender'] = (customer_df['TotalSpent'] > threshold).astype(int)

# Features and target.
# TotalSpent is left out of X because the label is computed from it.
# NOTE(review): TotalSpent is the sum of Quantity * UnitPrice, so TotalQuantity and
# AvgUnitPrice remain closely related to the label -- confirm this is intended.
X = customer_df[['TotalQuantity', 'AvgUnitPrice', 'NumPurchases']]
y = customer_df['HighSpender']

# Split dataset; stratify so train and test keep the same high/low spender ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## Classification Models

In [None]:
# Decision Tree Classifier: fit on the training split, predict the held-out split
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Evaluation: per-class precision / recall / F1 on the test set
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:")
print(dt_report)


In [None]:
# k-Nearest Neighbors Classifier
# k-NN classifies by distance between feature vectors, so the features are
# standardized first; otherwise the column with the largest numeric range
# would dominate the distance. The scaler is fitted on the training split only.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Evaluation
print("k-NN Classification Report:")
print(classification_report(y_test, y_pred_knn))


## Hyperparameter Tuning for k-NN

In [None]:
# Hyperparameter tuning for k-NN
# The scaler lives inside the pipeline so that it is re-fitted on the training
# folds of every cross-validation split and never sees the validation fold.
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

# make_pipeline names each step after its lowercased class name, hence the prefix.
param_grid = {'kneighborsclassifier__n_neighbors': range(1, 20)}
grid_knn = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy')
grid_knn.fit(X_train, y_train)
print(f"Best k for k-NN: {grid_knn.best_params_['kneighborsclassifier__n_neighbors']}")
# Report the cross-validated score so the effect of tuning can be compared.
print(f"Best cross-validated accuracy: {grid_knn.best_score_:.3f}")


In [None]:
# ROC Curve for best k-NN
# Column 1 of predict_proba is the probability of the positive class (HighSpender == 1).
class_probabilities = grid_knn.predict_proba(X_test)
y_proba_knn = class_probabilities[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba_knn)
roc_auc = auc(fpr, tpr)

# Draw through the explicit Axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: a classifier that guesses at random.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - k-NN')
ax.legend(loc="lower right")
plt.show()


## Clustering Model

In [None]:
# K-Means Clustering
# K-Means assigns points by Euclidean distance, so the features are standardized
# first; otherwise the column with the largest numeric range would dominate.
X_scaled = StandardScaler().fit_transform(X)

# n_init is set explicitly because its default differs between scikit-learn
# versions; fixing it keeps the result reproducible across environments.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to data
customer_df['Cluster'] = clusters

# Visualize Clusters (plotted in original units; clustering used standardized features)
plt.figure(figsize=(8,6))
sns.scatterplot(x='TotalQuantity', y='TotalSpent', hue='Cluster', data=customer_df, palette='Set1')
plt.title('Customer Segments by K-Means Clustering')
plt.show()


## Association Rule Mining

In [None]:
# Prepare data for Apriori
# Build an invoice x product matrix of booleans: True when the product appears
# on the invoice. Missing (invoice, product) pairs are filled with 0 and thus
# become False. A vectorized comparison replaces the element-wise applymap
# (deprecated in recent pandas) and yields the boolean frame mlxtend expects.
basket = (
    df.groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack(fill_value=0)
    .gt(0)
)

# Apply Apriori: itemsets present in at least 5% of invoices, then rules with lift >= 1
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)
# Reassign instead of sorting in place so the cell is safe to re-run.
rules = rules.sort_values('confidence', ascending=False)
rules.head()


## Insights and Observations
- **Classification**: Decision Tree and k-NN models effectively predicted high spenders. Tuning k-NN improved accuracy.
- **Clustering**: K-Means identified customer segments with distinct purchasing behaviors.
- **Pattern Mining**: Apriori discovered common itemsets in customer transactions, useful for marketing strategies.
