In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import hdbscan
import os

In [None]:
# Create the 'results' output directory (relative to the working directory)
# that the later cells write their figures, CSVs and reports into.
# exist_ok=True makes re-running this cell harmless when it already exists.
os.makedirs('results', exist_ok=True)

In [None]:
# Load the raw data from a path relative to the working directory.
# The file is decoded as Latin-1 instead of pandas' default UTF-8
# (presumably because it contains non-UTF-8 bytes — TODO confirm).
df = pd.read_csv('dataset/data.csv', encoding='latin-1')

In [None]:
# Display basic information about the dataset.
# DataFrame.info() writes its summary directly to stdout and returns None, so
# it is called bare: wrapping it in print() emits an extra "None" line.
df.info()
# describe() returns a frame of summary statistics, which does need printing
# because it is not the last expression of the cell.
print(df.describe())

In [None]:
# Count the missing entries in every column (isna is pandas' alias of isnull).
print(df.isna().sum())

In [None]:
# Convert InvoiceDate to datetime, overwriting the column in place.
# The feature-engineering cell relies on this: it takes .max() of the column
# and subtracts dates to obtain a number of days.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [None]:
# Per-row amount: the row's quantity multiplied element-wise by its unit price.
df['TotalAmount'] = df['Quantity'].mul(df['UnitPrice'])

In [None]:
# Feature Engineering: build one Recency / Frequency / MonetaryValue (RFM) row
# per customer.
# Recency is measured against the most recent invoice date in the data set.
last_invoice_date = df['InvoiceDate'].max()
# Named aggregation binds every output column to its source column and
# function explicitly, so the column names no longer depend on a positional
# rename after the fact. groupby() leaves out rows whose CustomerID is NaN
# (pandas default dropna=True).
rfm = df.groupby('CustomerID').agg(
    # Whole days between the customer's latest invoice and the reference date.
    Recency=('InvoiceDate', lambda x: (last_invoice_date - x.max()).days),
    # Number of DISTINCT invoices. A plain 'count' tallies rows, which
    # over-counts purchases whenever one invoice spans several rows; when every
    # invoice is a single row the two give the same number.
    Frequency=('InvoiceNo', 'nunique'),
    # Sum of TotalAmount over all of the customer's rows.
    MonetaryValue=('TotalAmount', 'sum'),
)

In [None]:
# Drop any rows with NaN values: with dropna()'s defaults a row is removed as
# soon as a single one of its columns is missing. rfm is rebound to the result.
rfm = rfm.dropna()

In [None]:
# Exploratory Data Analysis: one histogram per RFM feature, laid out in a row.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for ax, column in zip(axes, ['Recency', 'Frequency', 'MonetaryValue']):
    sns.histplot(rfm[column], ax=ax)
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()
# Persist the figure, then close it so it does not stay open after saving.
fig.savefig('results/rfm_distributions.png')
plt.close(fig)

In [None]:
# Correlation analysis between the three RFM features.
# The columns are selected explicitly: the clustering cells further down add
# label columns to rfm, and re-running this cell afterwards with a bare
# rfm.corr() would mix those labels into the "RFM" correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(
    rfm[['Recency', 'Frequency', 'MonetaryValue']].corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation between RFM variables')
plt.savefig('results/rfm_correlation.png')
plt.close()

In [None]:
# Customer Segmentation: standardise the RFM features before clustering.
# Only the three RFM columns are passed to the scaler. The clustering cells
# below append label columns to rfm, so scaling the whole frame would silently
# feed those labels into rfm_normalized (and every model fitted on it) if this
# cell were ever re-run after them. On a fresh top-to-bottom run the result is
# the same as scaling rfm itself.
scaler = StandardScaler()
rfm_normalized = scaler.fit_transform(rfm[['Recency', 'Frequency', 'MonetaryValue']])

In [None]:
# K-means clustering of the scaled RFM features into 4 segments.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for the default k-means++ initialisation), so leaving
# it implicit makes the segmentation depend on the installed version.
# random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm['KMeansCluster'] = kmeans.fit_predict(rfm_normalized)

In [None]:
# Hierarchical (agglomerative) clustering of the scaled features into 4 groups.
hierarchical = AgglomerativeClustering(n_clusters=4)
hierarchical.fit(rfm_normalized)
# The fitted estimator exposes one label per input row.
rfm['HierarchicalCluster'] = hierarchical.labels_

In [None]:
# HDBSCAN clustering. Despite the 'DBSCAN' wording used for the column and
# the result variables, the estimator here is hdbscan.HDBSCAN, not DBSCAN.
# The labels are stored under 'DBSCANCluster' because the analysis cell below
# reads the column by that name.
# NOTE(review): HDBSCAN conventionally labels noise points -1, so a -1 group in
# the downstream per-cluster means would be noise, not a segment — confirm.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
rfm['DBSCANCluster'] = hdbscan_clusterer.fit_predict(rfm_normalized)

In [None]:
# Analyze Cluster Results
def analyze_clusters(data, cluster_col):
    """Return the mean Recency, Frequency and MonetaryValue of each cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the 'Recency', 'Frequency' and 'MonetaryValue'
        columns together with the cluster-label column.
    cluster_col : str
        Name of the column in ``data`` whose values identify the clusters.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``cluster_col`` (used as the index) and
        one column per RFM feature holding that cluster's mean.
    """
    rfm_features = ['Recency', 'Frequency', 'MonetaryValue']
    per_cluster = data.groupby(cluster_col)
    return per_cluster[rfm_features].mean()
# Summarise the segments of each clustering by their mean RFM values; the
# generator is consumed in order, one label column per result variable.
kmeans_analysis, hierarchical_analysis, dbscan_analysis = (
    analyze_clusters(rfm, label_col)
    for label_col in ('KMeansCluster', 'HierarchicalCluster', 'DBSCANCluster')
)

In [None]:
# Save cluster analyses: one CSV per clustering method, cluster label as index.
for _analysis, _csv_path in (
    (kmeans_analysis, 'results/kmeans_analysis.csv'),
    (hierarchical_analysis, 'results/hierarchical_analysis.csv'),
    (dbscan_analysis, 'results/dbscan_analysis.csv'),
):
    _analysis.to_csv(_csv_path)

In [None]:
# Visualize clusters (K-means): project the scaled features onto two principal
# components so the segments can be drawn in 2-D.
# random_state is fixed, like the other seeded estimators in this notebook:
# with the default svd_solver='auto' scikit-learn may pick the randomized
# solver depending on the data shape and library version, and an unseeded run
# of that solver is not guaranteed to reproduce the same projection.
pca = PCA(n_components=2, random_state=42)
rfm_pca = pca.fit_transform(rfm_normalized)

In [None]:
# Scatter the two principal components, coloured by K-means cluster label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    x=rfm_pca[:, 0],
    y=rfm_pca[:, 1],
    c=rfm['KMeansCluster'],
    cmap='viridis',
)
plt.title('Customer Segments (K-means)')
# The colour bar maps the colours back to the cluster labels.
plt.colorbar(mappable=scatter)
plt.savefig('results/kmeans_clusters.png')
plt.close()

In [None]:
# Predictive Modeling: the features are the three RFM columns and the target is
# the K-means cluster label assigned earlier.
X = rfm.loc[:, ['Recency', 'Frequency', 'MonetaryValue']]
y = rfm.loc[:, 'KMeansCluster']

In [None]:
# Hold out 20% of the rows as a test set. random_state=42 seeds the shuffle so
# the same split is produced on every run. The split is not stratified by y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a 100-tree random forest on the training split; random_state=42 seeds
# the forest's randomness so repeated runs build the same model.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# fit() is the last expression of the cell, so its return value is what the
# notebook displays as the cell output.
rf_classifier.fit(X_train, y_train)

In [None]:
# Predict a cluster label for every row of the held-out test features.
y_pred = rf_classifier.predict(X_test)

In [None]:
# Classification Report: text summary of true vs. predicted labels on the test
# split, written to a file in the results directory.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
with open('results/classification_report.txt', mode='w') as report_file:
    report_file.write(class_report)

In [None]:
# Confusion Matrix: counts of true vs. predicted labels on the test split,
# rendered as an annotated heatmap (fmt='d' prints the counts as integers).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(data=conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('results/confusion_matrix.png')
plt.close()

In [None]:
# Feature Importance: pair each feature name with the fitted forest's
# importance score, then order the rows from most to least important.
feature_importance = pd.DataFrame(
    data={
        'feature': X.columns,
        'importance': rf_classifier.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the importance scores, one bar per feature.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance in Customer Segmentation')
plt.savefig('results/feature_importance.png')
plt.close()

In [None]:
# Save feature importance as CSV. index=False leaves the row index out of the
# file, so only the 'feature' and 'importance' columns are written.
feature_importance.to_csv('results/feature_importance.csv', index=False)