# Customer Journey Analysis
This notebook builds per-customer features from e-commerce purchase records, reduces them to two principal components with PCA, and segments the customers with K-Means clustering.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the customer data CSV into a DataFrame and preview its first rows.
DATA_PATH = 'ecommerce_customer_data_large.csv'
df = pd.read_csv(DATA_PATH)
df.head()

## Data Cleaning

In [None]:
# Remove exact duplicate rows in place, then report how many missing
# values each column contains.
df.drop_duplicates(inplace=True)
missing_per_column = df.isna().sum()
print(missing_per_column)

## Feature Engineering

In [None]:
# Parse the purchase timestamps so date arithmetic works in the aggregation.
df['Purchase Date'] = pd.to_datetime(df['Purchase Date'])


def _purchase_span_days(dates):
    """Return the whole days between the earliest and latest date in *dates*."""
    return (dates.max() - dates.min()).days


# Build one row per customer. Named aggregation labels every output column
# at the point where it is computed.
purchases_by_customer = df.groupby('Customer ID')
features = purchases_by_customer.agg(
    Total_Spend=('Total Purchase Amount', 'sum'),
    Total_Quantity=('Quantity', 'sum'),
    Total_Returns=('Returns', 'sum'),
    Avg_Product_Price=('Product Price', 'mean'),
    Purchase_Frequency=('Purchase Date', 'count'),
    Days_Between_First_Last=('Purchase Date', _purchase_span_days),
).reset_index()
features.head()

## Standardization (Feature Scaling)

In [None]:
# Standardize the engineered features with StandardScaler before PCA.
# The inputs are selected by name instead of dropping 'Customer ID': later
# cells add 'PCA1', 'PCA2' and 'Cluster' to `features`, and re-running this
# cell would otherwise feed those derived columns back into the scaler.
FEATURE_COLUMNS = [
    'Total_Spend',
    'Total_Quantity',
    'Total_Returns',
    'Avg_Product_Price',
    'Purchase_Frequency',
    'Days_Between_First_Last',
]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features[FEATURE_COLUMNS])

## PCA - Dimensionality Reduction

In [None]:
# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Store each component as its own column; the rows of X_pca.T are the
# individual components.
features['PCA1'], features['PCA2'] = X_pca.T
features.head()

## K-Means Clustering

In [None]:
# Segment the customers with K-Means on the two PCA components.
# n_init is set explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, and the accompanying FutureWarning is hidden by the
# global warnings filter at the top of this notebook, so leaving it unset
# makes the clustering depend on the installed scikit-learn version.
N_CLUSTERS = 3
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
features['Cluster'] = kmeans.fit_predict(X_pca)

## Cluster Visualization

In [None]:
# Scatter the customers in PCA space, coloured by their assigned cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=features,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='Set2',
    ax=ax,
)
ax.set_title('Customer Segments')
plt.show()

## Interpretation
- Analyze the segments based on cluster statistics.
- Label segments like 'High-value', 'Churn-risk', etc.