# Week 4, Day 6: Unsupervised Learning Hackathon

## Challenge Overview
Apply unsupervised learning techniques to analyze a synthetic dataset that mimics real-world e-commerce customer data. You'll use the concepts learned throughout Week 4:
- Clustering
- Dimensionality Reduction
- Anomaly Detection
- Visualization

## Problem: E-commerce Customer Analysis
Analyze customer behavior patterns and identify segments in an e-commerce dataset.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from umap import UMAP

## Part 1: Data Generation and Preparation

In [None]:
def generate_ecommerce_data(n_samples=1000, n_anomalies=50, random_state=42):
    """Generate synthetic e-commerce customer data.

    Ten behavioural features are drawn independently from fixed
    distributions, then a random subset of rows is turned into anomalies by
    multiplying ``avg_order_value`` by 5 and ``website_visits`` by 3.

    Args:
        n_samples: Number of customers (rows) to generate.
        n_anomalies: Number of rows to perturb. Capped at ``n_samples`` so
            small datasets no longer raise when sampling without replacement.
        random_state: Seed for the private random generator. The default
            (42) reproduces the data the notebook has always produced.

    Returns:
        pandas.DataFrame with ``n_samples`` rows and one column per feature.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without reseeding the global generator.
    rng = np.random.RandomState(random_state)

    # Generate features (draw order matters for reproducibility)
    data = {
        'purchase_frequency': rng.poisson(5, n_samples),
        'avg_order_value': rng.normal(100, 30, n_samples),
        'time_on_site': rng.gamma(5, 2, n_samples),
        'items_viewed': rng.poisson(20, n_samples),
        'cart_abandonment_rate': rng.beta(2, 5, n_samples),
        'days_since_last_purchase': rng.exponential(30, n_samples),
        'total_purchases': rng.poisson(15, n_samples),
        'discount_usage': rng.binomial(10, 0.3, n_samples),
        'customer_service_calls': rng.poisson(2, n_samples),
        'website_visits': rng.poisson(30, n_samples),
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Add some anomalies; never ask for more distinct rows than exist
    n_anomalies = max(0, min(n_anomalies, n_samples))
    if n_anomalies:
        anomaly_indices = rng.choice(n_samples, n_anomalies, replace=False)
        df.loc[anomaly_indices, 'avg_order_value'] *= 5
        df.loc[anomaly_indices, 'website_visits'] *= 3

    return df

# Build the dataset with the generator's defaults and preview it
df = generate_ecommerce_data()
print("Dataset Shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")

## Challenge Tasks

### Task 1: Exploratory Data Analysis

In [None]:
def perform_eda(df):
    """Perform exploratory data analysis (participant exercise).

    Suggested steps:
        1. Analyze feature distributions
        2. Check correlations
        3. Identify patterns
        4. Visualize relationships
    """
    # Your code here
    return None

# Example solution structure:
def example_eda(df):
    """Print summary statistics and plot correlations and feature histograms.

    Args:
        df: DataFrame of customer features. ``df.corr()`` is computed on it
            directly, so the columns are presumably all numeric (true for
            the dataset generated in this notebook).

    Returns:
        None. Results are printed and shown as matplotlib figures.
    """
    # Basic statistics
    print("Basic Statistics:")
    print(df.describe())

    # Correlation analysis
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

    # Feature distributions
    # Keep the 3x4 grid for up to 12 features and add rows beyond that;
    # a fixed 3x4 grid makes plt.subplot raise on the 13th column.
    n_cols = 4
    n_rows = max(3, (len(df.columns) + n_cols - 1) // n_cols)
    plt.figure(figsize=(15, 10 * n_rows / 3))
    for i, col in enumerate(df.columns, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(df[col], kde=True)
        plt.title(col)
    plt.tight_layout()
    plt.show()

# Run the example EDA on the dataset generated above
example_eda(df)

### Task 2: Customer Segmentation

In [None]:
def segment_customers(df):
    """Perform customer segmentation (participant exercise).

    Suggested steps:
        1. Preprocess data
        2. Apply clustering
        3. Analyze segments
        4. Visualize results
    """
    # Your code here
    return None

# Example solution structure:
def example_segmentation(df):
    """Cluster customers with K-means and show the result in PCA space.

    Standardizes every column of ``df``, fits a 4-cluster K-means model
    (seeded with 42), prints the cluster sizes and per-cluster feature means,
    and draws a 2-component PCA scatter coloured by cluster label.
    Returns nothing.
    """
    # Standardize so that no single feature dominates the distances
    standardized = StandardScaler().fit_transform(df)

    # Fit K-means and get one label per customer
    labels = KMeans(n_clusters=4, random_state=42).fit_predict(standardized)

    # Work on a labelled copy; the caller's frame is left untouched
    labelled = df.assign(Cluster=labels)

    # Segment sizes
    print("\nCluster Sizes:")
    print(labelled['Cluster'].value_counts())

    # Mean of every feature within each segment
    print("\nCluster Profiles:")
    print(labelled.groupby('Cluster').mean())

    # Project to two principal components for plotting
    projected = PCA(n_components=2).fit_transform(standardized)
    pc1, pc2 = projected[:, 0], projected[:, 1]

    plt.figure(figsize=(10, 6))
    points = plt.scatter(pc1, pc2, c=labels, cmap='viridis')
    plt.colorbar(points)
    plt.title('Customer Segments (PCA)')
    plt.show()

# Run the example segmentation on the dataset generated above
example_segmentation(df)

### Task 3: Anomaly Detection

In [None]:
def detect_anomalies(df):
    """Detect anomalous customers (participant exercise).

    Suggested steps:
        1. Preprocess data
        2. Apply detection methods
        3. Compare results
        4. Visualize anomalies
    """
    # Your code here
    return None

# Example solution structure:
def example_anomaly_detection(df, contamination=0.1, random_state=42):
    """Flag anomalous customers with an Isolation Forest and plot them.

    Args:
        df: DataFrame of customer features; every column is standardized
            and passed to the model.
        contamination: Expected fraction of anomalies, forwarded to
            ``IsolationForest``. Defaults to the notebook's original 0.1.
            The generator in this notebook perturbs 50 of 1000 rows by
            default, so 0.05 is worth trying as well.
        random_state: Seed forwarded to ``IsolationForest``.

    Returns:
        None. Counts and group means are printed and a PCA scatter is shown.
    """
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df)

    # Apply Isolation Forest
    iso_forest = IsolationForest(contamination=contamination,
                                 random_state=random_state)
    anomalies = iso_forest.fit_predict(X_scaled)

    # Add anomaly labels to a copy so the caller's frame is not modified
    df_anomalies = df.copy()
    df_anomalies['Anomaly'] = anomalies

    # Analyze anomalies (label -1 is plotted as 'Anomaly', 1 as 'Normal')
    print("\nNumber of anomalies detected:", (anomalies == -1).sum())
    print("\nAnomaly characteristics:")
    print(df_anomalies.groupby('Anomaly').mean())

    # Visualize anomalies in the plane of the first two principal components
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[anomalies == 1, 0], X_pca[anomalies == 1, 1],
                label='Normal')
    plt.scatter(X_pca[anomalies == -1, 0], X_pca[anomalies == -1, 1],
                color='red', label='Anomaly')
    plt.title('Anomaly Detection Results')
    plt.legend()
    plt.show()

# Run the example anomaly detection on the dataset generated above
example_anomaly_detection(df)

### Task 4: Dimensionality Reduction and Visualization

In [None]:
def visualize_patterns(df):
    """Visualize customer patterns (participant exercise).

    Suggested steps:
        1. Apply dimensionality reduction
        2. Create visualizations
        3. Analyze patterns
        4. Compare methods
    """
    # Your code here
    return None

# Example solution structure:
def example_visualization(df):
    """Show PCA and UMAP 2-D projections of the customers side by side.

    Standardizes every column of ``df``, embeds the result with a
    2-component PCA and with UMAP (seeded with 42), draws both embeddings as
    scatter plots in one figure, then prints the PCA explained variance
    ratio. Returns nothing.
    """
    # Standardize the features before projecting
    standardized = StandardScaler().fit_transform(df)

    # Build both reducers, then fit PCA first and UMAP second
    pca_model = PCA(n_components=2)
    umap_model = UMAP(random_state=42)
    pca_coords = pca_model.fit_transform(standardized)
    umap_coords = umap_model.fit_transform(standardized)

    # One panel per method: (subplot position, coordinates, title)
    panels = (
        (121, pca_coords, 'PCA Projection'),
        (122, umap_coords, 'UMAP Projection'),
    )

    plt.figure(figsize=(15, 5))
    for position, coords, title in panels:
        plt.subplot(position)
        plt.scatter(coords[:, 0], coords[:, 1], alpha=0.5)
        plt.title(title)

    plt.tight_layout()
    plt.show()

    # Share of variance captured by each principal component
    print("\nPCA Explained Variance Ratio:")
    print(pca_model.explained_variance_ratio_)

# Run the example projection comparison on the dataset generated above
example_visualization(df)

## Evaluation Criteria

Your solution will be evaluated based on:

1. Data Analysis (25%)
   - Quality of EDA
   - Pattern identification
   - Insight generation

2. Customer Segmentation (25%)
   - Segment identification
   - Segment analysis
   - Business insights

3. Anomaly Detection (25%)
   - Method selection
   - Implementation quality
   - Result interpretation

4. Visualization (25%)
   - Clarity of visualizations
   - Pattern representation
   - Technical implementation

## Submission Guidelines
1. Complete all tasks in this notebook
2. Document your approach and decisions
3. Include visualizations and insights
4. Provide business recommendations