In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Load the three source tables from the working directory.
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Parse the date columns so that the .dt accessor works in the analyses below.
customers_df = customers_df.assign(
    SignupDate=pd.to_datetime(customers_df['SignupDate'])
)
transactions_df = transactions_df.assign(
    TransactionDate=pd.to_datetime(transactions_df['TransactionDate'])
)

# Customer Analysis
def analyze_customers():
    """Summarise the customer base by region and by signup month.

    Reads the module-level ``customers_df``. Side effect: adds (or overwrites)
    a ``SignupMonth`` period column on that shared frame.

    Returns:
        tuple: ``(region_dist, monthly_signups)`` where ``region_dist`` is the
        number of customers per ``Region`` and ``monthly_signups`` is the
        number of customers per ``SignupMonth``.
    """
    # Bucket every signup date into its calendar month.
    customers_df['SignupMonth'] = customers_df['SignupDate'].dt.to_period('M')
    signups_per_month = customers_df.groupby('SignupMonth').size()

    # How many customers live in each region.
    customers_per_region = customers_df['Region'].value_counts()

    return customers_per_region, signups_per_month

# Transaction Analysis
def analyze_transactions():
    """Compute spend and frequency summaries from the joined transaction data.

    Reads the module-level ``transactions_df``, ``customers_df`` and
    ``products_df``. Both merges use pandas' default join, so a transaction
    without a matching customer or product row does not contribute.

    Returns:
        tuple: ``(customer_ltv, category_sales, avg_transaction_by_region,
        purchase_frequency)``:
          * total ``TotalValue`` per ``CustomerID``, largest first;
          * total ``TotalValue`` per ``Category``, largest first;
          * mean ``TotalValue`` per ``Region``;
          * mean number of joined rows per ``CustomerID`` (a scalar), taken
            over the customers that appear in the joined data.
    """
    # Attach customer and product attributes to every transaction.
    enriched = (
        transactions_df
        .merge(customers_df, on='CustomerID')
        .merge(products_df, on='ProductID')
    )

    def value_grouped_by(key):
        # TotalValue grouped by one column of the joined frame.
        return enriched.groupby(key)['TotalValue']

    ltv_per_customer = value_grouped_by('CustomerID').sum().sort_values(ascending=False)
    sales_per_category = value_grouped_by('Category').sum().sort_values(ascending=False)
    mean_value_per_region = value_grouped_by('Region').mean()

    # Average count of transactions among customers present in the join.
    transactions_per_customer = enriched.groupby('CustomerID').size()
    mean_frequency = transactions_per_customer.mean()

    return ltv_per_customer, sales_per_category, mean_value_per_region, mean_frequency

# Product Analysis
def analyze_products():
    """Describe the product catalogue held in the module-level ``products_df``.

    Returns:
        tuple: ``(category_dist, price_stats)`` - the number of products per
        ``Category`` and the ``describe()`` summary of the ``Price`` column.
    """
    categories = products_df['Category']
    prices = products_df['Price']
    return categories.value_counts(), prices.describe()

# Generate visualizations
def create_visualizations():
    """Draw the three EDA panels on one figure and save it as a PNG.

    Reads the module-level ``customers_df``, ``transactions_df`` and
    ``products_df``. The figure uses a 2x2 grid with three panels filled and
    is written to ``eda_visualizations.png``; it is not closed afterwards.
    """
    fig = plt.figure(figsize=(15, 10))

    # Panel 1: number of customers in each region.
    region_ax = fig.add_subplot(2, 2, 1)
    sns.countplot(data=customers_df, x='Region', ax=region_ax)
    region_ax.set_title('Customer Distribution by Region')
    region_ax.tick_params(axis='x', labelrotation=45)

    # Panel 2: histogram of transaction totals.
    value_ax = fig.add_subplot(2, 2, 2)
    sns.histplot(data=transactions_df, x='TotalValue', bins=30, ax=value_ax)
    value_ax.set_title('Transaction Value Distribution')

    # Panel 3: number of products in each category.
    category_ax = fig.add_subplot(2, 2, 3)
    sns.countplot(data=products_df, x='Category', ax=category_ax)
    category_ax.set_title('Product Category Distribution')
    category_ax.tick_params(axis='x', labelrotation=45)

    # Tidy the spacing, then persist the figure.
    fig.tight_layout()
    fig.savefig('eda_visualizations.png')

# Run the analysis
region_dist, monthly_signups = analyze_customers()
customer_ltv, category_sales, avg_transaction_by_region, purchase_frequency = analyze_transactions()
category_dist, price_stats = analyze_products()

# Render and save the EDA figure. Without this call create_visualizations()
# is never executed and eda_visualizations.png is never written.
create_visualizations()

# Print insights
print("\nBusiness Insights:")
print("\n1. Customer Geographic Distribution:")
print(region_dist)

print("\n2. Top Product Categories by Sales:")
print(category_sales)

print("\n3. Average Transaction Value by Region:")
print(avg_transaction_by_region)

print("\n4. Product Price Statistics:")
print(price_stats)

print("\n5. Average Purchase Frequency per Customer:")
print(f"Average number of transactions per customer: {purchase_frequency:.2f}")


Business Insights:

1. Customer Geographic Distribution:
Region
South America    59
Europe           50
North America    46
Asia             45
Name: count, dtype: int64

2. Top Product Categories by Sales:
Category
Books          192147.47
Electronics    180783.50
Clothing       166170.66
Home Decor     150893.93
Name: TotalValue, dtype: float64

3. Average Transaction Value by Region:
Region
Asia             697.591606
Europe           710.489872
North America    624.235246
South America    721.554474
Name: TotalValue, dtype: float64

4. Product Price Statistics:
count    100.000000
mean     267.551700
std      143.219383
min       16.080000
25%      147.767500
50%      292.875000
75%      397.090000
max      497.760000
Name: Price, dtype: float64

5. Average Purchase Frequency per Customer:
Average number of transactions per customer: 5.03


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json

# Load the customer, transaction and product tables from the working directory.
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

def create_customer_profile(reference_date=None):
    """Build one feature row per customer from customer, transaction and product data.

    Reads the module-level ``customers_df``, ``transactions_df`` and
    ``products_df``; none of them is modified.

    Parameters
    ----------
    reference_date : datetime-like, optional
        Point in time against which ``account_age_days`` is measured. When
        omitted the current time is used (the previous behaviour); pass a
        fixed date to get the same profile on every run.

    Returns
    -------
    pandas.DataFrame
        A copy of ``customers_df`` extended with:

        * ``account_age_days``;
        * per-customer transaction aggregates (``transaction_count``,
          ``total_spend``, ``avg_transaction_value``, ``std_transaction_value``,
          ``total_items``, ``avg_items_per_transaction``, ``std_items``);
        * one column per product ``Category`` holding that category's share of
          the customer's purchased quantity;
        * ``recent_avg_value``, ``recent_avg_quantity`` and
          ``recent_avg_price_per_item`` computed over the customer's five
          latest transactions.

        Missing values in numeric columns are replaced by 0; non-numeric
        columns are left untouched.
    """
    as_of = pd.Timestamp.now() if reference_date is None else pd.Timestamp(reference_date)

    # 1. Basic customer features from customer data
    customer_profile = customers_df.copy()
    customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate'])
    customer_profile['account_age_days'] = (as_of - customer_profile['SignupDate']).dt.days

    # 2. Transaction-based features. Named aggregation ties every output name
    #    to its source column instead of relying on positional renaming.
    transaction_features = transactions_df.groupby('CustomerID').agg(
        transaction_count=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        std_transaction_value=('TotalValue', 'std'),
        total_items=('Quantity', 'sum'),
        avg_items_per_transaction=('Quantity', 'mean'),
        std_items=('Quantity', 'std'),
    ).reset_index()

    # 3. Product category preferences: quantity bought per category,
    #    normalised to a share of the customer's total quantity.
    trans_with_categories = transactions_df.merge(
        products_df[['ProductID', 'Category']],
        on='ProductID'
    )
    category_pivot = pd.pivot_table(
        trans_with_categories,
        index='CustomerID',
        columns='Category',
        values='Quantity',
        aggfunc='sum',
        fill_value=0
    )
    category_sums = category_pivot.sum(axis=1)
    category_pivot = category_pivot.div(category_sums, axis=0).fillna(0)

    # 4. Recent behaviour: averages over each customer's five latest transactions
    trans_with_categories['TransactionDate'] = pd.to_datetime(trans_with_categories['TransactionDate'])
    recent_transactions = (
        trans_with_categories
        .sort_values('TransactionDate')
        .groupby('CustomerID')
        .tail(5)
    )
    recent_features = recent_transactions.groupby('CustomerID').agg(
        recent_avg_value=('TotalValue', 'mean'),
        recent_avg_quantity=('Quantity', 'mean'),
    ).reset_index()

    # Mask a zero average quantity before dividing: the ratio then becomes NaN
    # (filled with 0 below) instead of inf, which would survive fillna.
    recent_qty = recent_features['recent_avg_quantity']
    recent_features['recent_avg_price_per_item'] = (
        recent_features['recent_avg_value'] / recent_qty.where(recent_qty != 0)
    )

    # 5. Merge all features; left joins keep customers without transactions
    customer_profile = customer_profile.merge(transaction_features, on='CustomerID', how='left')
    customer_profile = customer_profile.merge(category_pivot.reset_index(), on='CustomerID', how='left')
    customer_profile = customer_profile.merge(recent_features, on='CustomerID', how='left')

    # Zero-fill only the numeric feature columns so that a missing name, region
    # or signup date is not overwritten with the integer 0.
    numeric_cols = customer_profile.select_dtypes(include='number').columns
    customer_profile = customer_profile.fillna(dict.fromkeys(numeric_cols, 0))

    return customer_profile

def find_lookalikes(customer_profile, target_customers, n_recommendations=3):
    """Find the most similar customers for each target using cosine similarity.

    Parameters
    ----------
    customer_profile : pandas.DataFrame
        One row per customer with a ``CustomerID`` column. Every ``float64`` /
        ``int64`` column other than ``CustomerID`` is used as a feature.
    target_customers : iterable
        Customer IDs to generate recommendations for.
    n_recommendations : int, default 3
        Number of lookalikes returned per target.

    Returns
    -------
    dict
        Maps ``str(target_id)`` to a list of
        ``{'customer_id': str, 'similarity_score': float}`` dicts ordered from
        most to least similar. The target itself is never included.

    Raises
    ------
    IndexError
        If a target ID does not occur in ``customer_profile['CustomerID']``.
    """
    # Select features for similarity calculation
    feature_cols = customer_profile.select_dtypes(include=['float64', 'int64']).columns
    feature_cols = feature_cols.drop(['CustomerID']) if 'CustomerID' in feature_cols else feature_cols

    # Standardise the features so no single one dominates the cosine similarity
    normalized_features = StandardScaler().fit_transform(customer_profile[feature_cols])
    similarity_matrix = cosine_similarity(normalized_features)

    customer_ids = customer_profile['CustomerID'].values

    # The similarity matrix is addressed by row *position*, so map each ID to
    # its position (first occurrence) rather than to the frame's index label,
    # which only coincides with the position for a default RangeIndex.
    position_of = {}
    for position, customer_id in enumerate(customer_ids):
        position_of.setdefault(customer_id, position)

    lookalike_results = {}
    for target_id in target_customers:
        if target_id not in position_of:
            raise IndexError(f"CustomerID {target_id!r} not found in customer_profile")
        target_pos = position_of[target_id]
        similarities = similarity_matrix[target_pos]

        # Rank by descending similarity and drop the target explicitly: with
        # tied scores the target is not guaranteed to sort into first place.
        ranked = np.argsort(similarities)[::-1]
        top_matches = ranked[ranked != target_pos][:n_recommendations]

        lookalike_results[str(target_id)] = [
            {
                'customer_id': str(customer_ids[idx]),
                'similarity_score': float(similarities[idx])
            }
            for idx in top_matches
        ]

    return lookalike_results

# Create customer profiles
print("Creating customer profiles...")
customer_profile = create_customer_profile()

# Get first 20 customers
target_customers = customers_df['CustomerID'].iloc[:20].tolist()

# Generate lookalike recommendations
print("Generating lookalike recommendations...")
lookalike_results = find_lookalikes(customer_profile, target_customers)

# Save results to CSV. The file previously received a JSON dump despite its
# .csv name; write real CSV instead, one row per (target, lookalike) pair.
print("Saving results...")
lookalike_rows = [
    {
        'target_customer_id': target_id,
        'rank': rank,
        'lookalike_customer_id': rec['customer_id'],
        'similarity_score': rec['similarity_score'],
    }
    for target_id, recs in lookalike_results.items()
    for rank, rec in enumerate(recs, start=1)
]
lookalike_table = pd.DataFrame(
    lookalike_rows,
    columns=['target_customer_id', 'rank', 'lookalike_customer_id', 'similarity_score'],
)
lookalike_table.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Print sample results
print("\nSample lookalike recommendations:")
for target_id in list(lookalike_results.keys())[:3]:
    print(f"\nTarget Customer: {target_id}")
    for rec in lookalike_results[target_id]:
        print(f"Similar Customer: {rec['customer_id']}, Similarity Score: {rec['similarity_score']:.4f}")

Creating customer profiles...
Generating lookalike recommendations...
Saving results...

Sample lookalike recommendations:

Target Customer: C0001
Similar Customer: C0005, Similarity Score: 0.7979
Similar Customer: C0069, Similarity Score: 0.7179
Similar Customer: C0130, Similarity Score: 0.6831

Target Customer: C0002
Similar Customer: C0060, Similarity Score: 0.7916
Similar Customer: C0062, Similarity Score: 0.7798
Similar Customer: C0025, Similarity Score: 0.7519

Target Customer: C0003
Similar Customer: C0144, Similarity Score: 0.9027
Similar Customer: C0091, Similarity Score: 0.7028
Similar Customer: C0151, Similarity Score: 0.6788


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Set environment variable to avoid KMeans memory leak warning
os.environ['OMP_NUM_THREADS'] = '1'

# Filter warnings
warnings.filterwarnings('ignore', category=UserWarning)

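# NOTE: prepare_clustering_features() and perform_clustering_analysis(), which
# are called below, are not defined in this cell; they are presumably carried
# over unchanged from an earlier version of this cell. TODO: restore their
# definitions here so the notebook runs on a fresh kernel.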

def visualize_clusters(features, X, kmeans, save_prefix='clustering'):
    """Create and save the visualizations and summary table for a clustering run.

    Each of the four artefacts is produced on a best-effort basis: a failure
    is reported on stdout and the remaining artefacts are still attempted.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``Cluster`` column plus ``recency``, ``frequency``,
        ``monetary``, ``avg_transaction_value`` and
        ``avg_items_per_transaction``.
    X : array-like
        Feature matrix projected to two components with PCA; its rows are
        coloured by ``features['Cluster']``, so both must share row order.
    kmeans : object
        Accepted for interface compatibility; not used by this function.
    save_prefix : str, default 'clustering'
        Prefix of every output file: ``<prefix>_pca.png``,
        ``<prefix>_sizes.png``, ``<prefix>_characteristics.png`` and
        ``<prefix>_insights.csv``.
    """
    print("\nGenerating visualizations...")

    # 1. PCA visualization
    fig = None
    try:
        from sklearn.decomposition import PCA
        X_pca = PCA(n_components=2).fit_transform(X)

        fig = plt.figure(figsize=(10, 6))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=features['Cluster'], cmap='viridis')
        plt.title('Cluster Visualization using PCA')
        plt.xlabel('First Principal Component')
        plt.ylabel('Second Principal Component')
        plt.colorbar(scatter)
        pca_path = f'{save_prefix}_pca.png'
        plt.savefig(pca_path)
        # Report the file actually written rather than a hard-coded name.
        print(f"✓ Created PCA visualization ({pca_path})")
    except Exception as e:
        print(f"× Error creating PCA visualization: {str(e)}")
    finally:
        # Close the figure even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)

    # 2. Cluster sizes visualization
    fig = None
    try:
        fig = plt.figure(figsize=(12, 6))
        features['Cluster'].value_counts().sort_index().plot(kind='bar')
        plt.title('Cluster Sizes')
        plt.xlabel('Cluster')
        plt.ylabel('Number of Customers')
        sizes_path = f'{save_prefix}_sizes.png'
        plt.savefig(sizes_path)
        print(f"✓ Created cluster sizes visualization ({sizes_path})")
    except Exception as e:
        print(f"× Error creating cluster sizes visualization: {str(e)}")
    finally:
        if fig is not None:
            plt.close(fig)

    # 3. Feature importance heatmap
    fig = None
    try:
        feature_cols = [
            'recency', 'frequency', 'monetary',
            'avg_transaction_value', 'avg_items_per_transaction'
        ]

        cluster_means = features.groupby('Cluster')[feature_cols].mean()

        # Standardise each feature across clusters so the colours are comparable.
        cluster_means_scaled = pd.DataFrame(
            StandardScaler().fit_transform(cluster_means),
            index=cluster_means.index,
            columns=cluster_means.columns
        )

        fig = plt.figure(figsize=(15, 8))
        sns.heatmap(cluster_means_scaled, cmap='YlOrRd', annot=True, fmt='.2f')
        plt.title('Cluster Characteristics (Standardized Values)')
        heatmap_path = f'{save_prefix}_characteristics.png'
        plt.savefig(heatmap_path)
        print(f"✓ Created cluster characteristics heatmap ({heatmap_path})")
    except Exception as e:
        print(f"× Error creating characteristics heatmap: {str(e)}")
    finally:
        if fig is not None:
            plt.close(fig)

    # 4. Save cluster insights: per-cluster means of the three RFM columns
    try:
        cluster_insights = (
            features.groupby('Cluster')[['monetary', 'frequency', 'recency']]
            .mean()
            .add_prefix('avg_')
        )
        insights_path = f'{save_prefix}_insights.csv'
        cluster_insights.to_csv(insights_path)
        print(f"✓ Saved cluster insights to {insights_path}")
    except Exception as e:
        print(f"× Error saving cluster insights: {str(e)}")

# Execute clustering analysis
print("1. Preparing features for clustering...")
clustering_features = prepare_clustering_features()

print("\n2. Performing clustering analysis...")
clustered_data, metrics, X, final_kmeans = perform_clustering_analysis(clustering_features)

# idxmin() returns an index *label*, so the row must be fetched with .loc;
# .iloc would pick the wrong row (or fail) unless the index is 0..n-1.
# The lookup is done once and reused for all three scores.
best_metrics = metrics.loc[metrics['db_score'].idxmin()]

print("\n3. Clustering Results:")
print(f"✓ Optimal number of clusters: {len(clustered_data['Cluster'].unique())}")
print(f"✓ Davies-Bouldin Index: {best_metrics['db_score']:.4f}")
print(f"✓ Silhouette Score: {best_metrics['silhouette_score']:.4f}")
print(f"✓ Calinski-Harabasz Score: {best_metrics['calinski_harabasz_score']:.4f}")

print("\n4. Cluster sizes:")
print(clustered_data['Cluster'].value_counts().sort_index())

print("\n5. Creating visualizations and saving results...")
visualize_clusters(clustered_data, X, final_kmeans)

# Save final results; each file is written on a best-effort basis
try:
    clustered_data.to_csv('FirstName_LastName_Clustering_Results.csv', index=False)
    print("✓ Saved clustering results to FirstName_LastName_Clustering_Results.csv")
except Exception as e:
    print(f"× Error saving clustering results: {str(e)}")

try:
    metrics.to_csv('FirstName_LastName_Clustering_Metrics.csv', index=False)
    print("✓ Saved clustering metrics to FirstName_LastName_Clustering_Metrics.csv")
except Exception as e:
    print(f"× Error saving clustering metrics: {str(e)}")

print("\nClustering analysis completed!")

1. Preparing features for clustering...

2. Performing clustering analysis...

3. Clustering Results:
✓ Optimal number of clusters: 7
✓ Davies-Bouldin Index: 1.4300
✓ Silhouette Score: 0.1727
✓ Calinski-Harabasz Score: 31.8906

4. Cluster sizes:
Cluster
0    55
1    32
2     9
3    43
4    18
5    32
6    11
Name: count, dtype: int64

5. Creating visualizations and saving results...

Generating visualizations...
✓ Created PCA visualization (clustering_pca.png)
✓ Created cluster sizes visualization (clustering_sizes.png)
✓ Created cluster characteristics heatmap (clustering_characteristics.png)
✓ Saved cluster insights to clustering_insights.csv
✓ Saved clustering results to FirstName_LastName_Clustering_Results.csv
✓ Saved clustering metrics to FirstName_LastName_Clustering_Metrics.csv

Clustering analysis completed!
