## Import Dependencies

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
# Suppress every warning category for the rest of the session so cell output
# stays readable.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and clustering calls below — confirm that is intended.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# (matplotlib's seaborn-like style sheet plus seaborn's "husl" colour palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load and Prepare Data

In [None]:
# Read the cleaned dataset from disk into a DataFrame
data_path = "../dataset/clean_data.csv"
df = pd.read_csv(data_path)

# Quick sanity check: dimensions, column names, and a preview of the rows
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

## Identify Top 5 Causes of Death

In [None]:
# Drop the aggregate 'All Causes' rows so only specific causes are ranked
is_specific_cause = df['Indicator Name'] != 'All Causes'
filtered_df = df[is_specific_cause]

# Sum 'Number' per cause over every remaining row, largest total first
deaths_by_cause = filtered_df.groupby('Indicator Name')['Number'].sum()
cause_totals = deaths_by_cause.sort_values(ascending=False)
top5_data = cause_totals.head(5)
top5_causes = top5_data.index.tolist()

print("Top 5 Causes of Death:")
for rank, (cause_name, total) in enumerate(top5_data.items(), start=1):
    print(f"{rank}. {cause_name}: {total:,.0f} total deaths")

# Bar chart of the five largest totals; long cause names are cut to 30 chars
bar_positions = range(len(top5_data))
tick_labels = [
    name if len(name) <= 30 else name[:30] + '...'
    for name in top5_data.index
]

plt.figure(figsize=(12, 6))
bars = plt.bar(bar_positions, top5_data.values, color='steelblue', alpha=0.7)
plt.xlabel('Cause of Death')
plt.ylabel('Total Deaths')
plt.title('Top 5 Causes of Death (2000-2020)', fontsize=14, fontweight='bold')
plt.xticks(bar_positions, tick_labels, rotation=45, ha='right')
# Show the y axis in thousands
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x/1000:.0f}K'))

# Write each total (in thousands) just above its bar
for bar, value in zip(bars, top5_data.values):
    x_center = bar.get_x() + bar.get_width()/2
    y_text = bar.get_height() + value*0.01
    plt.text(x_center, y_text, f'{value/1000:.0f}K', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Prepare Data for Clustering Analysis

In [None]:
# Keep rows that are (a) one of the top 5 causes, (b) a specific age band
# rather than the 'All Ages' roll-up, and (c) the combined-sex series.
in_top5 = df['Indicator Name'].isin(top5_causes)
is_age_specific = df['Age Category'] != 'All Ages'
is_combined_sex = df['Sex'] == 'All'  # Focus on combined gender data for simplicity
clustering_data = df[in_top5 & is_age_specific & is_combined_sex].copy()

age_labels = sorted(clustering_data['Age Category'].unique())
first_year = clustering_data['Year'].min()
last_year = clustering_data['Year'].max()

print(f"Filtered data shape: {clustering_data.shape}")
print(f"Unique age categories: {age_labels}")
print(f"Years covered: {first_year} - {last_year}")

In [None]:
# Build the clustering features: for every (age category, cause) pair take the
# mean of each mortality metric over all rows (i.e. over the years).
metric_columns = ['Number', 'Death Rate', 'Percent of All Causes']
feature_matrix = (
    clustering_data
    .groupby(['Age Category', 'Indicator Name'])[metric_columns]
    .mean()
    .reset_index()
)

# Reshape so each row is an age category and each column a cause, holding the
# mean death rate; combinations absent from the data become 0.
death_rate_matrix = (
    feature_matrix
    .pivot(index='Age Category', columns='Indicator Name', values='Death Rate')
    .fillna(0)
)

print(f"Death Rate Matrix shape: {death_rate_matrix.shape}")
print(f"\nAge categories: {death_rate_matrix.index.tolist()}")
death_rate_matrix.head()

## Heatmap Analysis of Death Causes Across Age Categories

In [None]:
# Visualize the death rate heatmap
# The matrix is transposed so causes run down the y axis and age categories
# along the x axis; each cell is annotated with its value to one decimal.
plt.figure(figsize=(14, 8))
sns.heatmap(death_rate_matrix.T, annot=True, fmt='.1f', cmap='YlOrRd', cbar_kws={'label': 'Death Rate per 100,000'})
plt.title('Death Rates by Age Category and Cause of Death', fontsize=14, fontweight='bold')
plt.xlabel('Age Category')
plt.ylabel('Cause of Death')
# Tilt the age labels and keep the cause labels horizontal for legibility
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Row-wise summary per age category across the cause columns
age_stats = death_rate_matrix.agg(['mean', 'std', 'max', 'sum'], axis=1)

# The risk score is the sum of the death rates; rank age categories by it,
# highest first.
age_stats = (
    age_stats
    .assign(total_risk_score=age_stats['sum'])
    .sort_values('total_risk_score', ascending=False)
)

print("Age Category Risk Summary:")
print(age_stats.round(2))

## Data Preprocessing for Clustering

In [None]:
# Standardize every cause column before clustering
scaler = StandardScaler().fit(death_rate_matrix)
scaled_features = scaler.transform(death_rate_matrix)

# Wrap the scaled array in a DataFrame that keeps the original row and column
# labels, which makes it easier to interpret.
scaled_df = pd.DataFrame(
    data=scaled_features,
    columns=death_rate_matrix.columns,
    index=death_rate_matrix.index,
)

print(f"Scaled features shape: {scaled_df.shape}")
print("\nScaled features summary:")
print(scaled_df.describe().round(3))

## DBSCAN Clustering Analysis

### 1. Determining Optimal DBSCAN Min Samples and Epsilon Parameter Values

In [None]:
# Set min_samples to 2 due to small dataset size to ensure clusters can form
min_samples = 2

# Query the min_samples nearest neighbours of every point in the scaled data
neighbors = NearestNeighbors(n_neighbors=min_samples)
neighbors_fit = neighbors.fit(scaled_features)
distances, indices = neighbors_fit.kneighbors(scaled_features)

# Keep only the last returned neighbour distance per point, sorted ascending
kth_column = min_samples - 1
distances = np.sort(distances[:, kth_column])

# k-distance curve used to read off a candidate eps for DBSCAN
kdist_fig, kdist_ax = plt.subplots(figsize=(8, 4))
kdist_ax.plot(distances)
kdist_ax.set_title('k-distance graph for DBSCAN eps determination')
kdist_ax.set_xlabel('Points sorted by distance')
kdist_ax.set_ylabel(f'{min_samples}th nearest neighbor distance')
kdist_ax.grid(True, alpha=0.3)
plt.show()

### 2. Apply DBSCAN Clustering

In [None]:
# DBSCAN configuration
# eps is set to 0.5 based on the optimal value observed from the k-distance graph
# min_samples is set to 2 to accommodate smaller data point instances in this dataset
dbscan = DBSCAN(eps=0.5, min_samples=min_samples)

# Cluster the scaled features; a label of -1 marks a noise point
dbscan_labels = dbscan.fit_predict(scaled_features)

# Result table: the original death rates plus the cluster label and the risk
# score (aligned on the age-category index), ordered by cluster and then by
# descending risk score.
dbscan_df = (
    death_rate_matrix
    .assign(Cluster=dbscan_labels, Risk_Score=age_stats['total_risk_score'])
    .sort_values(['Cluster', 'Risk_Score'], ascending=[True, False])
)

print("DBSCAN Clustering Results:")
print("=" * 60)

# All labels that occur, including -1 for noise
unique_clusters = sorted(dbscan_df['Cluster'].unique())

# List each group's members with their risk scores
for cluster_id in unique_clusters:
    cluster_ages = dbscan_df[dbscan_df['Cluster'] == cluster_id]
    avg_risk = cluster_ages['Risk_Score'].mean()

    heading = "Noise Points" if cluster_id == -1 else f"Cluster {cluster_id}"
    print(f"\n{heading} (Avg Risk Score: {avg_risk:.1f}):")

    for age, risk_score in cluster_ages['Risk_Score'].items():
        print(f"  - {age}: {risk_score:.1f}")


### 3. Risk Segment Analysis

In [None]:
# Summarise every DBSCAN group (including the noise group, label -1) and rank
# the groups by their average risk score.
dbscan_cluster_analysis = []

# Get unique clusters (including noise)
unique_clusters = sorted(dbscan_df['Cluster'].unique())

for cluster_id in unique_clusters:
    cluster_ages = dbscan_df[dbscan_df['Cluster'] == cluster_id]
    risk_scores = cluster_ages['Risk_Score']

    dbscan_cluster_analysis.append({
        'Cluster': cluster_id,
        'Age_Categories': list(cluster_ages.index),
        'Count': len(cluster_ages),
        'Avg_Risk_Score': risk_scores.mean(),
        'Min_Risk_Score': risk_scores.min(),
        'Max_Risk_Score': risk_scores.max(),
        'Risk_Level': ''
    })

# Sort clusters by average risk score (descending)
dbscan_cluster_analysis.sort(key=lambda x: x['Avg_Risk_Score'], reverse=True)

# Assign risk levels: the top-ranked group is High, the bottom-ranked group is
# Low, and every group in between is Medium (so several groups can share the
# Medium label when there are more than three groups).
risk_levels = ['High Risk', 'Medium Risk', 'Low Risk']
high_level, medium_level, low_level = risk_levels
last_position = len(dbscan_cluster_analysis) - 1

for i, cluster in enumerate(dbscan_cluster_analysis):
    if i == 0:
        cluster['Risk_Level'] = high_level
    elif i == last_position:
        cluster['Risk_Level'] = low_level
    else:
        cluster['Risk_Level'] = medium_level

# Display DBSCAN cluster analysis
print("DBSCAN Risk Segment Analysis:")
print("="*80)

for cluster in dbscan_cluster_analysis:
    # DBSCAN's -1 label is noise, not a real cluster; name it accordingly,
    # as the other DBSCAN cells do.
    group_name = 'Noise' if cluster['Cluster'] == -1 else f"Cluster {cluster['Cluster']}"
    print(f"\n{cluster['Risk_Level']} ({group_name}):")
    print(f"  Age Categories: {', '.join(cluster['Age_Categories'])}")
    print(f"  Average Risk Score: {cluster['Avg_Risk_Score']:.1f}")
    print(f"  Risk Score Range: {cluster['Min_Risk_Score']:.1f} - {cluster['Max_Risk_Score']:.1f}")
    print(f"  Number of Age Groups: {cluster['Count']}")


### 4. Clustering Result Visualization

In [None]:
# 2x2 matplotlib dashboard summarising the DBSCAN result.

# Prepare combined dataframe for strip plot
strip_df = dbscan_df.copy()
strip_df['Cluster_Label'] = strip_df['Cluster'].apply(lambda x: 'Noise' if x == -1 else f'Cluster {x}')

# Create figure
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('DBSCAN Clustering Analysis Summary', fontsize=20, fontweight='bold')

# Define colour palettes
unique_cluster_labels = strip_df['Cluster_Label'].unique()
cluster_palette = dict(zip(unique_cluster_labels, sns.color_palette("Set2", len(unique_cluster_labels))))

risk_palette = {'High Risk': '#E74C3C', 'Medium-High Risk': '#E67E22',
                'Medium Risk': '#F1C40F', 'Low-Medium Risk': '#3498DB', 'Low Risk': '#2ECC71'}

# 1. Cluster visualization with risk scores (strip plot)
ax1 = axes[0,0]
sns.stripplot(data=strip_df, x='Risk_Score', y=strip_df.index,
              hue='Cluster_Label', palette=cluster_palette,
              dodge=False, size=10, alpha=0.8, orient='h', ax=ax1)
ax1.set_xlabel('Total Risk Score')
ax1.set_ylabel('Age Categories')
ax1.set_title('Age Categories by Risk Score and Cluster')
ax1.legend(title='Clusters', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)

# 2. Risk level distribution (horizontal bar)
# Several groups can share one risk level (every middle group is
# 'Medium Risk'), so counts are accumulated per level; a plain dict
# comprehension would keep only the last group's count for that level.
ax2 = axes[0,1]
risk_counts = {}
for cluster in dbscan_cluster_analysis:
    level = cluster['Risk_Level']
    risk_counts[level] = risk_counts.get(level, 0) + cluster['Count']
sns.barplot(x=list(risk_counts.values()), y=list(risk_counts.keys()),
            palette=[risk_palette.get(k, 'gray') for k in risk_counts.keys()], ax=ax2)
ax2.set_xlabel('Number of Age Groups')
ax2.set_ylabel('Risk Level')
ax2.set_title('Distribution of Age Categories by Risk Level')

# 3. Heatmap of the death rates, rows in the clustered order of dbscan_df
ax3 = axes[1,0]
cluster_matrix = dbscan_df.drop(['Cluster', 'Risk_Score'], axis=1)
sns.heatmap(cluster_matrix, cmap='coolwarm', cbar_kws={'label': 'Death Rate per 100,000'},
            yticklabels=dbscan_df.index, ax=ax3)
ax3.set_title('Death Rate Heatmap by Age and Cause')
ax3.set_xlabel('Causes of Death')
ax3.set_ylabel('Age Categories')

# 4. Cluster characteristics (vertical bar with value labels)
# One bar per group, coloured by the group's risk level
ax4 = axes[1,1]
cluster_stats = []
cluster_names = []
bar_colors = []
for cluster in dbscan_cluster_analysis:
    cluster_stats.append(cluster['Avg_Risk_Score'])
    cluster_name = "Noise" if cluster["Cluster"] == -1 else f'Cluster {cluster["Cluster"]}'
    cluster_names.append(cluster_name)
    bar_colors.append(risk_palette.get(cluster['Risk_Level'], 'gray'))

bars = ax4.bar(cluster_names, cluster_stats, color=bar_colors, alpha=0.85)
ax4.set_xlabel('Clusters')
ax4.set_ylabel('Average Risk Score')
ax4.set_title('Average Risk Score by Cluster')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax4.annotate(f'{height:.1f}',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 5),
                 textcoords="offset points",
                 ha='center', va='bottom')

# Final layout adjustments (leave room at the top for the suptitle)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


### 5. Detailed Analysis of Each Risk Segment

In [None]:
# Analyze mortality patterns for each DBSCAN risk segment
print("DBSCAN Detailed Risk Segment Analysis:")
print("="*80)

for cluster_info in dbscan_cluster_analysis:
    cluster_id = cluster_info['Cluster']
    risk_level = cluster_info['Risk_Level']
    age_categories = cluster_info['Age_Categories']

    # DBSCAN's -1 label is noise, not a real cluster; name it accordingly,
    # as the other DBSCAN cells do.
    group_name = 'Noise' if cluster_id == -1 else f'Cluster {cluster_id}'
    print(f"\n{risk_level.upper()} ({group_name})")
    print("-" * 50)

    # Rows of the result table that belong to this group
    cluster_data = dbscan_df[dbscan_df['Cluster'] == cluster_id]

    print(f"Age Categories: {', '.join(age_categories)}")

    # Mean death rate per cause within the group, largest first
    cluster_causes = cluster_data.drop(['Cluster', 'Risk_Score'], axis=1)
    avg_death_rates = cluster_causes.mean(axis=0).sort_values(ascending=False)

    print("\nTop Causes of Death (Average Death Rate per 100,000):")
    for i, (cause, rate) in enumerate(avg_death_rates.items(), 1):
        print(f"  {i}. {cause}: {rate:.1f}")

    # Risk score of every age category in the group
    print("\nAge-Specific Risk Scores:")
    for age in age_categories:
        risk_score = dbscan_df.loc[age, 'Risk_Score']
        print(f"  - {age}: {risk_score:.1f}")

    print(f"\nAverage Risk Score: {cluster_info['Avg_Risk_Score']:.1f}")
    print(f"Risk Score Range: {cluster_info['Min_Risk_Score']:.1f} - {cluster_info['Max_Risk_Score']:.1f}")


### 6. DBSCAN Clustering Summary Visualisation

In [None]:
# Create DBSCAN summary visualization with varied styles (2x2 plotly grid)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'DBSCAN Risk Level Distribution',
        'Age Categories by Risk Score',
        'Cluster Size Distribution (Pie)',
        'Top Causes Across All Age Groups'
    ),
    specs=[
        [{"type": "pie"}, {"type": "scatter"}],
        [{"type": "pie"}, {"type": "bar"}]
    ]
)

# 1. Risk level distribution as a pie chart
# Several groups can share one risk level (every middle group is
# 'Medium Risk'), so counts are accumulated per level; a plain dict
# comprehension would keep only the last group's count for that level.
risk_counts = {}
for cluster in dbscan_cluster_analysis:
    level = cluster['Risk_Level']
    risk_counts[level] = risk_counts.get(level, 0) + cluster['Count']
fig.add_trace(
    go.Pie(labels=list(risk_counts.keys()), values=list(risk_counts.values()),
           hole=0.4,
           marker=dict(colors=['#FF6F61', '#6B5B95', '#88B04B', '#F7CAC9', '#92A8D1']),
           textinfo='label+percent'),
    row=1, col=1
)

# 2. Age categories by risk score as a bubble scatter plot
# Marker size and colour both encode the risk score; hover text shows the label
fig.add_trace(
    go.Scatter(
        x=dbscan_df.index,
        y=dbscan_df['Risk_Score'],
        mode='markers',
        marker=dict(
            size=dbscan_df['Risk_Score']/10 + 10,  # bubble size
            color=dbscan_df['Risk_Score'],
            colorscale='Plasma',
            showscale=True,
            colorbar=dict(title='Risk Score')
        ),
        text=['Cluster: '+str(c) for c in dbscan_df['Cluster']],
        name='Age Risk Bubble'
    ),
    row=1, col=2
)

# 3. Cluster size distribution as a donut pie chart
# Label -1 is DBSCAN noise, so it is shown as 'Noise' rather than 'Cluster -1'
cluster_sizes = dbscan_df['Cluster'].value_counts().sort_index()
cluster_size_labels = ['Noise' if c == -1 else f'Cluster {c}' for c in cluster_sizes.index]
fig.add_trace(
    go.Pie(labels=cluster_size_labels, values=cluster_sizes.values,
           hole=0.5,
           marker=dict(colors=['#955251', '#B565A7', '#009B77', '#DD4124', '#45B8AC']),
           textinfo='label+value'),
    row=2, col=1
)

# 4. Top causes overall as horizontal bar chart
overall_causes = death_rate_matrix.mean(axis=0).sort_values(ascending=True)  # reverse for horizontal
fig.add_trace(
    go.Bar(
        x=overall_causes.values,
        y=[cause[:20] + '...' if len(cause) > 20 else cause for cause in overall_causes.index],
        orientation='h',
        marker_color='teal',
        name='Avg Death Rate'
    ),
    row=2, col=2
)

fig.update_layout(
    height=900,
    title_text="DBSCAN Clustering Analysis Summary (Enhanced Visuals)",
    title_x=0.5,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family="Arial", size=12, color="black"),
    showlegend=False
)

fig.show()


## Hierarchical Clustering Analysis

### 1. Determine Optimal Number of Clusters

In [None]:
# Evaluate Ward agglomerative clustering for a range of cluster counts using
# the silhouette score and the within-cluster sum of squares (WCSS).
cluster_range = range(2, min(8, len(scaled_df)))  # Test 2 to 7 clusters (or max possible)
if not cluster_range:
    # With fewer than 3 rows there is no candidate count to score, and the
    # argmax below would fail on an empty list.
    raise ValueError(
        f"Need at least 3 age categories to compare cluster counts; got {len(scaled_df)}."
    )

silhouette_scores = []
inertias = []

for n_clusters in cluster_range:
    # Agglomerative clustering
    agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(scaled_features)

    # Calculate silhouette score
    silhouette_avg = silhouette_score(scaled_features, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Calculate within-cluster sum of squares (WCSS): squared distance of
    # every point to the mean of its own cluster, summed over all clusters
    wcss = 0
    for i in range(n_clusters):
        cluster_points = scaled_features[cluster_labels == i]
        if len(cluster_points) > 0:
            cluster_center = cluster_points.mean(axis=0)
            wcss += ((cluster_points - cluster_center) ** 2).sum()
    inertias.append(wcss)

# Plot the results
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Silhouette scores
ax1.plot(cluster_range, silhouette_scores, marker='o', linewidth=2, markersize=8)
ax1.set_xlabel('Number of Clusters')
ax1.set_ylabel('Silhouette Score')
ax1.set_title('Silhouette Score vs Number of Clusters')
ax1.grid(True, alpha=0.3)
# The cluster count with the highest silhouette score is taken as the best
best_silhouette_n = cluster_range[int(np.argmax(silhouette_scores))]
ax1.axvline(x=best_silhouette_n, color='red', linestyle='--', alpha=0.7, label=f'Best: {best_silhouette_n} clusters')
ax1.legend()

# Elbow method (WCSS)
ax2.plot(cluster_range, inertias, marker='o', linewidth=2, markersize=8)
ax2.set_xlabel('Number of Clusters')
ax2.set_ylabel('Within-Cluster Sum of Squares')
ax2.set_title('Elbow Method for Optimal Clusters')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Best number of clusters based on silhouette score: {best_silhouette_n}")
print(f"Silhouette scores: {dict(zip(cluster_range, [round(s, 3) for s in silhouette_scores]))}")

### 2. Generate Dendrogram

In [None]:
# Create linkage matrix for dendrogram
# Ward linkage on the standardized features, matching the linkage passed to
# AgglomerativeClustering elsewhere in this notebook.
linkage_matrix = linkage(scaled_features, method='ward')

# Plot dendrogram
# Leaves are labelled with the age categories (row labels of the death-rate matrix)
plt.figure(figsize=(12, 8))
dendrogram_plot = dendrogram(
    linkage_matrix,
    labels=death_rate_matrix.index.tolist(),
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
    leaf_rotation=45,
    leaf_font_size=10
)

plt.title('Hierarchical Clustering Dendrogram\n(Age Categories based on Top 5 Cause Death Rates)', 
          fontsize=14, fontweight='bold')
plt.xlabel('Age Categories')
plt.ylabel('Distance (Ward Linkage)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### 3. Apply Agglomerative Clustering

In [None]:
# Fit Ward agglomerative clustering with the silhouette-selected cluster count
optimal_clusters = best_silhouette_n
agg_clustering = AgglomerativeClustering(n_clusters=optimal_clusters, linkage='ward')
cluster_labels = agg_clustering.fit_predict(scaled_features)

# Result table: the original death rates plus the cluster label and the risk
# score (aligned on the age-category index), ordered by cluster and then by
# descending risk score.
clustered_df = (
    death_rate_matrix
    .assign(Cluster=cluster_labels, Risk_Score=age_stats['total_risk_score'])
    .sort_values(['Cluster', 'Risk_Score'], ascending=[True, False])
)

print(f"Clustering Results with {optimal_clusters} clusters:")
print("=" * 60)

# List each cluster's members with their risk scores
for cluster_id in range(optimal_clusters):
    cluster_ages = clustered_df[clustered_df['Cluster'] == cluster_id]
    avg_risk = cluster_ages['Risk_Score'].mean()

    print(f"\nCluster {cluster_id} (Avg Risk Score: {avg_risk:.1f}):")
    for age, risk_score in cluster_ages['Risk_Score'].items():
        print(f"  - {age}: {risk_score:.1f}")

## Risk Segment Analysis

In [None]:
# Summarise every hierarchical cluster and rank the clusters by their average
# risk score.
cluster_analysis = []

for cluster_id in range(optimal_clusters):
    cluster_ages = clustered_df[clustered_df['Cluster'] == cluster_id]
    risk_scores = cluster_ages['Risk_Score']

    cluster_analysis.append({
        'Cluster': cluster_id,
        'Age_Categories': list(cluster_ages.index),
        'Count': len(cluster_ages),
        'Avg_Risk_Score': risk_scores.mean(),
        'Min_Risk_Score': risk_scores.min(),
        'Max_Risk_Score': risk_scores.max(),
        'Risk_Level': ''
    })

# Sort clusters by average risk score (highest first)
cluster_analysis.sort(key=lambda x: x['Avg_Risk_Score'], reverse=True)

# Assign risk levels by spreading the ranked clusters evenly over the
# five-step scale, so the top cluster is always 'High Risk' and the bottom
# cluster is always 'Low Risk'. Assigning the labels strictly in list order
# would label the lowest of two clusters 'Medium-High Risk' and leave no
# 'Low Risk' segment at all.
risk_levels = ['High Risk', 'Medium-High Risk', 'Medium Risk', 'Low-Medium Risk', 'Low Risk']
n_segments = len(cluster_analysis)
last_level = len(risk_levels) - 1
for i, cluster in enumerate(cluster_analysis):
    if n_segments == 1:
        level_index = 0
    else:
        level_index = round(i * last_level / (n_segments - 1))
    cluster['Risk_Level'] = risk_levels[level_index]

# Display cluster analysis
print("Risk Segment Analysis:")
print("="*80)

for cluster in cluster_analysis:
    print(f"\n{cluster['Risk_Level']} (Cluster {cluster['Cluster']}):")
    print(f"  Age Categories: {', '.join(cluster['Age_Categories'])}")
    print(f"  Average Risk Score: {cluster['Avg_Risk_Score']:.1f}")
    print(f"  Risk Score Range: {cluster['Min_Risk_Score']:.1f} - {cluster['Max_Risk_Score']:.1f}")
    print(f"  Number of Age Groups: {cluster['Count']}")

## Visualization of Clustering Results

In [None]:
# Create a comprehensive visualization of clustering results (2x2 grid)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# 1. Cluster visualization with risk scores
# One colour per cluster, sampled evenly from the Set3 colormap
colors = plt.cm.Set3(np.linspace(0, 1, optimal_clusters))
for cluster_id in range(optimal_clusters):
    cluster_data = clustered_df[clustered_df['Cluster'] == cluster_id]
    ax1.scatter(cluster_data.index, cluster_data['Risk_Score'],
                c=[colors[cluster_id]], s=100, alpha=0.7, label=f'Cluster {cluster_id}')

ax1.set_xlabel('Age Categories')
ax1.set_ylabel('Total Risk Score')
ax1.set_title('Age Categories Clustered by Risk Score')
ax1.legend()
ax1.tick_params(axis='x', rotation=45)
ax1.grid(True, alpha=0.3)

# 2. Risk level distribution
# More than one cluster can carry the same risk level, so counts are
# accumulated per level; a plain dict comprehension would keep only the last
# cluster's count for that level.
risk_counts = {}
for cluster in cluster_analysis:
    level = cluster['Risk_Level']
    risk_counts[level] = risk_counts.get(level, 0) + cluster['Count']
ax2.pie(list(risk_counts.values()), labels=list(risk_counts.keys()), autopct='%1.1f%%', startangle=90)
ax2.set_title('Distribution of Age Categories by Risk Level')

# 3. Heatmap of clusters (rows follow the clustered order of clustered_df)
cluster_matrix = clustered_df.drop(['Cluster', 'Risk_Score'], axis=1)
im = ax3.imshow(cluster_matrix.values, cmap='YlOrRd', aspect='auto')
ax3.set_xticks(range(len(cluster_matrix.columns)))
ax3.set_xticklabels([col[:15] + '...' if len(col) > 15 else col for col in cluster_matrix.columns], rotation=45)
ax3.set_yticks(range(len(cluster_matrix.index)))
ax3.set_yticklabels(cluster_matrix.index)
ax3.set_title('Death Rates Heatmap (Clustered Order)')
plt.colorbar(im, ax=ax3, label='Death Rate per 100,000')

# 4. Cluster characteristics: mean risk score per cluster
cluster_stats = []
cluster_names = []
for cluster_id in range(optimal_clusters):
    cluster_data = clustered_df[clustered_df['Cluster'] == cluster_id]
    cluster_stats.append(cluster_data['Risk_Score'].mean())
    cluster_names.append(f'Cluster {cluster_id}')

bars = ax4.bar(cluster_names, cluster_stats, color=colors[:len(cluster_stats)], alpha=0.7)
ax4.set_xlabel('Clusters')
ax4.set_ylabel('Average Risk Score')
ax4.set_title('Average Risk Score by Cluster')
ax4.grid(True, alpha=0.3)

# Add value labels on bars
for bar, value in zip(bars, cluster_stats):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(cluster_stats)*0.01,
             f'{value:.1f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Detailed Analysis of Each Risk Segment

In [None]:
# Per-segment report: members, ranked causes, and risk-score statistics
print("Detailed Risk Segment Analysis:")
print("=" * 80)

for cluster_info in cluster_analysis:
    cluster_id = cluster_info['Cluster']
    risk_level = cluster_info['Risk_Level']
    age_categories = cluster_info['Age_Categories']

    print(f"\n{risk_level.upper()} (Cluster {cluster_id})")
    print("-" * 50)

    # Rows of the result table that belong to this cluster
    cluster_data = clustered_df[clustered_df['Cluster'] == cluster_id]

    print(f"Age Categories: {', '.join(age_categories)}")

    # Mean death rate per cause within the cluster, largest first
    cluster_causes = cluster_data.drop(['Cluster', 'Risk_Score'], axis=1)
    avg_death_rates = cluster_causes.mean(axis=0).sort_values(ascending=False)

    print("\nTop Causes of Death (Average Death Rate per 100,000):")
    for position, cause in enumerate(avg_death_rates.index, start=1):
        print(f"  {position}. {cause}: {avg_death_rates[cause]:.1f}")

    # Risk score of every age category in the cluster
    print("\nAge-Specific Risk Scores:")
    for age in age_categories:
        print(f"  - {age}: {clustered_df.loc[age, 'Risk_Score']:.1f}")

    avg_score = cluster_info['Avg_Risk_Score']
    min_score = cluster_info['Min_Risk_Score']
    max_score = cluster_info['Max_Risk_Score']
    print(f"\nAverage Risk Score: {avg_score:.1f}")
    print(f"Risk Score Range: {min_score:.1f} - {max_score:.1f}")

## Health Intervention Recommendations

In [None]:
# Generate targeted health intervention recommendations
def generate_recommendations(cluster_info, cluster_data):
    """
    Build the intervention plan for one risk segment.

    Args:
        cluster_info: dict with at least 'Risk_Level' (str) and
            'Age_Categories' (list of age labels).
        cluster_data: DataFrame of the segment's rows; must contain the
            'Cluster' and 'Risk_Score' columns, with every other column
            holding a cause's death rate.

    Returns:
        dict with keys 'Risk_Level', 'Age_Groups', 'Priority',
        'Interventions' and 'Top_Causes' (up to three causes with the highest
        mean death rate in the segment, highest first).

    The tier is chosen by substring match on the risk level: any level
    containing 'High Risk' (so 'Medium-High Risk' too) is urgent, any other
    level containing 'Medium' is moderate, and everything else is low.
    """
    risk_level = cluster_info['Risk_Level']
    age_categories = cluster_info['Age_Categories']

    # Rank the cause columns by their mean rate within the segment
    cause_columns = cluster_data.drop(['Cluster', 'Risk_Score'], axis=1)
    mean_rates = cause_columns.mean(axis=0)
    top_causes = mean_rates.sort_values(ascending=False).head(3)

    if 'High Risk' in risk_level:
        priority = 'URGENT - Immediate intervention required'
        interventions = [
            'Intensive health monitoring and screening programs',
            'Specialized medical facilities and emergency care',
            'Comprehensive insurance coverage',
            'Family support and palliative care services',
            'Advanced preventive care protocols',
        ]
    elif 'Medium' in risk_level:
        priority = 'MODERATE - Proactive intervention recommended'
        interventions = [
            'Regular health check-ups and screening',
            'Lifestyle modification programs',
            'Chronic disease management',
            'Health education and awareness campaigns',
            'Community health programs',
        ]
    else:
        priority = 'LOW - Preventive measures sufficient'
        interventions = [
            'Basic preventive care and vaccinations',
            'Health education and promotion',
            'Lifestyle counseling',
            'Regular wellness programs',
            'Safety and injury prevention',
        ]

    return {
        'Risk_Level': risk_level,
        'Age_Groups': age_categories,
        'Priority': priority,
        'Interventions': interventions,
        'Top_Causes': list(top_causes.index),
    }

# Build and print one recommendation report per risk segment, keeping every
# report in all_recommendations.
all_recommendations = []

print("TARGETED HEALTH INTERVENTION RECOMMENDATIONS")
print("=" * 80)

for cluster_info in cluster_analysis:
    cluster_id = cluster_info['Cluster']
    cluster_data = clustered_df[clustered_df['Cluster'] == cluster_id]

    recommendations = generate_recommendations(cluster_info, cluster_data)
    all_recommendations.append(recommendations)

    # Assemble the whole report first, then emit it in a single print call
    report_lines = [
        f"\n{recommendations['Risk_Level'].upper()}",
        "-" * 60,
        f"Age Groups: {', '.join(recommendations['Age_Groups'])}",
        f"Priority Level: {recommendations['Priority']}",
        f"Top Causes: {', '.join(recommendations['Top_Causes'])}",
        "\nRecommended Interventions:",
    ]
    report_lines.extend(
        f"  {number}. {intervention}"
        for number, intervention in enumerate(recommendations['Interventions'], 1)
    )
    print("\n".join(report_lines))

## Summary and Key Insights

In [None]:
# Create a summary visualization (2x2 plotly grid) for the hierarchical result
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Risk Level Distribution',
        'Age Categories by Risk Score',
        'Cluster Silhouette Analysis',
        'Top Causes Across All Age Groups'
    ),
    specs=[
        [{"type": "bar"}, {"type": "scatter"}],
        [{"type": "bar"}, {"type": "bar"}]
    ]
)

# 1. Risk level distribution
# More than one cluster can carry the same risk level, so counts are
# accumulated per level; a plain dict comprehension would keep only the last
# cluster's count for that level.
risk_counts = {}
for cluster in cluster_analysis:
    level = cluster['Risk_Level']
    risk_counts[level] = risk_counts.get(level, 0) + cluster['Count']
fig.add_trace(
    go.Bar(x=list(risk_counts.keys()), y=list(risk_counts.values()),
           marker_color='lightblue', name='Age Groups'),
    row=1, col=1
)

# 2. Age categories by risk score (marker colour and text show the cluster id)
fig.add_trace(
    go.Scatter(
        x=clustered_df.index,
        y=clustered_df['Risk_Score'],
        mode='markers+text',
        marker=dict(size=12, color=clustered_df['Cluster'], colorscale='viridis'),
        text=clustered_df['Cluster'],
        textposition='middle center',
        name='Risk Score'
    ),
    row=1, col=2
)

# 3. Silhouette scores for every tested cluster count
fig.add_trace(
    go.Bar(x=list(cluster_range), y=silhouette_scores,
           marker_color='lightgreen', name='Silhouette Score'),
    row=2, col=1
)

# 4. Top causes overall (mean death rate across age categories, highest first)
overall_causes = death_rate_matrix.mean(axis=0).sort_values(ascending=False)
fig.add_trace(
    go.Bar(x=[cause[:20] + '...' if len(cause) > 20 else cause for cause in overall_causes.index],
           y=overall_causes.values,
           marker_color='lightcoral', name='Avg Death Rate'),
    row=2, col=2
)

fig.update_layout(
    height=800,
    title_text="Hierarchical Clustering Analysis Summary",
    title_x=0.5,
    showlegend=False
)

fig.show()

In [None]:
# Final summary and insights
print("KEY INSIGHTS FROM HIERARCHICAL CLUSTERING ANALYSIS")
print("="*80)

print("\n1. CLUSTERING RESULTS:")
print(f"   - Optimal number of clusters: {optimal_clusters}")
print("   - Clustering method: Agglomerative (Ward linkage)")
print(f"   - Silhouette score: {max(silhouette_scores):.3f}")

print("\n2. RISK SEGMENTATION:")
high_risk_ages = []
low_risk_ages = []

# Substring match on the level name: any level containing 'High' counts as
# high risk (this includes 'Medium-High Risk'); otherwise any level
# containing 'Low' counts as low risk (this includes 'Low-Medium Risk').
for cluster in cluster_analysis:
    if 'High' in cluster['Risk_Level']:
        high_risk_ages.extend(cluster['Age_Categories'])
    elif 'Low' in cluster['Risk_Level']:
        low_risk_ages.extend(cluster['Age_Categories'])

print(f"   - Highest risk age groups: {', '.join(high_risk_ages) if high_risk_ages else 'None'}")
print(f"   - Lowest risk age groups: {', '.join(low_risk_ages) if low_risk_ages else 'None'}")

# Recompute the cause ranking here instead of reusing whatever a plotting cell
# last stored in `overall_causes`: that name is assigned with ascending order
# in the DBSCAN summary cell and descending order in the hierarchical one, so
# the "top" causes would otherwise depend on which cell ran last.
overall_causes = death_rate_matrix.mean(axis=0).sort_values(ascending=False)

print("\n3. TOP MORTALITY CAUSES ACROSS ALL AGES:")
for i, (cause, rate) in enumerate(overall_causes.head(3).items(), 1):
    print(f"   {i}. {cause}: {rate:.1f} deaths per 100,000")

print("\n4. ACTIONABLE RECOMMENDATIONS:")
print(f"   - Focus immediate interventions on: {', '.join(high_risk_ages) if high_risk_ages else 'elderly populations'}")
print(f"   - Implement preventive measures for: {', '.join(low_risk_ages) if low_risk_ages else 'younger populations'}")
print(f"   - Priority diseases for intervention: {', '.join(list(overall_causes.head(2).index))}")

print("\n5. METHODOLOGY VALIDATION:")
print("   - Data standardization: Applied (StandardScaler)")
print("   - Distance metric: Euclidean (Ward linkage)")
print("   - Evaluation metric: Silhouette analysis")
print(f"   - Age categories analyzed: {len(death_rate_matrix)}")
print(f"   - Mortality causes considered: {len(top5_causes)} (top 5)")

print("\n" + "="*80)
print("Analysis complete. Use these insights for targeted health policy interventions.")

## K-Means Clustering Analysis

In [None]:
# Use elbow method
# Fit K-Means for a range of k and record the inertia (WCSS); the best k is
# read off the elbow point, where the curve starts to bend.
inertia = []
# K-Means cannot form more clusters than there are samples, so cap the range
# at the number of rows (age categories) in the scaled data.
K = range(1, min(10, len(scaled_df) + 1))

for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(scaled_df)
    inertia.append(km.inertia_)

# Plot Elbow curve
plt.figure(figsize=(8, 4))
plt.plot(K, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (WCSS)')
plt.grid(True)
plt.show()

In [None]:
# Fit the final K-Means model with the number of clusters read off the elbow
# plot (2 here; 3 is the alternative suggested by the curve).
k = 2
kmeans = KMeans(n_clusters=k, random_state=42)

# Store each row's cluster id on the feature matrix itself.
death_rate_matrix['Cluster'] = kmeans.fit(scaled_df).labels_

# Work on an independent copy, ordered by cluster id so members of the same
# cluster sit next to each other.
clustered = death_rate_matrix.copy().sort_values(by='Cluster')
clustered[['Cluster']]

In [None]:
# Label each K-Means cluster with a risk level based on its average death rate.
# Only the feature columns are averaged: 'Cluster' is the grouping key, and
# 'Risk Level' is dropped too (when it already exists) so this cell can be
# re-run after the labels have been added.
cause_rates = clustered.drop(columns=['Cluster', 'Risk Level'], errors='ignore')
cluster_means = cause_rates.groupby(clustered['Cluster']).mean().mean(axis=1)

# Highest mean -> 'High Risk', lowest mean -> 'Low Risk'. With more than two
# clusters the ones in between get 'Medium Risk' instead of being left
# unlabeled (NaN). For k = 2 the mapping is the same as a plain max/min lookup.
risk_map = dict.fromkeys(cluster_means.index, 'Medium Risk')
risk_map[cluster_means.idxmax()] = 'High Risk'
risk_map[cluster_means.idxmin()] = 'Low Risk'

# Add risk labels
clustered['Risk Level'] = clustered['Cluster'].map(risk_map)
clustered[['Cluster', 'Risk Level']]

In [None]:
# Risk segment analysis: how many rows of `clustered` carry each risk label
risk_labels = clustered['Risk Level']
risk_labels.value_counts()

In [None]:
# List the index labels of `clustered` that fall into each risk segment.
risk_column = clustered['Risk Level']
high_risk_rows = clustered.loc[risk_column == 'High Risk']
low_risk_rows = clustered.loc[risk_column == 'Low Risk']

print("High Risk Age Groups:")
print(high_risk_rows.index.tolist())

print()
print("Low Risk Age Groups:")
print(low_risk_rows.index.tolist())

In [None]:
# Average of every feature column within each risk segment. The bookkeeping
# columns ('Cluster', 'Risk Level') are removed before averaging.
cause_rates = clustered.drop(columns=['Cluster', 'Risk Level'])
cluster_means = cause_rates.groupby(clustered['Risk Level']).mean()
cluster_means

In [None]:
# Visualize the risk profile: the columns of cluster_means go on the x-axis,
# with one bar per risk level in each group.
risk_ax = cluster_means.transpose().plot.bar(figsize=(10, 6), colormap='Set1')
risk_ax.set_title('Average Death Rate by Cause (Risk Segments)')
risk_ax.set_ylabel('Death Rate (per 100,000)')
plt.xticks(rotation=45)
risk_ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of scaled_df, the matrix that was passed to K-Means.
heat_fig, heat_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(scaled_df, annot=True, cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Standardized Death Rate by Age Category (Before Clustering)', fontsize=14)
heat_ax.set_xlabel('Cause of Death')
heat_ax.set_ylabel('Age Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Print one bullet per row of `clustered` labelled 'High Risk'.
is_high_risk = clustered['Risk Level'].eq('High Risk')
vulnerable = clustered[is_high_risk]

print("Vulnerable Age Categories (High Risk):")
for age in vulnerable.index:
    print("-", age)

# Detailed Analysis of Each Risk Segment

In [None]:
# Per-segment mean of every remaining column (this includes the numeric
# 'Cluster' id), rounded to two decimals.
grouped_summary = clustered.groupby('Risk Level').mean().round(2)

banner = "=============================="

# Text report: one section per risk segment.
for risk_level, group in clustered.groupby('Risk Level'):
    print("\n" + banner)
    print(f"Risk Segment: {risk_level}")
    print(banner)

    # Index labels of the rows that belong to this segment.
    age_groups = group.index.tolist()
    print("👥 Age Categories in this Segment:")
    print(", ".join(age_groups))

    # Column means inside the segment, largest first.
    print("\nAverage Death Rate per Cause:")
    cause_means = group.drop(columns=['Cluster', 'Risk Level']).mean()
    avg_deaths = cause_means.sort_values(ascending=False)
    print(avg_deaths.to_string())

    top_cause = avg_deaths.idxmax()
    print(f"\nDominant Cause of Death: {top_cause}")

    # Mean of the per-column means, rounded to two decimals.
    total_avg = avg_deaths.mean().round(2)
    print(f"Total Average Death Rate across Top 5 Causes: {total_avg} per 100,000")

# Health Intervention Recommendations

In [None]:
# Recommendation text for the high-risk panel (static, hand-written content).
high_risk = """
High-Risk Segment (Elderly, Age 60+)

• Stroke: Blood pressure checks, stroke awareness
• Diabetes: Free glucose tests, promote healthy diets
• Kidney/Liver: Annual function tests, reduce alcohol
• Medication: Support adherence, home delivery
• Elder Care: Mobile clinics, elderly care programs
"""

# Recommendation text for the low-risk panel (static, hand-written content).
low_risk = """
Low-Risk Segment (Youth & Adults under 40)

• Education: Health talks in schools and colleges
• Early Screening: Checkups starting from age 30
• Lifestyle: Anti-smoking and obesity awareness
• Monitoring: Use data to detect emerging risks
"""

# Blank canvas: the axes only serve as an anchor for the two text boxes.
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')
fig.suptitle(
    "Health Intervention Recommendations by Risk Segment",
    fontsize=16,
    fontweight='bold',
    color='darkblue',
)

# (x position, text, box fill colour, box edge colour) for each panel.
panels = (
    (0.05, high_risk, 'mistyrose', 'red'),
    (0.55, low_risk, 'honeydew', 'green'),
)
for x_pos, panel_text, fill_color, edge_color in panels:
    box_style = {
        'facecolor': fill_color,
        'edgecolor': edge_color,
        'boxstyle': 'round,pad=0.7',
    }
    ax.text(x_pos, 0.75, panel_text, fontsize=12, va='top', ha='left', bbox=box_style)

# Leave the top 5% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


# Summary and Key Insights

In [None]:
# Clustering Summary: recap of the K-Means segmentation stored in `clustered`.
high_risk_ages = clustered[clustered['Risk Level'] == 'High Risk'].index.tolist()
low_risk_ages = clustered[clustered['Risk Level'] == 'Low Risk'].index.tolist()

# Report the number of clusters actually present in the data rather than a
# hard-coded value, so the summary stays correct if k is changed.
n_clusters = clustered['Cluster'].nunique()

print("Analysis Goal:")
print("Cluster age categories based on death rate trends of top 5 growing causes of death:")
print(" - " + ", ".join(top5_causes))

print("\nClustering Method:")
print("- Technique Used: K-Means Clustering")
print(f"- Number of Clusters: {n_clusters}")
print("- Features: Average Death Rates by Age Category for Each Cause")

# An empty segment is printed as 'None' instead of a dangling "  > ".
print("\nResults:")
print(f"- High Risk Segment (Cluster): {len(high_risk_ages)} age groups")
print(f"  > {', '.join(high_risk_ages) if high_risk_ages else 'None'}")
print(f"- Low Risk Segment (Cluster): {len(low_risk_ages)} age groups")
print(f"  > {', '.join(low_risk_ages) if low_risk_ages else 'None'}")

print("\nInsights:")
print("- Elderly age categories dominate the High Risk segment.")
print("- Early intervention and chronic care planning are critical for High Risk groups.")
print("- Younger age groups show consistently low death rates across all top 5 causes.")

print("\nThis segmentation allows for targeted health policies based on data-driven evidence.")