In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Plot styling: apply seaborn's default theme to every figure in the notebook.
sns.set_theme()

# Display options for wide frames (up to 200 columns, 120 characters per line).
for _option, _value in (('display.max_columns', 200), ('display.width', 120)):
    pd.set_option(_option, _value)

# Input dataset and the folder that receives the derived CSV files.
DATA_PATH = Path('Human_Development_Index_Dataset.csv')
OUTPUT_DIR = Path('.')

In [None]:
def load_hdi_dataset(path: Path) -> pd.DataFrame:
    """Load the HDI CSV, falling back through encodings when decoding fails.

    The file is first read as UTF-8 (the pandas default). If that raises a
    ``UnicodeDecodeError`` the read is retried as cp1252 and finally as latin1.
    cp1252 is tried before latin1 because latin1 can decode any byte sequence,
    so an encoding listed after it would never be reached.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed dataset, exactly as returned by ``pd.read_csv``.

    Raises
    ------
    UnicodeDecodeError
        If every candidate encoding fails to decode the file.
    Exception
        Any non-decoding error from ``pd.read_csv`` (for example
        ``FileNotFoundError``) propagates immediately instead of being retried
        with the remaining encodings.
    """
    last_err = None
    for enc in ('utf-8', 'cp1252', 'latin1'):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError as err:
            # Only a decoding failure justifies trying the next encoding.
            last_err = err
    raise last_err


def clean_hdi_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of an HDI frame; the input frame is not modified.

    Steps (each column-specific step is skipped when the column is absent):

    1. Replace placeholder strings for missing data with NaN.
    2. Strip surrounding whitespace from ``country``; missing names stay NaN.
    3. Convert ``year`` to the nullable ``Int64`` dtype.
    4. Coerce the known metric columns to numeric (unparseable values -> NaN).
    5. Drop exact duplicate rows.

    Parameters
    ----------
    df : pd.DataFrame
        Raw or partially cleaned HDI data.

    Returns
    -------
    pd.DataFrame
        A new, cleaned frame.
    """
    df = df.copy()

    # Normalize common missing value markers ('..' is also handled by the
    # per-problem cleaning cells further down the notebook).
    missing_markers = ['NA', 'N/A', '–', '—', '-', '..', '']
    df = df.replace(missing_markers, np.nan)

    # Standardize country names
    if 'country' in df.columns:
        country = df['country']
        # astype(str) alone would turn a missing name into the literal text
        # 'nan', hiding it from isna()/dropna(); restore NaN for those rows.
        df['country'] = country.astype(str).str.strip().where(country.notna())

    # Convert year
    if 'year' in df.columns:
        df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

    # Coerce numeric columns
    numeric_candidates = [
        'hdi',
        'life_expectancy',
        'expec_yr_school',
        'mean_yr_school',
        'gross_inc_percap',
        'gender_development',
        'gender_inequality',
        'pop_millions',
    ]
    for col in numeric_candidates:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop duplicates (exact row duplicates)
    df = df.drop_duplicates()

    return df

In [None]:
# Read the raw CSV once, then derive the cleaned frame used by every later cell.
df_raw = load_hdi_dataset(DATA_PATH)
df = clean_hdi_df(df_raw)

# Quick sanity summary of what was loaded.
print('Shape:', df.shape)
print('Year range:', int(df['year'].min()), 'to', int(df['year'].max()))
print('Columns:', df.columns.tolist())
print('Has Region column:', 'region' in (name.lower() for name in df.columns))

## Problem 1A — Single Year HDI Exploration (Latest Year: 2022)

In [None]:
# ============================================================
# Problem 1A: Single Year HDI Exploration (Latest Year: 2022)
# ============================================================
# Builds `hdi_2022_df` from the cleaned frame `df`, cleans it further and
# prints summary statistics. The frame is re-derived from `df` at the top of
# the cell, so re-running the cell produces the same result.

# ---------------------------
# Task 1: Extract Latest Year
# ---------------------------
# Identify unique years in the dataset
unique_years = sorted(df['year'].dropna().unique().tolist())
print('Unique years in dataset:', unique_years)

# Filter dataset to include only 2022
hdi_2022_df = df[df['year'] == 2022].copy()
print(f'\nFiltered to year 2022: {len(hdi_2022_df)} rows')

# ---------------------------
# Task 2: Data Exploration
# ---------------------------
# Display first 10 rows
print('\n--- First 10 rows of 2022 dataset ---')
display(hdi_2022_df.head(10))

# Count rows and columns
num_rows, num_cols = hdi_2022_df.shape
print(f'\nNumber of rows: {num_rows}')
print(f'Number of columns: {num_cols}')

# List all column names and data types
print('\n--- Column names and data types ---')
display(hdi_2022_df.dtypes)

# ---------------------------
# Task 3: Missing Values & Data Cleaning
# ---------------------------
print('\n--- Missing values per column ---')
missing_counts = hdi_2022_df.isna().sum()
display(missing_counts[missing_counts > 0].sort_values(ascending=False))
print(f'Total missing values: {missing_counts.sum()}')

# Inspect for issues:
# 1. Special characters representing missing data (e.g., "\u2013", "\u2014", "-")
print('\n--- Replacing special characters with NaN ---')
special_chars = ['\u2013', '\u2014', '-', '..', 'N/A', 'NA', '']
hdi_2022_df = hdi_2022_df.replace(special_chars, np.nan)

# 2. Numeric columns stored as text - convert them
print('--- Converting numeric columns ---')
numeric_cols = ['hdi', 'life_expectancy', 'expec_yr_school', 'mean_yr_school', 'gross_inc_percap']
for col in numeric_cols:
    if col in hdi_2022_df.columns:
        hdi_2022_df[col] = pd.to_numeric(hdi_2022_df[col], errors='coerce')

# 3. Inconsistent country names - strip whitespace
if 'country' in hdi_2022_df.columns:
    # astype(str) alone would turn a missing name into the literal text 'nan';
    # .where(notna) keeps those rows as NaN so they still count as missing.
    hdi_2022_df['country'] = (
        hdi_2022_df['country'].astype(str).str.strip()
        .where(hdi_2022_df['country'].notna())
    )

# 4. Duplicate rows
duplicates_before = hdi_2022_df.duplicated().sum()
print(f'Duplicate rows found: {duplicates_before}')
hdi_2022_df = hdi_2022_df.drop_duplicates()
print(f'Duplicates removed. Rows remaining: {len(hdi_2022_df)}')

# 5. Handle missing values
# Decision: Drop rows where HDI is missing (core variable for analysis)
# Justification: HDI is the primary metric; rows without it cannot be analyzed meaningfully
rows_before = len(hdi_2022_df)
hdi_2022_df = hdi_2022_df.dropna(subset=['hdi'])
rows_after = len(hdi_2022_df)
print(f'\nDropped {rows_before - rows_after} rows with missing HDI values.')
print(f'Final cleaned dataset: {rows_after} rows')

# ---------------------------
# Task 4: Basic Statistics
# ---------------------------
print('\n--- Basic Statistics for HDI (2022) ---')
hdi_mean = hdi_2022_df['hdi'].mean()
hdi_median = hdi_2022_df['hdi'].median()
hdi_std = hdi_2022_df['hdi'].std()
print(f'Mean HDI: {hdi_mean:.4f}')
print(f'Median HDI: {hdi_median:.4f}')
print(f'Standard Deviation: {hdi_std:.4f}')

if hdi_2022_df.empty:
    # idxmax()/idxmin() raise on an empty Series, so report the situation instead.
    print('\nNo rows with a valid HDI for 2022; cannot identify highest/lowest countries.')
else:
    # Country with highest HDI
    highest_hdi_row = hdi_2022_df.loc[hdi_2022_df['hdi'].idxmax()]
    print(f'\nCountry with HIGHEST HDI: {highest_hdi_row["country"]} (HDI = {highest_hdi_row["hdi"]:.3f})')

    # Country with lowest HDI
    lowest_hdi_row = hdi_2022_df.loc[hdi_2022_df['hdi'].idxmin()]
    print(f'Country with LOWEST HDI: {lowest_hdi_row["country"]} (HDI = {lowest_hdi_row["hdi"]:.3f})')

In [None]:
# ---------------------------
# Task 5: Filtering and Sorting
# ---------------------------
print('--- Countries with HDI > 0.800, sorted by GNI per Capita (descending) ---')

# Keep only the rows whose HDI exceeds 0.800
hdi_high = hdi_2022_df.loc[hdi_2022_df['hdi'].gt(0.800)].copy()
print(f'Countries with HDI > 0.800: {hdi_high.shape[0]}')

# Order the subset from richest to poorest by GNI per capita
hdi_high_sorted = hdi_high.sort_values(by='gross_inc_percap', ascending=False)

# Show the ten richest of those countries
print('\nTop 10 countries by GNI per Capita (among HDI > 0.800):')
display(hdi_high_sorted.loc[:, ['country', 'hdi', 'gross_inc_percap']].head(10))

# ---------------------------
# Task 6: Adding HDI Category Column
# ---------------------------
print('\n--- Adding HDI Category Column ---')

# UNDP classification used for the new column:
#   Low:       HDI < 0.550
#   Medium:    0.550 <= HDI < 0.700
#   High:      0.700 <= HDI < 0.800
#   Very High: HDI >= 0.800

def classify_hdi(hdi_value):
    """Map an HDI value to its category label.

    Returns 'Low' below 0.550, 'Medium' below 0.700, 'High' below 0.800 and
    'Very High' otherwise. A missing value is returned as NaN.
    """
    if pd.isna(hdi_value):
        return np.nan
    # Each tier is (exclusive upper bound, label), checked in ascending order.
    tiers = ((0.550, 'Low'), (0.700, 'Medium'), (0.800, 'High'))
    for upper_bound, label in tiers:
        if hdi_value < upper_bound:
            return label
    return 'Very High'

# Label every 2022 row with its HDI category.
hdi_2022_df['HDI Category'] = hdi_2022_df['hdi'].apply(classify_hdi)

# Verify classification: how many countries landed in each category
print('HDI Category distribution:')
display(hdi_2022_df['HDI Category'].value_counts(dropna=False))

# Spot-check the first country found in each category
print('\nSample verification (one from each category):')
for cat in ('Low', 'Medium', 'High', 'Very High'):
    sample = hdi_2022_df.loc[hdi_2022_df['HDI Category'].eq(cat)]
    if sample.empty:
        continue
    row = sample.iloc[0]
    print(f"  {cat}: {row['country']} (HDI = {row['hdi']:.3f})")

# Confirm the new column is part of the frame
print('\nColumns in final dataframe:')
print(hdi_2022_df.columns.tolist())

# Write the categorized frame to disk
output_path = OUTPUT_DIR / 'HDI category added.csv'
hdi_2022_df.to_csv(output_path, index=False)
print(f'\nSaved: {output_path.resolve()}')

## Problem 1B — HDI Visualization and Trend Analysis (2020–2022)

In [None]:
# ============================================================
# Problem 1B: HDI Visualization and Trend Analysis (2020–2022)
# ============================================================
# Builds `hdi_3yr` (rows for 2020-2022) from the cleaned frame `df`, saves the
# unprocessed extract, then cleans it for the charts in the following cells.

# ---------------------------
# Task 1: Data Extraction and Saving
# ---------------------------
# Filter dataset to include only years 2020, 2021, and 2022
hdi_3yr = df[df['year'].isin([2020, 2021, 2022])].copy()
print(f'Filtered to years 2020-2022: {len(hdi_3yr)} rows')

# Save the filtered dataset
out_path_1b = OUTPUT_DIR / 'HDI problem1B.csv'
hdi_3yr.to_csv(out_path_1b, index=False)
print(f'Saved: {out_path_1b.resolve()}')

# ---------------------------
# Task 2: Data Cleaning
# ---------------------------
print('\n--- Data Cleaning ---')

# Check for missing values in essential columns
essential_cols = ['hdi', 'country', 'year']
print('Missing values in essential columns:')
for col in essential_cols:
    missing = hdi_3yr[col].isna().sum()
    print(f'  {col}: {missing}')

# Replace special characters representing missing data
special_chars = ['–', '—', '-', '..', 'N/A', 'NA', '']
hdi_3yr = hdi_3yr.replace(special_chars, np.nan)
print('\nReplaced special characters with NaN')

# Convert numeric columns stored as text
numeric_cols = ['hdi', 'life_expectancy', 'expec_yr_school', 'mean_yr_school', 'gross_inc_percap']
for col in numeric_cols:
    if col in hdi_3yr.columns:
        hdi_3yr[col] = pd.to_numeric(hdi_3yr[col], errors='coerce')
print('Converted numeric columns to proper data types')

# Standardize country names (strip whitespace).
# astype(str) alone would turn a missing name into the literal text 'nan', so
# the dropna() on essential columns below could never remove such a row;
# .where(notna) keeps missing names as NaN.
hdi_3yr['country'] = (
    hdi_3yr['country'].astype(str).str.strip()
    .where(hdi_3yr['country'].notna())
)
print('Standardized country names (stripped whitespace)')

# Check and remove duplicates
duplicates = hdi_3yr.duplicated().sum()
print(f'Duplicate rows found: {duplicates}')
hdi_3yr = hdi_3yr.drop_duplicates()

# Handle missing values in essential columns
# Justification: Rows without HDI, country, or year cannot be meaningfully analyzed
rows_before = len(hdi_3yr)
hdi_3yr = hdi_3yr.dropna(subset=essential_cols)
rows_after = len(hdi_3yr)
print(f'\nDropped {rows_before - rows_after} rows with missing essential values')
print(f'Final cleaned dataset: {rows_after} rows')

display(hdi_3yr.head())

In [None]:
# ---------------------------
# Task 3A: Line Chart — HDI Trend (Country-Level)
# ---------------------------
# Five countries chosen for the trend comparison
countries = ['Nepal', 'India', 'China', 'United States', 'Norway']
line_data = hdi_3yr.loc[hdi_3yr['country'].isin(countries)]

# One line per country, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=line_data, x='year', y='hdi', hue='country', marker='o', ax=ax)
ax.set_title('HDI Trends (2020–2022) — Selected Countries')
ax.set_xlabel('Year')
ax.set_ylabel('Human Development Index (HDI)')
ax.legend(title='Country', bbox_to_anchor=(1.02, 1), loc='upper left')
ax.grid(True)
# Tick only at the years actually present in the data.
ax.set_xticks(sorted(line_data['year'].unique()))
fig.tight_layout()
plt.show()

print('Selected countries:', countries)
print(
    '\nInterpretation: The line chart shows HDI trends for 5 selected countries.',
    'Norway consistently has the highest HDI, while developing nations show gradual improvement.',
    sep='\n',
)

In [None]:
# ---------------------------
# Task 3B: Bar Chart — Average HDI by Region (2020–2022)
# ---------------------------
# Dataset does not have a Region column, so we create one using ISO3 codes

# Continent membership, written as whitespace-separated ISO3 codes per region.
_REGION_MEMBERS = {
    "Asia": (
        "AFG ARM AZE BHR BGD BTN BRN KHM CHN CYP "
        "GEO IND IDN IRN IRQ ISR JPN JOR KAZ KWT "
        "KGZ LAO LBN MYS MDV MNG MMR NPL PRK OMN "
        "PAK PHL QAT SAU SGP KOR LKA SYR TJK THA "
        "TUR TKM ARE UZB VNM YEM PSE"
    ),
    "Europe": (
        "ALB AUT BEL BGR BIH BLR CHE CZE "
        "DEU DNK ESP EST FIN FRA GBR GRC "
        "HRV HUN IRL ISL ITA LTU LUX LVA "
        "MDA MKD MLT MNE NLD NOR POL PRT "
        "ROU RUS SRB SVK SVN SWE UKR"
    ),
    "Africa": (
        "DZA AGO BEN BFA BWA CAF CIV CMR "
        "COD COG COM DJI EGY ERI ETH GAB "
        "GHA GIN GMB GNB KEN LBR LBY LSO "
        "MAR MDG MLI MOZ MRT MUS MWI NAM "
        "NER NGA RWA SDN SEN SLE SOM SSD "
        "STP TCD TGO TUN TZA UGA ZAF ZMB "
        "ZWE CPV GNQ SWZ SYC"
    ),
    "Americas": (
        "ATG ARG BHS BLZ BOL BRA BRB CAN "
        "CHL COL CRI CUB DMA DOM ECU GRD "
        "GTM GUY HND HTI JAM MEX NIC PAN "
        "PER PRY SLV SUR TTO USA URY VEN "
        "KNA LCA VCT"
    ),
    "Oceania": (
        "AUS FJI FSM KIR MHL NRU NZL PLW "
        "PNG SLB TON TUV VUT WSM"
    ),
}

# Flatten to the {ISO3 code: region} lookup used when mapping the `iso3` column.
iso3_to_region = {
    code: region_name
    for region_name, codes in _REGION_MEMBERS.items()
    for code in codes.split()
}

# Map ISO3 codes to regions (adds/overwrites the 'region' column on hdi_3yr)
hdi_3yr['region'] = hdi_3yr['iso3'].map(iso3_to_region)

# Check for unmapped countries.
# Count distinct ISO3 codes, not rows: each country appears once per year, so
# a row count would overstate the number of unmapped countries.
missing_iso = hdi_3yr.loc[hdi_3yr['region'].isna(), 'iso3'].unique()
unmapped = len(missing_iso)
print(f'Countries without region mapping: {unmapped}')
if unmapped > 0:
    print(f'Missing ISO3 codes: {missing_iso[:10]}...' if len(missing_iso) > 10 else f'Missing ISO3 codes: {missing_iso}')

# Group by region and year, then compute mean HDI (unmapped rows are excluded)
region_avg = (
    hdi_3yr.dropna(subset=['region'])
    .groupby(['region', 'year'], as_index=False)['hdi']
    .mean()
    .rename(columns={'hdi': 'avg_hdi'})
)

# Grouped bars: one cluster per region, one bar per year
plt.figure(figsize=(12, 6))
sns.barplot(data=region_avg, x='region', y='avg_hdi', hue='year', palette='Set2')
plt.title('Average HDI by Continent (2020–2022)')
plt.xlabel('Continent')
plt.ylabel('Average HDI')
plt.legend(title='Year')
plt.tight_layout()
plt.show()

print('\nInterpretation:')
print('- Europe and Oceania have the highest average HDI values.')
print('- Africa has the lowest average HDI, reflecting development challenges.')
print('- Asia shows moderate HDI with significant variation among countries.')

In [None]:
# ---------------------------
# Task 3B: Box Plot — HDI Distribution for 2020, 2021, and 2022
# ---------------------------
# One box per year, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=hdi_3yr, x='year', y='hdi', palette='Set2', ax=ax)
ax.set_title('HDI Distribution by Year (2020–2022)')
ax.set_xlabel('Year')
ax.set_ylabel('Human Development Index (HDI)')
fig.tight_layout()
plt.show()

print('\n'.join((
    'Interpretation:',
    '- The box plot shows HDI distribution spread for each year.',
    '- The median (center line) indicates the typical HDI value.',
    '- The IQR (box) shows where the middle 50% of countries fall.',
    '- Outliers (dots) represent countries with unusually high or low HDI.',
    '- Similar distributions across years suggest stable global HDI patterns during 2020-2022.',
)))

In [None]:
# ---------------------------
# Task 3B: Scatter Plot — HDI vs. GNI per Capita
# ---------------------------
if 'gross_inc_percap' in hdi_3yr.columns:
    # Only rows with both values can be plotted and correlated.
    scatter_df = hdi_3yr.dropna(subset=['gross_inc_percap', 'hdi']).copy()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.regplot(data=scatter_df, x='gross_inc_percap', y='hdi',
                scatter_kws={'alpha': 0.4}, line_kws={'color': 'red'}, ax=ax)
    ax.set_title('HDI vs GNI per Capita (2020–2022)')
    ax.set_xlabel('GNI per Capita (gross_inc_percap)')
    ax.set_ylabel('Human Development Index (HDI)')
    fig.tight_layout()
    plt.show()

    # Pearson correlation between the two plotted variables
    corr = scatter_df['hdi'].corr(scatter_df['gross_inc_percap'])
    print(f'Pearson Correlation (HDI vs GNI): {corr:.4f}')
    print('\n'.join((
        '\nInterpretation:',
        '- Positive correlation indicates higher GNI is associated with higher HDI.',
        '- The regression line shows the general trend.',
        '- Scatter around the line shows variation in how income translates to development.',
    )))
else:
    print('GNI per Capita variable not available in the dataset.')

In [None]:
# ---------------------------
# Task 4: Short Analysis Questions
# ---------------------------
print('=' * 60, 'SHORT ANALYSIS QUESTIONS', '=' * 60, sep='\n')

# Q1: Which countries show the greatest improvement in HDI from 2020 to 2022?
print('\n--- Q1: Countries with Greatest HDI Improvement (2020→2022) ---')
# Wide table: one row per country, one column per year.
pivot = hdi_3yr.pivot_table(index='country', columns='year', values='hdi', aggfunc='mean')

if all(yr in pivot.columns for yr in (2020, 2022)):
    pivot['change'] = pivot[2022].sub(pivot[2020])
    improvements = pivot['change'].dropna().sort_values(ascending=False)

    print('Top 10 countries with greatest improvement:')
    display(improvements.head(10))
else:
    print('Cannot compute: missing 2020 or 2022 data')

# Q2: Did any countries experience a decline in HDI?
print('\n--- Q2: Countries with HDI Decline ---')
if 'change' in pivot.columns:
    # Most negative change first.
    declines = pivot.loc[pivot['change'] < 0, 'change'].sort_values()
    if declines.empty:
        print('No countries experienced HDI decline.')
    else:
        print(f'Countries with declining HDI: {len(declines)}')
        display(declines.head(10))
        print('\n'.join((
            '\nPossible reasons for decline:',
            '- COVID-19 pandemic impacts on health and education',
            '- Economic disruptions and increased poverty',
            '- Political instability or conflict',
            '- Natural disasters affecting infrastructure',
        )))

# Q3: Which region has highest and lowest average HDI?
print('\n--- Q3: Regional HDI Analysis ---')
# Relies on the 'region' column derived from ISO3 codes in the bar-chart cell.
if 'region' in hdi_3yr.columns:
    region_avg = (
        hdi_3yr.loc[hdi_3yr['region'].notna()]
        .groupby('region', as_index=False)['hdi']
        .mean()
        .sort_values('hdi', ascending=False)
    )
    display(region_avg)
    print(f'\nHighest average HDI region: {region_avg["region"].iloc[0]}')
    print(f'Lowest average HDI region: {region_avg["region"].iloc[-1]}')
else:
    print('Region column not available in dataset.')

# Q4: Impact of COVID-19 on HDI trends
print('\n'.join((
    '\n--- Q4: COVID-19 Impact Discussion ---',
    'The COVID-19 pandemic (2020-2022) may have affected HDI trends through:',
    '1. HEALTH: Reduced life expectancy due to pandemic deaths and healthcare strain',
    '2. EDUCATION: School closures affecting expected/mean years of schooling',
    '3. INCOME: Economic recession reducing GNI per capita',
    '4. INEQUALITY: Widening gaps between developed and developing nations',
    '5. RECOVERY: Some countries showed resilience while others faced prolonged effects',
)))

## Problem 2 — Advanced HDI Exploration (South Asia)

In [None]:
# ============================================================
# Problem 2: Advanced HDI Exploration (South Asia)
# ============================================================

# ---------------------------
# Task 1: Create South Asia Subset
# ---------------------------
south_asia = ['Afghanistan', 'Bangladesh', 'Bhutan', 'India',
              'Maldives', 'Nepal', 'Pakistan', 'Sri Lanka']

# Select the South Asian rows (all years) and run them through the shared cleaner.
south_asia_df = clean_hdi_df(df.loc[df['country'].isin(south_asia)].copy())

print(f'South Asia subset: {south_asia_df.shape[0]} rows')
print(f'Countries included: {south_asia_df["country"].unique().tolist()}')

# Write the subset to disk
out_path_sa = OUTPUT_DIR / 'HDI SouthAsia.csv'
south_asia_df.to_csv(out_path_sa, index=False)
print(f'Saved: {out_path_sa.resolve()}')

display(south_asia_df.head())

In [None]:
# ---------------------------
# Task 2: Composite Development Score
# ---------------------------
print('--- Composite Development Score ---')
print('Formula: Composite Score = 0.30 × Life Expectancy + 0.30 × GNI per Capita')

# Both input columns must exist before the score can be computed.
if {'life_expectancy', 'gross_inc_percap'}.issubset(south_asia_df.columns):
    # Score each row that has all three values present.
    comp_df = south_asia_df.dropna(subset=['life_expectancy', 'gross_inc_percap', 'hdi']).copy()
    comp_df['Composite_Score'] = (
        0.30 * comp_df['life_expectancy'] +
        0.30 * comp_df['gross_inc_percap']
    )

    # Per-country averages of HDI and the composite score
    south_asia_avg = comp_df.groupby('country', as_index=False)[['hdi', 'Composite_Score']].mean()

    # Composite rank: position after sorting by score, best first
    south_asia_avg = south_asia_avg.sort_values('Composite_Score', ascending=False)
    south_asia_avg['Composite_Rank'] = np.arange(1, len(south_asia_avg) + 1)

    # HDI rank: ties share the smallest rank
    south_asia_avg['HDI_Rank'] = south_asia_avg['hdi'].rank(ascending=False, method='min').astype(int)

    print('\nSouth Asian Countries Ranked by Composite Score:')
    display(south_asia_avg)

    # Horizontal bar chart of the five best composite scores
    top5 = south_asia_avg.head(5)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(data=top5, y='country', x='Composite_Score', orient='h', palette='viridis', ax=ax)
    ax.set_title('Top 5 South Asian Countries by Composite Score')
    ax.set_xlabel('Composite Score')
    ax.set_ylabel('Country')
    fig.tight_layout()
    plt.show()

    # Positive difference = country ranks worse on the composite score than on HDI
    print('\n--- Comparison: Composite Score Rank vs HDI Rank ---')
    south_asia_avg['Rank_Difference'] = south_asia_avg['Composite_Rank'] - south_asia_avg['HDI_Rank']
    display(south_asia_avg[['country', 'Composite_Rank', 'HDI_Rank', 'Rank_Difference']])

    print('\n'.join((
        '\nDiscussion:',
        '- Composite Score rankings may differ from HDI because the formula excludes education.',
        '- Countries with high income but lower education may rank higher on Composite Score.',
        '- HDI uses a geometric mean of all three dimensions, balancing the components.',
    )))
else:
    print('Required columns (life_expectancy, gross_inc_percap) not available.')

In [None]:
# ---------------------------
# Task 3: Outlier Detection (1.5 × IQR Rule)
# ---------------------------
print('--- Outlier Detection using 1.5 × IQR Rule ---')

# Work only with rows that have both values.
outlier_df = south_asia_df.dropna(subset=['hdi', 'gross_inc_percap']).copy()

# IQR fences for HDI
Q1_hdi, Q3_hdi = outlier_df['hdi'].quantile([0.25, 0.75])
IQR_hdi = Q3_hdi - Q1_hdi
hdi_lower = Q1_hdi - 1.5 * IQR_hdi
hdi_upper = Q3_hdi + 1.5 * IQR_hdi

# IQR fences for GNI per Capita
Q1_gni, Q3_gni = outlier_df['gross_inc_percap'].quantile([0.25, 0.75])
IQR_gni = Q3_gni - Q1_gni
gni_lower = Q1_gni - 1.5 * IQR_gni
gni_upper = Q3_gni + 1.5 * IQR_gni

print(f'HDI bounds: [{hdi_lower:.3f}, {hdi_upper:.3f}]')
print(f'GNI bounds: [{gni_lower:.0f}, {gni_upper:.0f}]')

# A row is an outlier when either value lies outside its (inclusive) fences.
outlier_df['Outlier'] = (
    ~outlier_df['hdi'].between(hdi_lower, hdi_upper)
    | ~outlier_df['gross_inc_percap'].between(gni_lower, gni_upper)
)

# List the flagged rows
outliers = outlier_df.loc[outlier_df['Outlier']]
print(f'\nOutliers detected: {len(outliers)} rows')
display(outliers[['country', 'year', 'hdi', 'gross_inc_percap']])

# Scatter plot: outliers in red, everything else in blue
fig, ax = plt.subplots(figsize=(10, 6))
colors = np.where(outlier_df['Outlier'], 'tab:red', 'tab:blue')
ax.scatter(outlier_df['gross_inc_percap'], outlier_df['hdi'],
           c=colors, alpha=0.6, edgecolors='black', linewidth=0.5)
ax.set_xlabel('GNI per Capita')
ax.set_ylabel('HDI')
ax.set_title('South Asia: GNI per Capita vs HDI (Outliers in Red)')
fig.tight_layout()
plt.show()

print('\n'.join((
    '\nDiscussion: Why these countries stand out as outliers:',
    '- Maldives: High GNI relative to other South Asian countries (tourism-based economy)',
    '- Afghanistan: Low HDI due to conflict and instability',
    '- Countries at extremes of either dimension are flagged as outliers',
)))

In [None]:
# ---------------------------
# Task 4: Exploring Metric Relationships
# ---------------------------
# Correlates two metrics with HDI for the South Asia subset, plots each pair
# with a regression line, and reports which relationship is strongest/weakest.
print('--- Exploring Metric Relationships with HDI ---')

# Select two metrics
metrics = ['gender_development', 'life_expectancy']
rel_df = south_asia_df.dropna(subset=['hdi']).copy()

# Compute Pearson correlations (needs at least 3 paired observations per metric)
print('\nPearson Correlations with HDI:')
correlations = {}
for m in metrics:
    if m in rel_df.columns:
        subset = rel_df.dropna(subset=[m])
        if len(subset) >= 3:
            corr = subset['hdi'].corr(subset[m], method='pearson')
            correlations[m] = corr
            print(f'  {m}: r = {corr:.4f}')
        else:
            print(f'  {m}: Not enough data')
    else:
        print(f'  {m}: Column not available')

# Create scatter plots with trendlines
for m in metrics:
    if m in rel_df.columns:
        subset = rel_df.dropna(subset=[m])
        if len(subset) >= 3:
            plt.figure(figsize=(8, 5))
            sns.regplot(data=subset, x=m, y='hdi', scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red'})
            plt.title(f'South Asia: HDI vs {m}')
            plt.xlabel(m)
            plt.ylabel('HDI')
            plt.tight_layout()
            plt.show()

# Discussion
print('\n--- Discussion ---')
# Strength of a linear relationship is |r|: a strongly negative correlation is
# a strong relationship, so compare magnitudes rather than signed values.
# Undefined (NaN) correlations, e.g. from a constant column, are left out.
valid_correlations = {name: r for name, r in correlations.items() if pd.notna(r)}
if valid_correlations:
    strongest = max(valid_correlations, key=lambda name: abs(valid_correlations[name]))
    weakest = min(valid_correlations, key=lambda name: abs(valid_correlations[name]))
    print(f'Strongest relationship with HDI: {strongest} (r = {correlations[strongest]:.4f})')
    print(f'Weakest relationship with HDI: {weakest} (r = {correlations[weakest]:.4f})')
    print('\nLife expectancy typically shows strong correlation as it is a direct HDI component.')
    print('Gender development may show weaker correlation due to complex social factors.')

In [None]:
# ---------------------------
# Task 5: Gap Analysis
# ---------------------------
print('--- Gap Analysis: GNI_HDI_Gap ---')
print('Formula: GNI_HDI_Gap = gross_inc_percap - hdi')

# Row-level gap for every observation that has both values
gap_df = (
    south_asia_df
    .dropna(subset=['gross_inc_percap', 'hdi'])
    .assign(GNI_HDI_Gap=lambda frame: frame['gross_inc_percap'] - frame['hdi'])
)

# Mean gap per country, largest first
gap_by_country = (
    gap_df.groupby('country', as_index=False)['GNI_HDI_Gap'].mean()
    .sort_values('GNI_HDI_Gap', ascending=False)
)

print('\nSouth Asian Countries Ranked by GNI_HDI_Gap:')
display(gap_by_country)

# First three rows of the ranking = largest gaps
top_positive = gap_by_country.iloc[:3].assign(group='Top 3 Positive')

# Last three rows of the ranking = smallest gaps
top_negative = gap_by_country.iloc[-3:].assign(group='Top 3 Negative')

# Stack both groups for a single chart
plot_data = pd.concat([top_positive, top_negative], ignore_index=True)

# Horizontal bars with a dashed reference line at zero
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=plot_data, y='country', x='GNI_HDI_Gap', hue='group', orient='h', ax=ax)
ax.set_title('Top 3 Positive and Negative GNI–HDI Gaps (South Asia)')
ax.set_xlabel('GNI_HDI_Gap (GNI per Capita - HDI)')
ax.set_ylabel('Country')
ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
ax.legend(title='Gap Type')
fig.tight_layout()
plt.show()

print('\n'.join((
    '\n--- Discussion: Implications of GNI-HDI Gap ---',
    'POSITIVE GAP (High GNI, lower HDI than expected):',
    '- Country has economic resources but may not be investing effectively in health/education',
    '- Possible inequality: wealth concentrated among few, not translating to human development',
    '- Example: Maldives has high tourism income but may lag in education metrics',
    '\nNEGATIVE GAP (Low GNI, higher HDI than expected):',
    '- Country achieves good human development despite lower income',
    '- Effective social policies in health and education',
    '- Example: Sri Lanka historically known for strong education/health despite modest income',
)))

## Problem 3 — Comparative Regional Analysis: South Asia vs Middle East (2020–2022)

In [None]:
# ============================================================
# Problem 3: Comparative Regional Analysis (South Asia vs Middle East)
# ============================================================

# ---------------------------
# Task 1: Create Middle East and South Asia Subsets
# ---------------------------
# Country lists for the two regions being compared
south_asia = ['Afghanistan', 'Bangladesh', 'Bhutan', 'India',
              'Maldives', 'Nepal', 'Pakistan', 'Sri Lanka']

# Middle East countries - names written the way the dataset spells them
middle_east = [
    'Bahrain',
    'Iran (Islamic Republic of)',  # full UN name
    'Iraq',
    'Israel',
    'Jordan',
    'Kuwait',
    'Lebanon',
    'Oman',
    'Palestine, State of',  # full UN name
    'Qatar',
    'Saudi Arabia',
    'Syrian Arab Republic',  # full UN name
    'United Arab Emirates',
    'Yemen',
]

# Both subsets come from hdi_3yr, the cleaned 2020-2022 frame from Problem 1B.
print('Using Problem 1B dataset (2020-2022)')
print(f'Total rows in hdi_3yr: {hdi_3yr.shape[0]}')

# South Asia rows
sa_2020_2022 = hdi_3yr.loc[hdi_3yr['country'].isin(south_asia)].copy()
print(f'\nSouth Asia subset: {sa_2020_2022.shape[0]} rows')
print(f'Countries found: {sa_2020_2022["country"].unique().tolist()}')

# Middle East rows
me_2020_2022 = hdi_3yr.loc[hdi_3yr['country'].isin(middle_east)].copy()
print(f'\nMiddle East subset: {me_2020_2022.shape[0]} rows')
print(f'Countries found: {me_2020_2022["country"].unique().tolist()}')

# Write both subsets to disk (before the RegionGroup label is added)
out_sa = OUTPUT_DIR / 'HDI SouthAsia 2020 2022.csv'
out_me = OUTPUT_DIR / 'HDI MiddleEast 2020 2022.csv'
sa_2020_2022.to_csv(out_sa, index=False)
me_2020_2022.to_csv(out_me, index=False)
print(f'\nSaved: {out_sa.resolve()}', f'Saved: {out_me.resolve()}', sep='\n')

# Tag each subset with its region, then stack them for the comparison cells
sa_2020_2022['RegionGroup'] = 'South Asia'
me_2020_2022['RegionGroup'] = 'Middle East'
combined = pd.concat([sa_2020_2022, me_2020_2022], ignore_index=True)
print(f'\nCombined dataset: {combined.shape[0]} rows')
display(combined.head())

In [None]:
# ---------------------------
# Task 2: Descriptive Statistics
# ---------------------------
print('--- Descriptive Statistics: HDI by Region (2020–2022) ---')

# Mean, standard deviation and row count of HDI per region, best mean first
stats = (
    combined.groupby('RegionGroup')['hdi']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .rename(columns={
        'RegionGroup': 'Region',
        'mean': 'Mean HDI',
        'std': 'Std Dev',
        'count': 'Count',
    })
    .sort_values('Mean HDI', ascending=False)
)

print('\nHDI Statistics by Region:')
display(stats)

# The first row after sorting is the better-performing region
best_region = stats['Region'].iloc[0]
best_mean = stats['Mean HDI'].iloc[0]
print(f'\n** Region performing BETTER on average: {best_region} (Mean HDI = {best_mean:.4f}) **')

# Additional interpretation
print('\nInterpretation:')
print(f'- {stats["Region"].iloc[0]} has a higher average HDI ({stats["Mean HDI"].iloc[0]:.4f})')
print(f'- {stats["Region"].iloc[1]} has an average HDI of {stats["Mean HDI"].iloc[1]:.4f}')
diff = abs(stats['Mean HDI'].iloc[0] - stats['Mean HDI'].iloc[1])
print(f'- The difference between regions is {diff:.4f} HDI points')

In [None]:
# ---------------------------
# Task 3: Top and Bottom Performers
# ---------------------------
print('--- Top 3 and Bottom 3 Countries by HDI in Each Region ---')

# Average HDI per country across the rows in `combined`.
# A country whose HDI is missing in every row gets a NaN mean; drop it so it
# cannot be sorted to the end and mis-reported as a "bottom" performer.
country_avg = (
    combined
    .groupby(['RegionGroup', 'country'], as_index=False)['hdi']
    .mean()
    .rename(columns={'hdi': 'avg_hdi'})
    .dropna(subset=['avg_hdi'])
)

# Collect top 3 / bottom 3 per region, and keep each region's full ranking so
# the interpretation below can be derived from the data.
top3_list = []
bot3_list = []
region_rankings = {}

for region in ['South Asia', 'Middle East']:
    region_data = (
        country_avg[country_avg['RegionGroup'] == region]
        .sort_values('avg_hdi', ascending=False)
    )
    region_rankings[region] = region_data

    top3 = region_data.head(3).copy()
    top3['Performer'] = 'Top 3'
    top3_list.append(top3)

    # Pick the bottom 3 from what remains after the top 3, so a region with
    # fewer than six ranked countries never lists a country in both groups.
    bot3 = region_data.iloc[3:].tail(3).copy()
    bot3['Performer'] = 'Bottom 3'
    bot3_list.append(bot3)

top3_df = pd.concat(top3_list, ignore_index=True)
bot3_df = pd.concat(bot3_list, ignore_index=True)

print('\nTop 3 Performers by Region:')
display(top3_df)

print('\nBottom 3 Performers by Region:')
display(bot3_df)

# Combined for plotting
compare = pd.concat([top3_df, bot3_df], ignore_index=True)

# Horizontal bar chart of the selected countries, coloured by region.
# dodge=False: each country belongs to one region, so hue is used only for
# colour; dodging would shrink every bar and shift it away from its label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=compare, x='avg_hdi', y='country', hue='RegionGroup',
    orient='h', dodge=False, ax=ax,
)
ax.set_title('Top 3 and Bottom 3 HDI Performers by Region (2020–2022)')
ax.set_xlabel('Average HDI (2020–2022)')
ax.set_ylabel('Country')
ax.legend(title='Region', bbox_to_anchor=(1.02, 1), loc='upper left')
fig.tight_layout()
plt.show()

# Interpretation is generated from the computed rankings rather than
# hard-coded country names, so it always matches the tables and chart above.
print('\nInterpretation:')
print('- The chart compares top and bottom performers across both regions.')
for region, ranked in region_rankings.items():
    if ranked.empty:
        print(f'- {region}: no HDI data available for ranking.')
        continue
    best, worst = ranked.iloc[0], ranked.iloc[-1]
    gap = best['avg_hdi'] - worst['avg_hdi']
    print(
        f'- {region}: highest average HDI is {best["country"]} ({best["avg_hdi"]:.3f}), '
        f'lowest is {worst["country"]} ({worst["avg_hdi"]:.3f}); spread = {gap:.3f}.'
    )

In [None]:
# ---------------------------
# Task 4: Metric Comparisons
# ---------------------------
print('--- Metric Comparisons: South Asia vs Middle East ---')

# Metrics to compare; only those present as columns in `combined` are used.
metric_cols = ['gender_development', 'life_expectancy', 'gross_inc_percap']
available_metrics = [m for m in metric_cols if m in combined.columns]
print(f'Available metrics: {available_metrics}')

# Mean of each available metric per region (one row per RegionGroup).
metric_means = (
    combined
    .groupby('RegionGroup')[available_metrics]
    .mean(numeric_only=True)
    .reset_index()
)

print('\nMean Values by Region:')
display(metric_means)

# Long (tidy) form of the same table; retained from the original cell in case
# later cells reference it.
long_format = metric_means.melt(
    id_vars='RegionGroup',
    var_name='Metric',
    value_name='Mean Value'
)

# numeric_only=True can drop a non-numeric column, so only use metrics that
# actually made it into the aggregated table.
plotted_metrics = [m for m in available_metrics if m in metric_means.columns]

# One panel per metric, each with its own y-axis. The metrics differ by orders
# of magnitude, so a single shared axis would flatten the smaller ones.
if plotted_metrics:
    fig, axes = plt.subplots(
        1, len(plotted_metrics),
        figsize=(5 * len(plotted_metrics), 5),
        squeeze=False,
    )
    for ax, metric in zip(axes.ravel(), plotted_metrics):
        sns.barplot(
            data=metric_means, x='RegionGroup', y=metric,
            hue='RegionGroup', dodge=False, ax=ax,
        )
        # hue duplicates the x-axis labels; drop the legend if one was drawn.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title(metric.replace('_', ' ').title())
        ax.set_xlabel('Region')
        ax.set_ylabel('Mean Value')
    fig.suptitle('Comparison of Key Metrics: South Asia vs Middle East (2020–2022)')
    fig.tight_layout()
    plt.show()
else:
    print('\nNo numeric metrics available to plot.')

# Identify greatest disparity (only meaningful when both regions are present)
if len(metric_means) == 2:
    sa_row = metric_means[metric_means['RegionGroup'] == 'South Asia'].iloc[0]
    me_row = metric_means[metric_means['RegionGroup'] == 'Middle East'].iloc[0]

    disparities = {}
    for m in plotted_metrics:
        # Symmetric percentage difference: |a - b| relative to the mean of a
        # and b, so metrics on different scales can be compared fairly.
        avg_val = (sa_row[m] + me_row[m]) / 2
        # NaN or non-positive averages are skipped (NaN > 0 is False).
        if avg_val > 0:
            pct_diff = abs(sa_row[m] - me_row[m]) / avg_val * 100
            disparities[m] = pct_diff

    if disparities:
        greatest_disparity = max(disparities, key=disparities.get)
        print(f'\n** Greatest disparity between regions: {greatest_disparity} **')
        print(f'   Percentage difference: {disparities[greatest_disparity]:.2f}%')
        print('\nAll disparities (% difference):')
        for m, d in sorted(disparities.items(), key=lambda x: -x[1]):
            print(f'  {m}: {d:.2f}%')

        print('\nInterpretation:')
        print(f'- {greatest_disparity} shows the largest gap between regions.')
        print('- GNI per capita typically shows large disparity due to oil wealth in Middle East.')
        print('- Life expectancy may be more similar due to global health improvements.')

In [None]:
# ---------------------------
# Task 5: HDI Disparity
# ---------------------------
print('--- HDI Disparity Analysis ---')

# Per-region spread of HDI: range (max - min) and coefficient of variation
# (CV = std / mean), computed over all rows of `combined` in each region.
disparity = combined.groupby('RegionGroup')['hdi'].agg(['mean', 'std', 'min', 'max']).reset_index()
disparity['Range'] = disparity['max'] - disparity['min']
disparity['CV'] = disparity['std'] / disparity['mean']

disparity.columns = ['Region', 'Mean', 'Std Dev', 'Min HDI', 'Max HDI', 'Range', 'CV']

# Pick the more / less variable region from the unrounded CVs, so rounding
# for display cannot create a tie or flip the result.
more_variation = disparity.loc[disparity['CV'].idxmax(), 'Region']
higher_cv = disparity['CV'].max()

less_variation = disparity.loc[disparity['CV'].idxmin(), 'Region']
lower_cv = disparity['CV'].min()

# Round only for presentation.
disparity = disparity.round(4)

print('\nHDI Disparity Metrics by Region:')
display(disparity[['Region', 'Mean', 'Std Dev', 'Range', 'CV']])

print(f'\n** Region with MORE variation in HDI: {more_variation} (CV = {higher_cv:.4f}) **')
print(f'   Region with less variation: {less_variation} (CV = {lower_cv:.4f})')

# Interpretation
print('\nInterpretation:')
print('- RANGE measures the absolute spread between highest and lowest HDI in a region.')
print('- CV (Coefficient of Variation) measures relative variability, accounting for mean differences.')
print(f'- {more_variation} has higher CV, indicating more inequality in development outcomes.')
print('- Higher CV suggests larger disparities between the most and least developed countries.')

# Visualize the disparity: one panel for Range, one for CV.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax1, ax2 = axes

panels = [
    (ax1, 'Range', 'HDI Range by Region', 'Range (Max - Min)'),
    (ax2, 'CV', 'Coefficient of Variation (CV) by Region', 'CV (Std / Mean)'),
]
for ax, column, title, ylabel in panels:
    # Assign hue explicitly: passing `palette` without `hue` is deprecated in
    # recent seaborn. dodge=False keeps each bar centred on its tick.
    sns.barplot(
        data=disparity, x='Region', y=column,
        hue='Region', dodge=False, palette='Set2', ax=ax,
    )
    # hue duplicates the x-axis labels; drop the legend if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(title)
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

In [None]:
# ---------------------------
# Task 6: Correlation Analysis
# ---------------------------
print('--- Correlation Analysis: HDI vs Key Metrics by Region ---')

correlation_metrics = ['gender_development', 'life_expectancy']


def describe_correlation(r: float) -> str:
    """Return a plain-language label for a Pearson correlation coefficient.

    Parameters
    ----------
    r : float
        Pearson correlation coefficient; may be NaN.

    Returns
    -------
    str
        'Strong' (|r| >= 0.7), 'Moderate' (|r| >= 0.4) or 'Weak', followed by
        the direction and the word 'relationship'. NaN and exactly zero get
        their own labels instead of falling through to 'Weak negative'.
    """
    if pd.isna(r):
        return 'Undefined (correlation could not be computed, e.g. a variable has no variation)'
    if r == 0:
        return 'No linear relationship'
    magnitude = abs(r)
    if magnitude >= 0.7:
        strength = 'Strong'
    elif magnitude >= 0.4:
        strength = 'Moderate'
    else:
        strength = 'Weak'
    direction = 'positive' if r > 0 else 'negative'
    return f'{strength} {direction} relationship'


for region in ['South Asia', 'Middle East']:
    print(f'\n{"="*50}')
    print(f'REGION: {region}')
    print('='*50)

    sub = combined[combined['RegionGroup'] == region].copy()

    for metric in correlation_metrics:
        if metric not in sub.columns:
            print(f'{metric}: Column not available in dataset.')
            continue

        # Keep only rows where both HDI and the metric are present
        analysis_df = sub.dropna(subset=['hdi', metric])

        if len(analysis_df) < 3:
            print(f'{metric}: Not enough data for correlation analysis.')
            continue

        # Compute Pearson correlation
        r = analysis_df['hdi'].corr(analysis_df[metric], method='pearson')
        print(f'\nPearson correlation (HDI vs {metric}): r = {r:.4f}')
        print(f'Interpretation: {describe_correlation(r)}')

        # With an undefined correlation there is no meaningful trend line.
        if pd.isna(r):
            print(f'{metric}: Skipping trendline plot because r is undefined.')
            continue

        # Scatter plot with fitted regression line
        plt.figure(figsize=(8, 5))
        sns.regplot(data=analysis_df, x=metric, y='hdi',
                    scatter_kws={'alpha': 0.6}, line_kws={'color': 'red'})
        plt.title(f'{region}: HDI vs {metric} (r = {r:.3f})')
        plt.xlabel(metric.replace('_', ' ').title())
        plt.ylabel('Human Development Index (HDI)')
        plt.tight_layout()
        plt.show()

print('\n--- Summary of Correlations ---')
print('Life Expectancy typically shows strong positive correlation with HDI')
print('  (it is a direct component of HDI calculation).')
print('Gender Development shows the relationship between gender equality and overall development.')
print('Stronger correlations indicate that improvements in one metric align closely with HDI gains.')

In [None]:
# ---------------------------
# Task 7: Outlier Detection (1.5 × IQR Rule)
# ---------------------------
print('--- Outlier Detection: HDI and GNI per Capita by Region ---')

if 'gross_inc_percap' in combined.columns:
    def detect_outliers_iqr(series):
        """Apply the 1.5 × IQR rule to a numeric Series.

        Returns a tuple ``(flags, lower, upper)``: a boolean Series that is
        True where a value lies strictly below ``lower`` or strictly above
        ``upper``, plus the two fence values themselves.
        """
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        lower = first_quartile - 1.5 * spread
        upper = third_quartile + 1.5 * spread
        flags = series.lt(lower) | series.gt(upper)
        return flags, lower, upper

    banner = '=' * 50
    for region in ('South Asia', 'Middle East'):
        print(f'\n{banner}')
        print(f'REGION: {region}')
        print(banner)

        # Rows for this region that have both HDI and GNI per capita
        in_region = combined['RegionGroup'] == region
        sub = combined[in_region].dropna(subset=['hdi', 'gross_inc_percap']).copy()

        if len(sub) < 5:
            print(f'Not enough data points ({len(sub)}) for robust outlier detection.')
            continue

        # IQR fences and flags for HDI
        out_hdi, hdi_lo, hdi_hi = detect_outliers_iqr(sub['hdi'])
        print(f'HDI bounds: [{hdi_lo:.3f}, {hdi_hi:.3f}]')

        # IQR fences and flags for GNI per capita
        out_gni, gni_lo, gni_hi = detect_outliers_iqr(sub['gross_inc_percap'])
        print(f'GNI bounds: [{gni_lo:.0f}, {gni_hi:.0f}]')

        # A row counts as an outlier when either metric is flagged
        sub['is_outlier'] = out_hdi | out_gni

        n_outliers = sub['is_outlier'].sum()
        print(f'\nTotal outliers detected: {n_outliers}')

        if n_outliers > 0:
            outlier_rows = sub.loc[sub['is_outlier'], ['country', 'year', 'hdi', 'gross_inc_percap']]
            print('\nOutlier records:')
            display(outlier_rows)

        # Scatter of GNI vs HDI; outliers drawn in red, the rest in blue
        plt.figure(figsize=(9, 5))
        colors = ['tab:red' if flagged else 'tab:blue' for flagged in sub['is_outlier']]
        plt.scatter(sub['gross_inc_percap'], sub['hdi'], c=colors, alpha=0.6,
                    edgecolors='black', linewidth=0.5, s=60)

        # Label each outlier point with the first 10 characters of the country name
        flagged_points = sub.loc[sub['is_outlier'], ['country', 'gross_inc_percap', 'hdi']]
        for country_name, gni_value, hdi_value in flagged_points.itertuples(index=False):
            plt.annotate(country_name[:10], (gni_value, hdi_value),
                         fontsize=8, alpha=0.8)

        plt.title(f'{region}: Outliers in GNI per Capita vs HDI (2020–2022)\n(Red = Outlier)')
        plt.xlabel('GNI per Capita')
        plt.ylabel('HDI')
        plt.tight_layout()
        plt.show()

    # Closing discussion, printed once after both regions have been processed
    discussion_lines = (
        '\n--- Discussion: Significance of Outliers ---',
        'OUTLIERS may represent:',
        '1. Exceptional performers: Countries like Israel, UAE, Qatar with very high GNI/HDI',
        '2. Conflict-affected nations: Afghanistan, Yemen with unusually low metrics',
        '3. Small island economies: Maldives with high tourism-based income',
        '4. Data anomalies: Possible measurement or reporting issues',
        '\nOutliers deserve special attention as they may:',
        '- Skew regional averages',
        '- Represent unique development trajectories',
        '- Require targeted policy interventions',
    )
    for discussion_line in discussion_lines:
        print(discussion_line)
else:
    print('gross_inc_percap column not available in dataset.')

: 