# Global Tuberculosis Burden Analysis: A Data Visualization Study

**MCSC 2108: Data Visualization - Final Examination Report**

**Author:** Daniel Wanjal Machimbo  
**Institution:** The Cooperative University of Kenya  
**Program:** Master of Science in Computer Science  
**Date:** October 2025

---

## Executive Summary

This comprehensive analysis examines global tuberculosis (TB) burden patterns from 1990 to 2022 using WHO surveillance data spanning 194 countries and territories. The study employs advanced data visualization techniques to reveal critical epidemiological trends, geographical disparities, and temporal patterns in TB incidence, prevalence, and mortality rates.

**Key Insights:**
- **Global Hotspots**: Sub-Saharan Africa and Southeast Asia exhibit the highest TB incidence rates (>300 per 100,000 population), with South Africa, the Philippines, and India leading absolute case counts
- **Temporal Trends**: While global TB incidence has declined by approximately 2% annually since 2000, progress varies dramatically by region, with some African countries showing minimal improvement
- **Mortality Correlation**: Strong positive correlation (R² > 0.85) between incidence and mortality rates, indicating consistent case fatality patterns across diverse healthcare systems

This analysis provides evidence-based insights for targeted intervention strategies and resource allocation in global TB control programs.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import pycountry
from pathlib import Path
import warnings
from typing import Dict, List, Tuple, Optional
import os
import sys
from datetime import datetime

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Plotting defaults: stock matplotlib style, viridis palette, 300-dpi figures
plt.style.use('default')
sns.set_palette("viridis")
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
})

# Make sure the output folders exist (no error if they are already there)
directories = ['data', 'figures', 'report']
for directory in directories:
    Path(directory).mkdir(exist_ok=True)

# Report the environment used for this run
print("✓ Environment setup complete")
print("✓ Required directories created")
print(f"Python version: {sys.version}")
print(f"Analysis timestamp: {datetime.now():%Y-%m-%d %H:%M:%S}")

## Data Loading & Automatic Inspection

Load the WHO TB burden dataset with robust parsing that handles encoding issues, then run comprehensive data diagnostics.

In [None]:
def _read_csv_with_fallback(file_path: str, encodings: List[str]) -> Tuple[pd.DataFrame, str]:
    """Read *file_path* using the first encoding in *encodings* that decodes it.

    Returns:
        The parsed frame and the name of the encoding that succeeded.

    Raises:
        ValueError: if every candidate encoding fails to decode the file.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding), encoding
        except UnicodeError as exc:  # UnicodeDecodeError is a subclass
            last_error = exc
    raise ValueError("Could not load data with any supported encoding") from last_error


def load_and_inspect_data(file_path: str) -> pd.DataFrame:
    """
    Load TB burden dataset with robust parsing and comprehensive diagnostics

    The CSV is read with the first candidate encoding that decodes it, known
    source column headers are renamed to short snake_case names, and a
    diagnostics report (shape, preview, missing values, temporal and
    geographic coverage, basic quality checks) is printed to stdout.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The loaded DataFrame with standardized column names. Row contents
        are not modified.

    Raises:
        ValueError: if the file cannot be decoded with any candidate encoding.
    """
    # cp1252 must be tried before latin-1: latin-1 maps every byte to a
    # character and therefore never fails, so anything listed after it is
    # unreachable. Trying cp1252 first lets Windows-encoded punctuation
    # (e.g. byte 0x92 -> ’) decode correctly instead of as control characters.
    encodings = ['utf-8', 'utf-8-sig', 'cp1252', 'latin-1']
    df, encoding = _read_csv_with_fallback(file_path, encodings)
    print(f"✓ Successfully loaded data with {encoding} encoding")

    # Comprehensive data diagnostics
    print("\n" + "="*80)
    print("DATA DIAGNOSTICS REPORT")
    print("="*80)

    # Basic shape and structure
    print(f"Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Column mapping for standardization
    column_mapping = {
        'Country or territory name': 'country',
        'ISO 3-character country/territory code': 'iso3',
        'ISO 2-character country/territory code': 'iso2',
        'Region': 'region',
        'Year': 'year',
        'Estimated total population number': 'population',
        'Estimated incidence (all forms) per 100 000 population': 'incidence_per100k',
        'Estimated number of incident cases (all forms)': 'incidence_cases',
        'Estimated prevalence of TB (all forms) per 100 000 population': 'prevalence_per100k',
        'Estimated prevalence of TB (all forms)': 'prevalence_cases',
        'Estimated number of deaths from TB (all forms, excluding HIV)': 'deaths_excl_hiv',
        'Estimated mortality of TB cases (all forms, excluding HIV) per 100 000 population': 'mortality_per100k_excl_hiv',
        'Estimated number of deaths from TB in people who are HIV-positive': 'deaths_hiv_pos',
        'Estimated mortality of TB cases who are HIV-positive, per 100 000 population': 'mortality_per100k_hiv_pos',
    }

    # Only rename headers that are actually present in this file
    available_mappings = {old: new for old, new in column_mapping.items() if old in df.columns}
    df = df.rename(columns=available_mappings)

    print(f"\n✓ Standardized {len(available_mappings)} column names")

    # Preview: use the key identifying columns when all exist, otherwise
    # fall back to the first five columns of the file
    print("\nFirst 10 rows preview:")
    preview_candidates = ['country', 'iso3', 'region', 'year', 'population']
    if all(c in df.columns for c in preview_candidates):
        display_cols = preview_candidates
    else:
        display_cols = df.columns[:5]
    print(df[display_cols].head(10).to_string())

    # Data types and missing values (only columns that have gaps are shown)
    print("\nData Types and Missing Values:")
    missing_count = df.isnull().sum()
    missing_summary = pd.DataFrame({
        'dtype': df.dtypes,
        'missing_count': missing_count,
        'missing_pct': (missing_count / len(df) * 100).round(2)
    })
    print(missing_summary[missing_summary['missing_count'] > 0].head(10))

    # Temporal coverage; non-numeric year values are ignored
    if 'year' in df.columns:
        years = pd.to_numeric(df['year'], errors='coerce').dropna()
        print(f"\nTemporal Coverage: {years.min():.0f} - {years.max():.0f} ({years.nunique()} unique years)")

    # Geographic coverage
    if 'country' in df.columns:
        countries = df['country'].nunique()
        print(f"Geographic Coverage: {countries} unique countries/territories")
        print("Top 10 countries by data availability:")
        country_counts = df['country'].value_counts().head(10)
        for country, count in country_counts.items():
            print(f"  • {country}: {count} records")

    # Identify potential data quality issues
    print("\nData Quality Checks:")

    if 'population' in df.columns:
        # Counts zero/negative populations; values that cannot be parsed as
        # numbers become NaN and are not counted here
        pop_issues = df[pd.to_numeric(df['population'], errors='coerce') <= 0]
        print(f"  • Records with invalid population: {len(pop_issues)}")

    # Check for duplicate records
    duplicates = df.duplicated().sum()
    print(f"  • Exact duplicate rows: {duplicates}")

    return df

# Read the raw TB burden CSV and print its diagnostics report
df_raw = load_and_inspect_data(file_path='TB_Burden_Country.csv')

## Data Preparation & Cleaning

Comprehensive data cleaning pipeline with automatic ISO code generation, outlier detection, and derived variable creation. All transformations are documented and logged for reproducibility.