<a href="https://colab.research.google.com/github/IfeakanduBenedict/pediatric-pneumonia-ml-england/blob/main/Pediatric_Pneumonia_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project Topic: Predicting Pediatric Pneumonia Hospitalisation Rates in English Local Authorities**
   
## Research Questions

**PRIMARY**: To what extent can machine learning models predict emergency pneumonia hospitalisation rates among children and young people under 19 across English Local Authorities using socioeconomic deprivation, demographic vulnerability, and healthcare utilisation indicators?

**SECONDARY**: Which socioeconomic, demographic, and healthcare-related factors contribute most to geographic inequalities in pneumonia hospitalisation rates among children and young people under 19 across England?


# **Notebook Setup and Library Imports**

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr

# Machine Learning - Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler

# Import Variance Inflation Factor (VIF) to detect multicollinearity
# between predictor variables during feature evaluation.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Import hyperparameter tuning tools for model optimization
# GridSearchCV performs exhaustive parameter search,
# while RandomizedSearchCV samples parameter combinations for faster exploration.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Machine Learning - Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Import statistical tests for regression assumptions validation
# shapiro, normaltest: Test normality of residuals
# het_breuschpagan: Test homoscedasticity (constant variance)
# durbin_watson: Test independence (no autocorrelation)
from scipy.stats import shapiro, normaltest
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

# Import statsmodels for adding constant (required by Breusch-Pagan test)
import statsmodels.api as sm

# Machine Learning - Metrics
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             r2_score, mean_absolute_percentage_error)

# Model Interpretation
import shap
from sklearn.inspection import permutation_importance

# Model Persistence
import joblib
import json
from datetime import datetime

# For handling file paths
import os

# ---- Runtime configuration -------------------------------------------------
# Warnings: every warning category is silenced for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings raised
# by the modelling libraries - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: one shared seed constant, also applied to NumPy's global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot appearance: matplotlib base style first, then the seaborn palette and
# context (the order is kept because each call adjusts the plotting defaults).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
sns.set_context("notebook", font_scale=1.1)

# pandas display: show every column and use two digits of display precision.
pd.options.display.max_columns = None
pd.options.display.precision = 2

# Status lines confirming the setup cell ran to completion.
print(
    "✓ All libraries imported successfully!",
    f"✓ Random seed set to: {RANDOM_STATE}",
    sep="\n",
)

✓ All libraries imported successfully!
✓ Random seed set to: 42


# **Connecting to Google Drive**

In [None]:
# Attach Google Drive to the Colab runtime.
# The import is Colab-specific, so this cell only runs inside Google Colab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'  # directory under which Drive is mounted
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# Make the Drive folder that holds the dataset the current working directory,
# so that later relative file paths resolve against it.
dataset_dir = '/content/drive/MyDrive/'
os.chdir(dataset_dir)

## **Loading The Dataset**

In [None]:
# Read the dataset CSV into a DataFrame.
# The file name is relative, so it is resolved against the current working directory.
dataset_filename = 'pediatric_pneumonia_england_LA_dataset_2023_2024.csv'
pneumonia_data = pd.read_csv(dataset_filename)

# **Dataset Overview**

In [None]:
# Preview the first five rows; as the last expression in the cell,
# the result is rendered as the cell output.
pneumonia_data.head(5)

Unnamed: 0,Area_Code,Area_Name,Pneumonia_Rate_per_100k,IMD_Score,Child_Poverty_Pct,Fuel_Poverty_Pct,Free_School_Meals_Pct,Pop_0_4_Years_Pct,Population_Density,Ethnic_Minority_Pct,Overcrowded_HH_Pct,Infant_Mortality_Rate,Birth_Rate_per_1000,Emergency_Admissions_U18,Respiratory_Admissions_0_4,Emergency_Admissions_0_4
0,E07000223,Adur,592.0,17.59,15.3,9.69,38.39,113.75,1538.65,11.07,25.96,4.6,48.4,55.79,178.75,118.62
1,E07000032,Amber Valley,405.2,17.97,21.3,11.94,37.06,84.52,478.3,4.96,29.53,3.95,50.28,39.93,126.77,86.21
2,E07000224,Arun,681.4,18.64,17.4,9.52,34.68,154.41,752.88,11.35,28.75,2.67,47.68,73.94,224.59,167.74
3,E07000170,Ashfield,748.9,26.31,28.2,12.27,34.98,74.14,1160.12,8.18,42.7,5.33,50.73,70.8,229.83,174.23
4,E07000105,Ashford,911.8,18.55,17.1,9.86,34.15,118.05,233.79,17.38,30.65,5.94,53.3,74.18,282.0,180.35
