<a href="https://colab.research.google.com/github/Immanuel-01/Global-Health/blob/main/Copy_of_Global_Health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the Global Health Statistics CSV (path is relative to the working
# directory) into the DataFrame used by the rest of the notebook.
gbh = pd.read_csv("Global Health Statistics.csv")

# Report how many records were loaded.
print(f"Total rows in the dataset: {gbh.shape[0]}")

In [None]:
# Report the current row count. No subset of `gbh` has been taken at this point
# in the notebook, so the message refers to the dataset rather than a "subset".
print(f"Number of rows in the dataset: {len(gbh)}")

In [None]:
# Preview the top five records as a quick sanity check of the load.
print(gbh.head(5))

In [None]:
# Show column names, dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
gbh.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numerical columns.
numeric_summary = gbh.describe()
print(numeric_summary)

In [None]:
# Count the missing entries in every column.
print(gbh.isna().sum())

In [None]:
from sklearn.impute import SimpleImputer

# Fill gaps in the numeric columns listed below with each column's median.
numerical_columns = [
    'Average Treatment Cost (USD)',
    'Recovery Rate (%)',
    'DALYs',
    'Improvement in 5 Years (%)',
    'Per Capita Income (USD)',
    'Education Index',
    'Urbanization Rate (%)',
]
imputer = SimpleImputer(strategy='median')
# Learn the medians first, then write the imputed values back in place.
imputer.fit(gbh[numerical_columns])
gbh[numerical_columns] = imputer.transform(gbh[numerical_columns])


In [None]:
# Fill gaps in the categorical columns with each column's most common value.
categorical_columns = [
    'Treatment Type',
    'Availability of Vaccines/Treatment',
]
imputer_cat = SimpleImputer(strategy='most_frequent')
# Learn the modes first, then write the imputed values back in place.
imputer_cat.fit(gbh[categorical_columns])
gbh[categorical_columns] = imputer_cat.transform(gbh[categorical_columns])


In [None]:
# Re-count missing entries per column to confirm the imputation took effect.
remaining_missing = gbh.isna().sum()
print(remaining_missing)


In [None]:
# Count rows that are exact repeats of an earlier row.
duplicate_mask = gbh.duplicated()
print(duplicate_mask.sum())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Tally how many records exist for each disease.
diseases_counts = gbh['Disease Name'].value_counts()

# Bar chart of record counts per disease.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=diseases_counts.index,
    y=diseases_counts.to_numpy(),
    palette="viridis",
)
plt.xlabel("Disease")
plt.ylabel("Count")
plt.title("Distribution of Diseases")
plt.xticks(rotation=45)  # tilt the tick labels so disease names do not overlap
plt.show()

In [None]:
# Count records for every (disease, country) pair and pivot the counts into a
# table with one row per disease and one column per country. Pairs that never
# occur are filled with 0 so that every heatmap cell carries a number.
country_diseases = (
    gbh.groupby(['Disease Name', 'Country'])
    .size()
    .unstack(fill_value=0)
)

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(country_diseases, cmap="YlGnBu", annot=True, fmt="g")
plt.title("Disease Prevalence by Country")
# Table columns (countries) run along the x-axis and table rows (diseases)
# along the y-axis, so the axis labels must follow that orientation.
plt.xlabel("Country")
plt.ylabel("Disease")
plt.show()

In [None]:
# Share of missing entries per column, expressed as a percentage.
missing_percentage = gbh.isna().mean().mul(100)
print(missing_percentage)


In [None]:
# Replace the listed categorical columns with dummy indicator columns.
# drop_first=True omits the first level of each variable.
gbh = pd.get_dummies(
    gbh,
    columns=['Country', 'Disease Name', 'Treatment Type'],
    drop_first=True,
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardize (zero mean, unit variance).
numerical_columns = [
    'Prevalence Rate (%)',
    'Incidence Rate (%)',
    'Mortality Rate (%)',
    'Population Affected',
    'Healthcare Access (%)',
    'Doctors per 1000',
    'Hospital Beds per 1000',
    'Average Treatment Cost (USD)',
    'Recovery Rate (%)',
    'DALYs',
    'Improvement in 5 Years (%)',
    'Per Capita Income (USD)',
    'Education Index',
    'Urbanization Rate (%)',
]

# NOTE(review): the scaler is fit on the whole DataFrame, i.e. before the
# train/test split that happens later in the notebook.
scaler = StandardScaler()
scaler.fit(gbh[numerical_columns])
gbh[numerical_columns] = scaler.transform(gbh[numerical_columns])


In [None]:
# Engineered feature: element-wise product of healthcare access and doctor density.
# NOTE(review): both inputs are in the list of columns standardized earlier in
# the notebook, so this multiplies scaled values - confirm that is intended.
gbh['Healthcare_Quality'] = gbh['Healthcare Access (%)'].mul(gbh['Doctors per 1000'])


In [None]:
# Binary target: 1 when a row's prevalence rate lies strictly above the
# column median, 0 otherwise.
threshold = gbh['Prevalence Rate (%)'].median()
gbh['Prevalence Category'] = gbh['Prevalence Rate (%)'].gt(threshold).astype(int)

# Show the source column next to the derived label for the first rows.
print(gbh[['Prevalence Rate (%)', 'Prevalence Category']].head())


In [None]:
# Features: every column except the target and the prevalence rate it was
# derived from (the engineered 'Healthcare_Quality' column stays in).
X = gbh.drop(['Prevalence Rate (%)', 'Prevalence Category'], axis=1)
# Target: the binary high/low prevalence label.
y = gbh['Prevalence Category']


In [None]:
# Rebuild the feature matrix and target (same definition as the previous cell):
# drop the target and its source column, keep everything else.
X = gbh.drop(['Prevalence Rate (%)', 'Prevalence Category'], axis=1)
y = gbh['Prevalence Category']

# Confirm the row counts of X and y line up.
print(X.shape, y.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the resulting partition sizes.
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Dummy-encode whatever categorical columns remain in X; drop_first avoids
# perfectly collinear indicator columns.
X_encoded = pd.get_dummies(X, drop_first=True)

# Binary target (0 / 1).
y = gbh['Prevalence Category']

# Redo the 80/20 split, this time on the encoded feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Baseline classifier: logistic regression with a raised iteration cap.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics on the test split.
print("Logistic Regression Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred_lr)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred_lr)}")


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the encoded features using statistics from the training split
# only, then apply the same transform to the test split.
# Note: this rebinds `scaler`, replacing the scaler fitted earlier on `gbh`.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths and solvers to search over.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
}
# 5-fold cross-validated search scored on accuracy.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Predict on the held-out split with the best estimator found by the search.
best_lr_model = grid_search.best_estimator_
y_pred_best_lr = best_lr_model.predict(X_test_scaled)

# Report the chosen parameters and test-set metrics.
print("Improved Logistic Regression Evaluation")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_lr)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred_best_lr)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred_best_lr)}")


In [None]:
from sklearn.ensemble import RandomForestClassifier


# Random forest with 100 trees and a fixed seed, trained on the unscaled
# encoded features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Report test-set metrics.
print("Random Forest Classifier Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred_rf)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred_rf)}")



In [None]:
# XGBoost Classifier
import xgboost as xgb

# Gradient-boosted trees with 100 rounds and a fixed seed, evaluated on log-loss.
# The `use_label_encoder` argument is deprecated in XGBoost and is ignored with
# a warning by recent releases, so it is no longer passed; the target here is
# already encoded as integers 0/1.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Report test-set metrics.
print("XGBoost Classifier Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred_xgb)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred_xgb)}")


In [None]:
# Pair each random-forest importance score with its feature name and rank
# them from most to least important.
feature_importance_rf = rf_model.feature_importances_
important_features = pd.Series(
    data=feature_importance_rf,
    index=X_encoded.columns,
).sort_values(ascending=False)

# Bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
top_ten = important_features.head(10)
top_ten.plot(kind='bar', color='skyblue')
plt.title("Top 10 Feature Importances (Random Forest)")
plt.show()


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder

# Map the target labels to integer codes; the encoder is fit on the training
# labels and reused for the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Feed-forward network: 64 -> dropout(0.3) -> 32 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single probability output for the binary task

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit for 50 epochs, holding back 20% of the training rows for validation.
history = model.fit(
    X_train_scaled,
    y_train_encoded,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

# Score the fitted network on the held-out test split.
loss, accuracy = model.evaluate(X_test_scaled, y_test_encoded)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import numpy as np
# List the distinct label values present in the encoded training target.
unique_classes = np.unique(y_train_encoded)
print("Unique Classes in y_train_encoded:", unique_classes)



In [None]:
# Count how many training rows fall into each encoded class.
print("Original Class Distribution in y_train_encoded:")
class_counts = pd.Series(y_train_encoded).value_counts()
print(class_counts)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Model architecture with hyperparameters
def build_model(learning_rate=0.001, dropout_rate=0.3, input_dim=None):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dropout -> Dense(32, relu) ->
    Dense(1, sigmoid).

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        dropout_rate: Rate used by the dropout layer after the first hidden
            layer.
        input_dim: Number of input features. When omitted, the column count of
            the notebook-level ``X_train_scaled`` matrix is used, which is the
            behaviour this function had before the parameter existed.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss
        and reporting accuracy.
    """
    if input_dim is None:
        # Fall back to the training matrix defined earlier in the notebook.
        input_dim = X_train_scaled.shape[1]

    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dropout(dropout_rate),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification
    ])
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# Instantiate the network with build_model's default hyperparameters.
model = build_model()

# Fit for 50 epochs in mini-batches of 32.
# Note: the test split is passed as validation data, so the per-epoch
# validation metrics are computed on the same rows used for the final test.
history = model.fit(
    X_train_scaled,
    y_train_encoded,
    validation_data=(X_test_scaled, y_test_encoded),
    epochs=50,
    batch_size=32,
    verbose=1,
)


Epoch 1/50
12874/12874 ━━━━━━━━━━━━━━━━━━━━ 29s 2ms/step - accuracy: 0.5016 - loss: 0.6986 - val_accuracy: 0.4977 - val_loss: 0.6933
Epoch 2/50
12874/12874 ━━━━━━━━━━━━━━━━━━━━ 28s 2ms/step - accuracy: 0.4998 - loss: 0.6933 - val_accuracy: 0.5023 - val_loss: 0.6932
Epoch 3/50
12874/12874 ━━━━━━━━━━━━━━━━━━━━ 27s 2ms/step - accuracy: 0.5017 - loss: 0.6932 - val_accuracy: 0.5023 - val_loss: 0.6932
Epoch 4/50
12874/12874 ━━━━━━━━━━━━━━━━━━━━ 27s 2ms/step - accuracy: 0.5001 - loss: 0.6932 - val_accuracy: 0.4981 - val_loss: 0.6932
Epoch 5/50
12874/12874 ━━━━━━━━━━━━━━━━━━━━ 26s 2ms/step - accuracy: 0.5012 - loss: 0.6932 - val_accuracy: 0.5022 - val_loss: 0.6931
Epoch 6/50
12874/12874 ━━━━━━━━━━━━━━━━━━━━ 27s 2ms/step - accuracy: 0.5022 - loss: 0.6931 - val_accuracy: 0.4979 - val_loss: 0.693

In [None]:
import matplotlib.pyplot as plt

# Draw one figure per tracked metric, each overlaying the training curve and
# the validation curve across epochs (accuracy first, then loss).
history_plots = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Model Loss', 'Loss'),
)
for train_key, val_key, train_label, val_label, plot_title, y_label in history_plots:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(plot_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


In [None]:
# Score the trained network on the test split without progress output.
evaluation = model.evaluate(X_test_scaled, y_test_encoded, verbose=0)
test_loss, test_accuracy = evaluation
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted probabilities for the test split, turned into 0/1 labels with a
# strict 0.5 cut-off.
y_pred_probs = model.predict(X_test_scaled)
above_cutoff = y_pred_probs > 0.5
y_pred_classes = above_cutoff.astype(int)

# Compare the predicted labels with the encoded ground truth.
print("Confusion Matrix:")
print(confusion_matrix(y_test_encoded, y_pred_classes))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred_classes))


In [None]:
# A single hypothetical observation (one row) to score with the trained model.
new_data = pd.DataFrame([
    {
        'Country': 'United Kingdom',
        'Year': 2025,
        'Disease Name': 'Disease_ABC',  # Would be mapped/encoded if necessary
        'Healthcare Access (%)': 92,  # Example value for UK
        'Doctors per 1000': 2.8,  # Example value for UK
        'Population Affected': 200000,  # Example affected population
        'Average Treatment Cost (USD)': 1500,  # Example value
        'Per Capita Income (USD)': 45000,  # Example value for UK
        'Urbanization Rate (%)': 80,  # Example urbanization rate
        'Recovery Rate (%)': 85,  # Example recovery rate
        'DALYs': 4.5,  # Example value
    }
])


In [None]:
# Preprocess the new observation so it has the same layout as the training
# features. `scaler` was last fit on the one-hot encoded training matrix, so it
# requires exactly the columns of `X_encoded`, in the same order; transforming
# only the handful of raw numeric columns in `new_data` raises a
# feature-mismatch error.
new_data_encoded = pd.get_dummies(new_data, columns=['Country', 'Disease Name'])

# Keep only columns the model knows about and add any missing ones as 0.
# Dummy levels absent from training (for example an unseen disease name) are
# dropped by the reindex.
new_data_aligned = new_data_encoded.reindex(columns=X_encoded.columns, fill_value=0)

new_data_scaled = scaler.transform(new_data_aligned)

# NOTE(review): the training features were standardized once on the full
# DataFrame before this scaler was fit, and 'Healthcare_Quality' was derived
# from those standardized values. The raw example values in `new_data` have not
# been through that first step, so predictions for it are illustrative only
# until the first-stage scaler is kept and applied here as well.


In [None]:
# Score the prepared observation with the trained network.
prediction = model.predict(new_data_scaled)

# Translate the model output for the first (only) row into a readable label
# using a strict 0.5 cut-off.
if prediction[0] > 0.5:
    prediction_label = 'High Prevalence'
else:
    prediction_label = 'Low Prevalence'
print(f"Predicted Prevalence for Disease_ABC in the UK: {prediction_label}")


In [None]:
# Keras models do not provide predict_proba(). The network ends in a single
# sigmoid unit, so predict() already returns the probability of the positive
# ("high prevalence") class, one value per input row; there is no second column
# to index, hence [0][0] for the first row.
predicted_probability = model.predict(new_data_scaled)
print(f"Predicted Probability for High Prevalence: {predicted_probability[0][0]}")
