In [2]:
## Import needed Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')


In [3]:
## Read the Dataset
df_healthcare = pd.read_csv('/workspaces/SudanChapter_AnalyzeHealthcareAccessibility/04_Data_analysis/final_data_folder/final_healthcare_facility_dataset.csv')
df_ndvi = pd.read_csv('/workspaces/SudanChapter_AnalyzeHealthcareAccessibility/04_Data_analysis/final_data_folder/final_ndvi_geographic.csv')
df_population = pd.read_csv('/workspaces/SudanChapter_AnalyzeHealthcareAccessibility/04_Data_analysis/final_data_folder/population-by-cities-data.csv')
df_ndwi = pd.read_csv('/workspaces/SudanChapter_AnalyzeHealthcareAccessibility/04_Data_analysis/final_data_folder/final_ndwi_geographic.csv')
df_socioeconomic = pd.read_csv('/workspaces/SudanChapter_AnalyzeHealthcareAccessibility/04_Data_analysis/final_data_folder/final_socioeco_demographics.csv')


In [4]:
# Display information about each dataset
datasets = {
    "Healthcare": df_healthcare,
    "NDVI": df_ndvi,
    "Population": df_population,
    "NDWI": df_ndwi,
    "Socioeconomic": df_socioeconomic
}

for name, df in datasets.items():
    print(f"Dataset: {name}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("-" * 40)

Dataset: Healthcare
Shape: (1129, 12)
Columns: ['Unnamed: 0', 'name', 'amenity', 'addr_city', 'osm_id', 'osm_type', 'geometry', 'name_english', 'Latitude', 'Longitude', 'City', 'State']
----------------------------------------
Dataset: NDVI
Shape: (141963, 3)
Columns: ['date', 'value', 'state']
----------------------------------------
Dataset: Population
Shape: (71, 5)
Columns: ['city', 'country', 'pop2024', 'latitude', 'longitude']
----------------------------------------
Dataset: NDWI
Shape: (141963, 3)
Columns: ['date', 'value', 'state']
----------------------------------------
Dataset: Socioeconomic
Shape: (74, 34)
Columns: ['Unnamed: 0', 'Access to anti-retroviral drugs, female (%)', 'Access to anti-retroviral drugs, male (%)', 'Agriculture, forestry, and fishing, value added (% of GDP)', 'Current health expenditure (% of GDP)', 'Domestic general government health expenditure (% of GDP)', 'Domestic general government health expenditure (% of current health expenditure)', 'Domestic

In [5]:

# Lowercase column names for consistency
datasets = [df_healthcare, df_ndvi, df_population, df_ndwi, df_socioeconomic]
for df in datasets:
    df.columns = df.columns.str.lower()

In [6]:
# Merge NDVI and NDWI datasets
print(f"NDVI Shape: {df_ndvi.shape}, NDWI Shape: {df_ndvi.shape}")
combined_env_df = pd.merge(df_ndwi, df_ndwi, on=['date', 'state'], how='outer')
print(f"Combined Environmental Data Shape: {combined_env_df.shape}")

NDVI Shape: (141963, 3), NDWI Shape: (141963, 3)
Combined Environmental Data Shape: (2539161, 4)


In [7]:
# Merge healthcare and population data
print(f"Healthcare Shape: {df_healthcare.shape}, Population Shape: {df_population.shape}")
combined_healthcare_population_df = pd.merge(df_healthcare, df_population, on='city', how='outer')
print(f"Combined Healthcare and Population Shape: {combined_healthcare_population_df.shape}")


Healthcare Shape: (1129, 12), Population Shape: (71, 5)
Combined Healthcare and Population Shape: (1173, 16)


In [8]:
# Print column names for debugging
print(f"The columns in Socio-economic DataFrame: {df_socioeconomic.columns.tolist()}")
print(f"The columns in Environmental DataFrame: {combined_env_df.columns.tolist()}")

# Find similar columns
common_columns = set(df_socioeconomic.columns).intersection(set(combined_env_df.columns))
print(f"Common Columns Between the Two DataFrames: {common_columns}")


The columns in Socio-economic DataFrame: ['unnamed: 0', 'access to anti-retroviral drugs, female (%)', 'access to anti-retroviral drugs, male (%)', 'agriculture, forestry, and fishing, value added (% of gdp)', 'current health expenditure (% of gdp)', 'domestic general government health expenditure (% of gdp)', 'domestic general government health expenditure (% of current health expenditure)', 'domestic private health expenditure (% of current health expenditure)', 'immunization, dpt (% of children ages 12-23 months)', 'immunization, hepb3 (% of one-year-old children)', 'immunization, measles (% of children ages 12-23 months)', 'incidence of malaria (per 1,000 population at risk)', 'incidence of tuberculosis (per 100,000 people)', 'internally displaced persons, new displacement associated with conflict and violence (number of cases)', 'internally displaced persons, new displacement associated with disasters (number of cases)', 'internally displaced persons, total displaced by conflict a

In [9]:
df_socioeconomic.head(2)

Unnamed: 0,unnamed: 0,"access to anti-retroviral drugs, female (%)","access to anti-retroviral drugs, male (%)","agriculture, forestry, and fishing, value added (% of gdp)",current health expenditure (% of gdp),domestic general government health expenditure (% of gdp),domestic general government health expenditure (% of current health expenditure),domestic private health expenditure (% of current health expenditure),"immunization, dpt (% of children ages 12-23 months)","immunization, hepb3 (% of one-year-old children)",...,people with basic handwashing facilities including soap and water (% of population),political stability and absence of violence/terrorism: estimate,political stability and absence of violence/terrorism: percentile rank,rural population,rural population (% of total population),tuberculosis treatment success rate (% of new cases),urban population,urban population (% of total population),year,total_deaths_from_violence
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1950.0,
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1951.0,


In [10]:
# Ensure the 'date' column is in datetime format
combined_env_df['date'] = pd.to_datetime(combined_env_df['date'])

# Extract the year and create a new column 'year'
combined_env_df['year'] = combined_env_df['date'].dt.year

# Display the first few rows to verify
print(combined_env_df.head())


        date   value_x      state   value_y  year
0 2015-10-20 -0.011545  AlJazirah -0.011545  2015
1 2015-10-20 -0.011545  AlJazirah -0.011545  2015
2 2015-10-20 -0.011545  AlJazirah -0.011545  2015
3 2015-10-20 -0.011545  AlJazirah -0.011545  2015
4 2015-10-20 -0.011545  AlJazirah -0.011545  2015


In [9]:

#print(combined_env_df['date'].value_counts)
#print(df_socioeconomic['year'].value_counts)

In [10]:
# Merge socio-economic data with environmental data
print(f"Socio-economic Shape: {df_socioeconomic.shape}")
if 'year' in df_socioeconomic.columns:
    combined_env_socio_df = pd.merge(df_socioeconomic, combined_env_df, on='year', how='outer')
else:
    combined_env_socio_df = pd.merge(df_socioeconomic, combined_env_df, on='year', how='outer')
print(f"Combined Environmental and Socio-economic Shape: {combined_env_socio_df.shape}")

Socio-economic Shape: (74, 34)
Combined Environmental and Socio-economic Shape: (2539226, 38)


: 

In [None]:
# Merge all dataframes
print(f"Combining Healthcare/Population Data with Socio-environmental Data...")
final_df = pd.merge(combined_healthcare_population_df, combined_env_socio_df, on='state', how='outer')
print(f"Final Merged DataFrame Shape: {final_df.shape}")

Combining Healthcare/Population Data with Socio-environmental Data...


In [None]:
# Save final dataset
output_path = '/workspaces/SudanChapter_AnalyzeHealthcareAccessibility/04_Data_analysis/final_data_folder/final_combined_dataset.csv'
final_df.to_csv(output_path, index=False)
print(f"Final dataset saved to: {output_path}")