In [25]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --------------------------
# Global Variables & Settings
# --------------------------


In [31]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --------------------------
# Global Variables & Settings
# --------------------------
# Environmental columns used for the correlation heatmap.
# The names must match the CSV header exactly; edit them if your file differs.
env_features_to_show = [
    "e_temperature_2m",
    "e_soil_temperature_level_1",
    "e_snow_density",
    "e_surface_pressure",
    "e_surface_thermal_radiation_downwards_sum",
    "e_surface_net_solar_radiation_sum",
    "e_total_evaporation_sum",
]

# matplotlib rcParams overrides (font sizes and an optional LaTeX preamble).
tex_fonts = {
    # Uncomment the next lines if you have LaTeX installed and want to use it:
    # "text.usetex": True,
    # "font.family": "serif",
    "axes.labelsize": 8,
    "font.size": 12,
    "legend.fontsize": 8,
    "axes.titlesize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "text.latex.preamble": r"\usepackage{amsmath}",
}

# Output directory for saving plots and CSV results.
output_dir = r"C:\Users\user\Downloads\Capstone"
# exist_ok=True keeps re-running this cell harmless and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# --------------------------
# Analysis Functions
# --------------------------
def get_missing_values(csv_file, out_dir=None):
    """
    Summarise missing values in a CSV file and save the per-column counts.

    Reads ``csv_file`` using its first column as the index, prints how many
    rows contain at least one missing value, and writes the missing-value
    count of every column that has any to ``<stem>_missing_values.csv``,
    where ``<stem>`` is the input file name without its final extension.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to analyse.
    out_dir : str, optional
        Directory that receives the summary CSV. When omitted, the
        module-level ``output_dir`` is used. The directory is created if
        it does not exist.

    Returns
    -------
    None
    """
    df = pd.read_csv(csv_file, index_col=0)

    # Boolean DataFrame indicating missing values
    missing_values = df.isnull()

    # Count how many rows have at least one missing value
    total_rows_missing = missing_values.any(axis=1).sum()
    print(f"{total_rows_missing} instances have missing values in {csv_file}")

    # Sum missing values per column and keep only columns with at least one
    missing_values_per_column = missing_values.sum()
    columns_with_missing_values = missing_values_per_column[missing_values_per_column > 0]

    target_dir = output_dir if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)

    # splitext removes only the final extension, so "data.v2.csv" keeps its
    # "data.v2" stem instead of being truncated at the first dot.
    stem = os.path.splitext(os.path.basename(csv_file))[0]
    missing_file = os.path.join(target_dir, f"{stem}_missing_values.csv")
    columns_with_missing_values.to_csv(missing_file)
    print(f"Missing values per column saved to: {missing_file}")

def plot_correlation_among_features(csv_file, features_to_show):
    """
    Save a correlation heatmap for selected features of a CSV file.

    Reads ``csv_file`` using its first column as the index, keeps the
    requested features that exist in the data and are numeric, computes
    their pairwise correlation with ``DataFrame.corr()`` and writes an
    annotated heatmap to ``corr_matrix.pdf`` in the module-level
    ``output_dir``.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to read.
    features_to_show : list of str
        Column names to include in the correlation matrix.

    Returns
    -------
    None
        A message is printed and the function returns without plotting
        when no usable feature remains or the correlation matrix is
        entirely NaN.
    """
    df = pd.read_csv(csv_file, index_col=0)

    # Warn if any of the requested features are not found
    missing_features = [f for f in features_to_show if f not in df.columns]
    if missing_features:
        print(f"Warning: The following features are missing in the dataset: {missing_features}")

    # Use only the features that are present
    features_existing = [f for f in features_to_show if f in df.columns]
    if not features_existing:
        print("No valid features available for correlation plot. Skipping correlation plot.")
        return

    # DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so
    # restrict the selection to numeric/bool columns and report what is dropped.
    selected = df[features_existing]
    numeric = selected.select_dtypes(include=["number", "bool"])
    non_numeric = [col for col in selected.columns if col not in numeric.columns]
    if non_numeric:
        print(f"Warning: Skipping non-numeric features: {non_numeric}")
    if numeric.shape[1] == 0:
        print("No numeric features available for correlation plot. Skipping correlation plot.")
        return

    corr_matrix = numeric.corr()

    # Nothing meaningful to draw if every coefficient is NaN
    if np.isnan(corr_matrix.values).all():
        print("Correlation matrix contains only NaN values. Skipping correlation plot.")
        return

    # NOTE: this updates matplotlib's global rcParams, so the settings stay
    # in effect for figures created later in the session.
    plt.rcParams.update(tex_fonts)
    fig, ax = plt.subplots(figsize=(8, 6))  # larger figure for legibility

    # Annotated heatmap with a slightly shrunken, labelled colorbar
    sns.heatmap(corr_matrix, ax=ax, cmap="plasma", annot=True, fmt=".2f",
                cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'})

    ax.set_title("Correlation among selected features", fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=10)

    fig.tight_layout()
    output_file = os.path.join(output_dir, "corr_matrix.pdf")
    # Save and close through the figure handle rather than pyplot's implicit
    # "current figure", so the right figure is always the one written.
    fig.savefig(output_file, dpi=300)
    plt.close(fig)
    print(f"Correlation heatmap saved to: {output_file}")

def plot_distribution(csv_file, columns_of_interest, var_name="variable"):
    """
    Draw a seaborn bar plot (``errorbar="sd"``) for a set of CSV columns.

    The CSV is read with its first column as the index. The requested
    columns that exist are melted into long format, with ``var_name``
    holding the column label and ``"value"`` holding the data, and the plot
    is written to ``<var_name>_distribution.pdf`` in the module-level
    ``output_dir``. Missing columns are reported; if none of the requested
    columns exist the function prints a message and returns without
    plotting.
    """
    frame = pd.read_csv(csv_file, index_col=0)

    # Split the request into columns that are present and columns that are not
    present, absent = [], []
    for name in columns_of_interest:
        (present if name in frame.columns else absent).append(name)

    if absent:
        print(f"Warning: The following columns are missing: {absent}")
    if not present:
        print("No valid columns available for distribution plot. Skipping distribution plot.")
        return

    # Wide -> long: one row per (column label, value) pair
    long_form = frame[present].melt(var_name=var_name, value_name="value")

    figure, axis = plt.subplots(figsize=(3, 4))
    sns.barplot(
        data=long_form,
        x=var_name,
        y="value",
        order=present,
        errorbar="sd",
        palette="dark",
        alpha=0.6,
        ax=axis,
    )
    axis.set_title(f"{var_name} Distribution")
    axis.set_xticklabels(axis.get_xticklabels(), rotation=45, ha="right")
    figure.tight_layout()

    destination = os.path.join(output_dir, f"{var_name}_distribution.pdf")
    figure.savefig(destination, dpi=300)
    plt.close(figure)
    print(f"Distribution plot saved to: {destination}")

# --------------------------
# Main Script
# --------------------------
if __name__ == "__main__":
    # Input dataset (full path to the CSV file)
    csv_file = r"C:\Users\user\Downloads\Capstone\2020_spatial_raw_master.csv"

    # Columns plotted in the example ethnicity bar plot (step 3)
    ethnicity_columns = [
        "c_percent asian",
        "c_percent black",
        "c_percent mixed",
        "c_percent white",
    ]

    # Step 1: report and save the missing-value summary
    get_missing_values(csv_file)

    # Step 2: correlation heatmap of the environmental features
    plot_correlation_among_features(csv_file, env_features_to_show)

    # Step 3: bar plot of the ethnicity columns
    plot_distribution(csv_file, ethnicity_columns, var_name="Ethnicity")


5114 instances have missing values in C:\Users\user\Downloads\Capstone\2020_spatial_raw_master.csv
Missing values per column saved to: C:\Users\user\Downloads\Capstone\2020_spatial_raw_master_missing_values.csv
Correlation heatmap saved to: C:\Users\user\Downloads\Capstone\corr_matrix.pdf
Distribution plot saved to: C:\Users\user\Downloads\Capstone\Ethnicity_distribution.pdf


In [34]:
import pandas as pd

# Location of the dataset whose header we want to inspect
csv_file = r"C:\Users\user\Downloads\Capstone\2020_spatial_raw_master.csv"

# Load the CSV with pandas' default options (no index column is set here)
df = pd.read_csv(csv_file)

# Print the heading and then the column names on the following line
print("Columns in the CSV file:", df.columns.tolist(), sep="\n")

Columns in the CSV file:
['geography code', 'LSOA21NM', 'geometry', 'centroid_x', 'centroid_y', 'c_percent asian', 'c_percent black', 'c_percent mixed', 'c_percent white', 'c_percent sikh', 'c_percent hindu', 'c_percent christian', 'c_percent jewish', 'c_percent buddhist', 'c_percent no religion', 'c_percent muslim', 'c_percent no central heating', 'c_percent wood heating', 'c_percent communal heating', 'c_percent TFW less than 2km', 'c_percent TFW 2km to 5km', 'c_percent TFW 60km and over', 'c_percent WFH', 'c_percent part-time', 'c_percent 15 hours or less worked', 'c_percent 49 or more hours worked', 'c_percent full-time', 'c_percent commute on foot', 'c_percent commute metro rail', 'c_percent commute bus', 'c_percent commute bicycle', 'c_percent commute train', 'c_percent commute car', 'c_percent same address', 'c_percent student moved to address', 'c_percent from within UK moved to address', 'c_percent outside UK moved to address', 'c_percent occupancy rating bedrooms +2', 'c_perc

In [40]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --------------------------
# File Paths & Output Directory
# --------------------------
# Input dataset read by this cell.
csv_file = r"C:\Users\user\Downloads\Capstone\2020_spatial_raw_master.csv"

# Destination for every plot and CSV written by this cell.
# NOTE(review): "Caps" differs from the "Capstone" folder used by the earlier
# analysis cell - confirm this separate output location is intentional.
output_dir = r"C:\Users\user\Downloads\Caps"
# exist_ok=True keeps re-running the cell harmless and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# --------------------------
# Load Data & Basic Summary
# --------------------------
# Load the dataset, using the first CSV column as the index
df = pd.read_csv(csv_file, index_col=0)
print("Columns in the CSV:", df.columns.tolist(), sep="\n")

print("\nData Info:")
df.info()

print("\nDescriptive Statistics:")
print(df.describe())

# Null count per column, then only the columns that actually have nulls
missing_values = df.isna().sum()
missing_subset = missing_values.loc[missing_values > 0]
print("\nMissing values per column (only columns with missing values):")
print(missing_subset)

# Build the destination once and reuse it for the write and the log line
missing_summary_path = os.path.join(output_dir, "missing_values_summary.csv")
missing_subset.to_csv(missing_summary_path)
print(f"Missing values summary saved to: {missing_summary_path}")

# --------------------------
# Define Variable Groups (Update these lists as needed)
# --------------------------
# Outcome columns (the "o_" prefixed per-capita quantities).
health_vars = ["o_asthma_quantity_per_capita", "o_hypertension_quantity_per_capita"]

# Sociodemographic columns (the "c_" prefixed fields):
# ethnicity shares first, then population, income, density and unemployment.
socio_vars = [
    "c_percent asian", "c_percent black", "c_percent mixed", "c_percent white",
    "c_total population",
    "c_net annual income",
    "c_pop_density",
    "c_percent unemployed",
]

# Environmental columns (the "e_" prefixed fields).
env_vars = [
    "e_soil_temperature_level_1",
    "e_snow_density",
    "e_surface_pressure",
    "e_surface_thermal_radiation_downwards_sum",
    "e_NO2",
    "e_total_aerosol_optical_depth_at_550nm_surface",
    "e_particulate_matter_d_less_than_25_um_surface",
    "e_ozone",
    "e_ndvi",
]

# --------------------------
# Combined Correlation Heatmap
# --------------------------
# Keep only the requested variables that exist in df, preserving the
# health -> sociodemographic -> environmental order.
requested_vars = health_vars + socio_vars + env_vars
combined_vars = [name for name in requested_vars if name in df.columns]

if combined_vars:
    corr_matrix = df[combined_vars].corr()

    fig, ax = plt.subplots(figsize=(14, 12))
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
        cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'},
    )
    ax.set_title("Correlation Matrix for Health, Sociodemographic, and Environmental Variables", fontsize=16)
    plt.xticks(fontsize=10, rotation=45)
    plt.yticks(fontsize=10)
    fig.tight_layout()

    combined_corr_file = os.path.join(output_dir, "combined_correlation_matrix.pdf")
    fig.savefig(combined_corr_file, dpi=300)
    plt.close(fig)
    print(f"Combined correlation matrix saved to: {combined_corr_file}")
else:
    print("No selected variables available for combined correlation analysis.")

# --------------------------
# Distribution Plots for Each Group
# --------------------------
def plot_histograms(var_list, group_name):
    """
    Save one histogram (with KDE overlay) per variable in ``var_list``.

    Each variable is looked up in the module-level ``df``; rows with missing
    values are dropped before plotting. Figures are written to
    ``<variable>_distribution.pdf`` in the module-level ``output_dir``.
    Variables that are not columns of ``df`` are reported and skipped.
    ``group_name`` is only used in the progress message.
    """
    for column in var_list:
        # Guard clause: report unknown columns and move on
        if column not in df.columns:
            print(f"Variable {column} not found in the data.")
            continue

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(df[column].dropna(), kde=True, color='skyblue', ax=ax)
        ax.set_title(f"Distribution of {column}", fontsize=14)
        ax.set_xlabel(column, fontsize=12)
        ax.set_ylabel("Frequency", fontsize=12)
        fig.tight_layout()

        output_file = os.path.join(output_dir, f"{column}_distribution.pdf")
        fig.savefig(output_file, dpi=300)
        plt.close(fig)
        print(f"{group_name} variable '{column}' distribution saved to: {output_file}")

# One histogram pass per variable group, in the order health -> socio -> env
for _group_vars, _group_label in (
    (health_vars, "Health Outcome"),
    (socio_vars, "Sociodemographic"),
    (env_vars, "Environmental"),
):
    plot_histograms(_group_vars, _group_label)

# --------------------------
# Scatter Plots with Regression: Environmental vs Health Outcomes
# --------------------------
def plot_scatter_plots(x_vars, y_vars, x_group, y_group):
    """
    Save a scatter plot with a regression fit for every (x, y) pair.

    Columns are read from the module-level ``df``. For each ``y`` in
    ``y_vars`` that exists, every existing ``x`` in ``x_vars`` is plotted
    against it with ``sns.regplot`` and written to
    ``scatter_<y>_vs_<x>.pdf`` in the module-level ``output_dir``.
    Missing columns are reported and skipped; ``x_group`` and ``y_group``
    are only used in the printed messages.
    """
    known_columns = df.columns

    for outcome in y_vars:
        if outcome not in known_columns:
            print(f"{y_group} variable {outcome} not found.")
            continue

        for predictor in x_vars:
            if predictor not in known_columns:
                print(f"{x_group} variable {predictor} not found.")
                continue

            fig, ax = plt.subplots(figsize=(6, 4))
            sns.regplot(x=df[predictor], y=df[outcome], scatter_kws={'alpha': 0.5}, ax=ax)
            ax.set_title(f"{outcome} vs {predictor}", fontsize=14)
            ax.set_xlabel(predictor, fontsize=12)
            ax.set_ylabel(outcome, fontsize=12)
            fig.tight_layout()

            output_file = os.path.join(output_dir, f"scatter_{outcome}_vs_{predictor}.pdf")
            fig.savefig(output_file, dpi=300)
            plt.close(fig)
            print(f"Scatter plot for {y_group} '{outcome}' vs {x_group} '{predictor}' saved to: {output_file}")

# Plot each predictor group against the health outcomes:
# environmental predictors first, then sociodemographic ones.
for _predictor_vars, _predictor_label in (
    (env_vars, "Environmental"),
    (socio_vars, "Sociodemographic"),
):
    plot_scatter_plots(_predictor_vars, health_vars, _predictor_label, "Health Outcome")

print("EDA complete.")


Columns in the CSV:
['LSOA21NM', 'geometry', 'centroid_x', 'centroid_y', 'c_percent asian', 'c_percent black', 'c_percent mixed', 'c_percent white', 'c_percent sikh', 'c_percent hindu', 'c_percent christian', 'c_percent jewish', 'c_percent buddhist', 'c_percent no religion', 'c_percent muslim', 'c_percent no central heating', 'c_percent wood heating', 'c_percent communal heating', 'c_percent TFW less than 2km', 'c_percent TFW 2km to 5km', 'c_percent TFW 60km and over', 'c_percent WFH', 'c_percent part-time', 'c_percent 15 hours or less worked', 'c_percent 49 or more hours worked', 'c_percent full-time', 'c_percent commute on foot', 'c_percent commute metro rail', 'c_percent commute bus', 'c_percent commute bicycle', 'c_percent commute train', 'c_percent commute car', 'c_percent same address', 'c_percent student moved to address', 'c_percent from within UK moved to address', 'c_percent outside UK moved to address', 'c_percent occupancy rating bedrooms +2', 'c_percent occupancy rating be