## Import Libraries

In [49]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json

## Variables

In [27]:
#models_folder = f"datasets/cams_aurn/UK/models/"
#file_checkpoint_ann = "model_site_aurn_cams_max_ann.weights.h5"

dataset_folder = "datasets/UK/"
climate_dataset = Path(dataset_folder)/Path("sentinel_cams_aurn.csv")

health_dataset = Path(dataset_folder)/Path("mental_health.csv")
health_climate_dataset = Path(dataset_folder)/Path("mental_health_climate.csv")

# health_dataset = Path(dataset_folder)/Path("respiratory.csv")
# health_climate_dataset = Path(dataset_folder)/Path("respiratory_climate.csv")

## **1. Load Datasets**

### 1.1 Health Dataset
1. Load the health dataset
2. Covert the IncidentTime to a date with on time element
3. remove rows where spatial data are both zero or NaNs

In [28]:
df_health = pd.read_csv(health_dataset)  # Replace with actual file path

In [29]:
df_health['IncidentTime'] = pd.to_datetime(df_health['IncidentTime']).dt.date

In [30]:
# df_health2 = df_health[(df_health['LSOA Centroid lat'] != 0) & (df_health['LSOA Centroid long'] != 0)]
df_health = df_health[
    (df_health['LSOA Centroid lat'] != 0) & 
    (df_health['LSOA Centroid long'] != 0) & 
    ~df_health['LSOA Centroid lat'].isna() & 
    ~df_health['LSOA Centroid long'].isna()
]

### 1.2. Climate Dataset
1. Load the Climate dataset

In [31]:
df_climate = pd.read_csv(climate_dataset, parse_dates=['Time'], dayfirst=True)

In [32]:
df_climate['Time'] = pd.to_datetime(df_climate['Time'])

### 1.3. Add Station Data to Health Data

#### 1.3.1 Function to compute haversine distance

In [33]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth's radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

#### 1.3.2 Find Closest Station to Health Data

In [34]:
def match_and_filter(df_health, df_climate):
    # Pre-group climate data by date for quick lookup
    climate_groups = {date: group for date, group in df_climate.groupby(df_climate['Time'].dt.date)}
    
    matched_rows = []

    for _, health_row in tqdm(df_health.iterrows(), desc="Matching closest climate station..."):
        incident_date = health_row['IncidentTime']
        
        if incident_date in climate_groups:
            time_filtered = climate_groups[incident_date]
            
            # Extract health location
            health_lat, health_lon = float(health_row['LSOA Centroid lat']), float(health_row['LSOA Centroid long'])
            
            # Extract climate locations as NumPy arrays
            climate_lats = time_filtered['Latitude'].values
            climate_lons = time_filtered['Longitude'].values

            # Compute distances in a vectorized way
            distances = np.array([haversine(health_lat, health_lon, lat, lon) 
                                  for lat, lon in zip(climate_lats, climate_lons)])
            
            # Find the closest climate station
            closest_idx = np.argmin(distances)
            closest_row = time_filtered.iloc[closest_idx]

            # Append matched row
            matched_row = pd.concat([health_row, closest_row])
            matched_rows.append(matched_row)

    return pd.DataFrame(matched_rows) if matched_rows else pd.DataFrame()


In [35]:
# Match and filter the data
df_health_climate = match_and_filter(df_health, df_climate)

Matching closest climate station...: 232953it [05:50, 665.25it/s]


#### 1.3.3. Save the update health data

In [36]:
df_health_climate.to_csv(health_climate_dataset, index=False)

# Utilities: Use to merge CSV files 

In [32]:
import chardet
## Read a small part of the file to detect encoding
with open(Path(dataset_folder) / "2019.csv", 'rb') as f:
    result = chardet.detect(f.read(100000))  # Read first 100,000 bytes

print("Detected encoding:", result['encoding'])

# Now read the CSV using the detected encoding
df_2019 = pd.read_csv(Path(dataset_folder) / "2019.csv", encoding=result['encoding'])
df_2020 = pd.read_csv(Path(dataset_folder) / "2020.csv", encoding=result['encoding'])
df_2021 = pd.read_csv(Path(dataset_folder) / "2021.csv", encoding=result['encoding'])
df_2022 = pd.read_csv(Path(dataset_folder) / "2022.csv", encoding=result['encoding'])
df_2023 = pd.read_csv(Path(dataset_folder) / "2023.csv", encoding=result['encoding'])
df_2024 = pd.read_csv(Path(dataset_folder) / "2024.csv", encoding=result['encoding'])

# Merge three DataFrames with the same columns
df_merged = pd.concat([df_2019.dropna(), df_2020.dropna(), df_2021.dropna(), df_2022.dropna(),
                       df_2023.dropna(), df_2024.dropna()], ignore_index=True)


Detected encoding: ISO-8859-1


  df_2019 = pd.read_csv(Path(dataset_folder) / "2019.csv", encoding=result['encoding'])
  df_2022 = pd.read_csv(Path(dataset_folder) / "2022.csv", encoding=result['encoding'])
  df_2023 = pd.read_csv(Path(dataset_folder) / "2023.csv", encoding=result['encoding'])
  df_2024 = pd.read_csv(Path(dataset_folder) / "2024.csv", encoding=result['encoding'])


In [33]:
df_merged.to_csv(Path(dataset_folder) / "mental_health.csv", encoding="utf-8", index=False)

In [30]:
# Save this data for Common Knowledge

In [42]:
df_climate_reduced_cols = df_climate[
["Time", "SiteNumber", "SiteName", "Longitude", "Latitude", "aurn_go3", "aurn_go3_max", "ch4_c", "CH4_S", "t"]
]

In [43]:
convert_cols=["ch4_c"]

In [44]:
def mass_concentration(mmr, M_air=28.97, p=101325, T=298.15):
    """
    Calculate mass concentration in µg/m^3 from mass mixing ratio (mmr).
    Parameters:
    mmr (float): Mass mixing ratio (dimensionless).
    M_air (float): Molar mass of air in g/mol. Default is 28.97 g/mol.
    p (float): Pressure in Pascals (Pa). Default is 101325 Pa (standard atmospheric pressure).
    T (float): Temperature in Kelvin (K). Default is 298.15 K (25°C).
    
    Returns:
    float: Mass concentration in µg/m^3.
    """
    
    # Universal Gas constant (J/(mol·K))
    R = 8.3144598
    # Calculate mass concentration (µg/m^3)
    mass_concentration_value = 1e6 * mmr * M_air * (p / (R * T))

    return mass_concentration_value

In [45]:
df_climate_reduced_cols = df_climate_reduced_cols.copy() # Create a copy to avoid the view issue
for convert_col in convert_cols:
    df_climate_reduced_cols[convert_col] = df_climate_reduced_cols[convert_col].apply(mass_concentration)

In [51]:
column_docs = {
    "Time": "Represents the date. There is no time component, as the data represents daily measurements.",
    "SiteNumber": "A unique number representing an AURN site.",
    "SiteName": "An AURN-specific site name.",
    "Longitude": "The longitude coordinate of the AURN site location.",
    "Latitude": "The latitude coordinate of the AURN site location.",
    "aurn_go3": "The daily mean of ground-level ozone measured at the AURN site, expressed in µg/m³.",
    "aurn_go3_max": "The daily maximum of ground-level ozone measured at the AURN site, expressed in µg/m³.",
    "ch4_c": "The daily methane measurement at the site (Methane (CH₄) total column), collected from the CAMS dataset. It is originally measured in kg/kg (mmr), but converted to µg/m³.",
    "CH4_S": "The daily methane measurement at the site (Methane (CH₄)), collected from the Sentinel-5P dataset, measured in ppb.",
    "t": "The daily mean temperature, measured in Kelvin (K)."
}


# Save CSV
df_climate_reduced_cols.to_csv(Path(dataset_folder)/Path("climate_CK.csv"), index=False)

# Save metadata separately
metadata = {
    "columns": column_docs,
    "source": "Multiple: AURN, CAMs and Sentinel5p",  
    "curated_by": "John Atanbori"
}

with open(Path(dataset_folder)/Path("climate_metadata_CK.json"), "w") as f:
    json.dump(metadata, f, indent=4)


In [54]:
column_docs = {
    "Time": "Represents the date. There is no time component, as the data represents daily measurements.",
    "SiteNumber": "A unique number representing an AURN site.",
    "SiteName": "An AURN-specific site name.",
    "Longitude": "The longitude coordinate of the AURN site location.",
    "Latitude": "The latitude coordinate of the AURN site location.",
    "Primary_Impression_Code": "The code for the primary diagnosis that a healthcare provider identifies as the most significant at the time of the patient's visit or treatment.",
    "Primary_Impression": "The primary reason the patient is seeking medical attention.",
    "Primary_Impression_Count": "The daily primary impression counts by AURN site, representing cases reported that are closer to that AURN site.",
}

# Save metadata separately
metadata = {
    "columns": column_docs,
    "source": "Multiple: AURN, and EMAS",  
    "curated_by": "John Atanbori"
}

with open(Path(dataset_folder)/Path("health_metadata_CK.json"), "w") as f:
    json.dump(metadata, f, indent=4)


In [52]:
df_health2 = pd.read_csv(Path(dataset_folder)/Path("mental_health_summary.csv"))

Unnamed: 0,Time,SiteNumber,SiteName,Longitude,Latitude,Primary_Impression_Code,Primary_Impression,Primary_Impression_Count
0,2020-01-01,26,Burton-on-Trent Horninglow,-1.635718,52.821050,1.0,Acute Behavioural Disturbance (Mental Health),1
1,2020-01-01,26,Burton-on-Trent Horninglow,-1.635718,52.821050,2.0,Anxiety (Mental Health),1
2,2020-01-01,26,Burton-on-Trent Horninglow,-1.635718,52.821050,3.0,Intentional Drug Overdose (Mental Health),1
3,2020-01-01,39,Chesterfield Loundsley Green,-1.454946,53.244131,2.0,Anxiety (Mental Health),2
4,2020-01-01,39,Chesterfield Loundsley Green,-1.454946,53.244131,3.0,Intentional Drug Overdose (Mental Health),2
...,...,...,...,...,...,...,...,...
100389,2024-06-15,156,Tallington,-0.381000,52.656308,2.0,Anxiety (Mental Health),2
100390,2024-06-15,156,Tallington,-0.381000,52.656308,2.0,Depression (Mental Health),1
100391,2024-06-15,159,Toft Newton,-0.449788,53.374133,2.0,Depression (Mental Health),1
100392,2024-06-15,159,Toft Newton,-0.449788,53.374133,3.0,Self Harm (Mental Health),1
