In [10]:
#Import required libraries
import pandas as pd#For data handling
import os
from prophet import Prophet
from tqdm import tqdm#For progress bar on group iteration


#load the file path to your raw data
Environmental_data = os.path.join("C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Cleaned_Environmental_Data_Prophet.csv")

# Load the data into a pandas DataFrame
df = pd.read_csv(Environmental_data)

#Preview the shape and head to confirm it loaded correctly
print("Dataset successfully loaded.")
print(f"Shape of dataset: {df.shape}")#(rows, columns)
print(df.head())#Display first 5 rows
print(df.columns.tolist())#Check column names



Dataset successfully loaded.
Shape of dataset: (309, 39)
   Numeric Area Code  Area Code Local Authority Name  PM2.5 2012 (total)  \
0                1.0  E07000223                 Adur             10.6389   
1                2.0  E07000026            Allerdale              6.5756   
2                4.0  E07000032         Amber Valley             10.4766   
3                5.0  E07000224                 Arun             10.0845   
4                6.0  E07000170             Ashfield             11.1841   

   PM2.5 2012 (non-anthropogenic)  PM2.5 2012 (anthropogenic)  \
0                          2.2917                      8.3472   
1                          1.9619                      4.6137   
2                          2.0689                      8.4077   
3                          2.2713                      7.8132   
4                          2.0643                      9.1198   

   PM2.5 2013 (total)  PM2.5 2013 (non-anthropogenic)  \
0             10.6921                 

In [14]:
#STEP 2: RESHAPE FROM WIDE TO LONG FORMAT
# Identify PM2.5 total columns (exclude anthropogenic breakdowns)
pm_columns = [col for col in df.columns if "PM2.5" in col and "(total)" in col]

# ID columns for location metadata
id_columns = ['Local Authority Name', 'Area Code']

# Convert from wide to long format
df_long_format = df.melt(id_vars=id_columns, value_vars=pm_columns,
                  var_name='year_label', value_name='PM2.5')

# Extract year number and create date column (Prophet uses datetime)
df_long_format['year'] = df_long_format['year_label'].str.extract(r'(\d{4})').astype(int)
df_long_format['ds'] = pd.to_datetime(df_long_format['year'], format='%Y')

# Drop original label column (we don’t need it anymore)
df_long_format = df_long_format.drop(columns=['year_label'])

# OPTIONAL: Check missing data
print(df_long_format.isna().sum())
print(df.head())#Display first 5 rows
print(df.columns.tolist())#Check column names

Local Authority Name      0
Area Code                 0
PM2.5                   100
year                      0
ds                        0
dtype: int64
   Numeric Area Code  Area Code Local Authority Name  PM2.5 2012 (total)  \
0                1.0  E07000223                 Adur             10.6389   
1                2.0  E07000026            Allerdale              6.5756   
2                4.0  E07000032         Amber Valley             10.4766   
3                5.0  E07000224                 Arun             10.0845   
4                6.0  E07000170             Ashfield             11.1841   

   PM2.5 2012 (non-anthropogenic)  PM2.5 2012 (anthropogenic)  \
0                          2.2917                      8.3472   
1                          1.9619                      4.6137   
2                          2.0689                      8.4077   
3                          2.2713                      7.8132   
4                          2.0643                      9.1198   


In [19]:
#STEP 3: DEFINE A FUNCTION TO DETECT GAPS
# Function to check if there are large year gaps
def has_large_year_gaps(group):
    years = sorted(group['year'].dropna().unique())
    if len(years) < 3:
        return True  # Too little data
    gaps = [b - a for a, b in zip(years, years[1:])]
    return max(gaps) > 5  # More than 5-year gap is too large


In [20]:
#STEP 4: IMPUTE WITH PROPHET OR FALLBACK TO MEDIAN
# Container for all imputed groups
imputed_rows = []

# Loop through each Local Authority group
for (loc, code), group in tqdm(df_long_format.groupby(['Local Authority Name', 'Area Code'])):
    group = group.copy()  # Avoid modifying original group
    group_non_missing = group.dropna(subset=['PM2.5'])  # Use only available values

    # Check if Prophet can be applied
    if len(group_non_missing) < 3 or has_large_year_gaps(group_non_missing):
        # Use median imputation as fallback
        median_val = group_non_missing['PM2.5'].median()
        group['PM2.5_imputed'] = group['PM2.5']
        group['Imputation method'] = group['PM2.5'].apply(
            lambda x: 'Original' if pd.notna(x) else 'Median'
        )
        group.loc[group['PM2.5'].isna(), 'PM2.5_imputed'] = median_val
        imputed_rows.append(group)
        continue  # Move to next group

    # Fit Prophet model
    prophet_df = group_non_missing[['ds', 'PM2.5']].rename(columns={'ds': 'ds', 'PM2.5': 'y'})
    m = Prophet(daily_seasonality=False, yearly_seasonality=True)
    m.fit(prophet_df)

    # Create yearly forecast (2012–2023)
    future = pd.DataFrame({'ds': pd.date_range(start='2012-01-01', end='2023-01-01', freq='YS')})
    forecast = m.predict(future)[['ds', 'yhat']]  # Only keep prediction

    # Merge forecast with original group
    merged = pd.merge(group, forecast, on='ds', how='left')
    merged['PM2.5_imputed'] = merged['PM2.5']  # Start with original

    # Label each row by method used
    def get_method(row):
        if pd.notna(row['PM2.5']):
            return 'Original'
        elif pd.notna(row['yhat']):
            return 'Prophet'
        else:
            return 'Median'

    merged['Imputation method'] = merged.apply(get_method, axis=1)

    # Apply Prophet prediction only where PM2.5 is missing
    merged.loc[merged['PM2.5'].isna() & merged['yhat'].notna(), 'PM2.5_imputed'] = merged['yhat']

    # Median fallback in edge cases
    median_val = group_non_missing['PM2.5'].median()
    merged.loc[merged['PM2.5_imputed'].isna(), 'PM2.5_imputed'] = median_val
    merged['Imputation method'] = merged['Imputation method'].replace('Median', 'Median (fallback)')

    imputed_rows.append(merged.drop(columns=['yhat']))


  0%|                                                                                          | 0/309 [00:00<?, ?it/s]02:48:54 - cmdstanpy - INFO - Chain [1] start processing
02:48:55 - cmdstanpy - INFO - Chain [1] done processing
  0%|▎                                                                                 | 1/309 [00:03<18:49,  3.67s/it]02:48:55 - cmdstanpy - INFO - Chain [1] start processing
02:48:55 - cmdstanpy - INFO - Chain [1] done processing
  1%|▌                                                                                 | 2/309 [00:03<08:41,  1.70s/it]02:48:56 - cmdstanpy - INFO - Chain [1] start processing
02:48:56 - cmdstanpy - INFO - Chain [1] done processing
  1%|▊                                                                                 | 3/309 [00:04<05:30,  1.08s/it]02:48:56 - cmdstanpy - INFO - Chain [1] start processing
02:48:56 - cmdstanpy - INFO - Chain [1] done processing
  1%|█                                                                  

In [21]:
# Combine all imputed groups into one final DataFrame
final_df = pd.concat(imputed_rows)

# Save to CSV file
final_df.to_csv("Prophet_Median_Tracked_Imputed_PM25.csv", index=False)

# Final check
print("✅ Imputation complete. File saved as 'Prophet_Median_Tracked_Imputed_PM25.csv'")


✅ Imputation complete. File saved as 'Prophet_Median_Tracked_Imputed_PM25.csv'
