In [20]:
#Import required libraries
import pandas as pd#For working with tabular data\data handling
import os #OS module = To check if file exist
from prophet import Prophet #Time series model to forecast missing values 
from tqdm import tqdm #Adds a  progress bar on group iteration/loop

# load the file path to your raw data
CLeaned_Environmental_data = os.path.join("C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Training_and_Testing_Dataset/Cleaned_Environmental_Data.csv")

# 📥 Load the data into a pandas DataFrame
df = pd.read_csv(CLeaned_Environmental_data)

#Preview the shape and head to confirm it loaded correctly
print("Dataset successfully loaded.")
print(f"Shape of dataset: {df.shape}")  # (rows, columns)
print(df.head())  # Display first 5 rows


Dataset successfully loaded.
Shape of dataset: (309, 39)
   Numeric Area Code  Area Code Local Authority Name  PM2.5 2012 (total)  \
0                1.0  E07000223                 Adur             10.6389   
1                2.0  E07000026            Allerdale              6.5756   
2                4.0  E07000032         Amber Valley             10.4766   
3                5.0  E07000224                 Arun             10.0845   
4                6.0  E07000170             Ashfield             11.1841   

   PM2.5 2012 (non-anthropogenic)  PM2.5 2012 (anthropogenic)  \
0                          2.2917                      8.3472   
1                          1.9619                      4.6137   
2                          2.0689                      8.4077   
3                          2.2713                      7.8132   
4                          2.0643                      9.1198   

   PM2.5 2013 (total)  PM2.5 2013 (non-anthropogenic)  \
0             10.6921                 

In [21]:
import pandas as pd
# STEP 1 – Identify Column Groups

pm25_total_cols = [col for col in df.columns if col.endswith("(total)")]
pm25_anthro_cols = [col for col in df.columns if col.endswith("(anthropogenic)")]
pm25_nonanthro_cols = [col for col in df.columns if col.endswith("(non-anthropogenic)")]

# Check counts (sanity check)
print(f"Total: {len(pm25_total_cols)}, Anthro: {len(pm25_anthro_cols)}, Non-Anthro: {len(pm25_nonanthro_cols)}")

# STEP 2 – Melt to Long Format by Category
# Metadata columns to keep
id_columns = ['Numeric Area Code', 'Area Code', 'Local Authority Name']

# Helper function to convert one group
def melt_pm25_group(df, cols, label):
    df_long = df.melt(
        id_vars=id_columns,
        value_vars=cols,
        var_name='year_label',
        value_name=label
    )
    # Extract year
    df_long['Year'] = df_long['year_label'].str.extract(r'(\d{4})').astype(int)
    return df_long.drop(columns=['year_label'])

# Apply for each type
df_total = melt_pm25_group(df, pm25_total_cols, 'PM2.5_Total')
df_anthro = melt_pm25_group(df, pm25_anthro_cols, 'PM2.5_Anthropogenic')
df_nonanthro = melt_pm25_group(df, pm25_nonanthro_cols, 'PM2.5_Non_Anthropogenic')

# STEP 3 – Merge all three types by keys
# Merge on all metadata + Year
df_merged = df_total.merge(df_anthro, on=id_columns + ['Year'], how='outer')
df_merged = df_merged.merge(df_nonanthro, on=id_columns + ['Year'], how='outer')

# Final check
print(df_merged.head())

Total: 12, Anthro: 12, Non-Anthro: 12
   Numeric Area Code  Area Code Local Authority Name  PM2.5_Total  Year  \
0                1.0  E07000223                 Adur      10.6389  2012   
1                1.0  E07000223                 Adur      10.6921  2013   
2                1.0  E07000223                 Adur       9.9463  2014   
3                1.0  E07000223                 Adur       9.4679  2015   
4                1.0  E07000223                 Adur      11.1224  2016   

   PM2.5_Anthropogenic  PM2.5_Non_Anthropogenic  
0               8.3472                   2.2917  
1               8.4336                   2.2585  
2               7.6801                   2.2662  
3               8.0684                   1.3994  
4              10.5668                   0.5557  


In [22]:
df_long_format = df_merged.melt(
    id_vars=['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year'],
    value_vars=['PM2.5_Total', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic'],
    var_name='PM2.5_Type',
    value_name='PM2.5'
)

# Add datetime
df_long_format['ds'] = pd.to_datetime(df_long_format['Year'].astype(str) + '-01-01')


In [23]:
# Split based on Year
train_df = df_merged[df_merged['Year'] <= 2021].copy()
test_df = df_merged[df_merged['Year'] > 2021].copy()

# Sanity check
print("Training set:", train_df['Year'].min(), "to", train_df['Year'].max())
print("Test set:", test_df['Year'].min(), "to", test_df['Year'].max())


Training set: 2012 to 2021
Test set: 2022 to 2023


In [24]:
# STEP 3: DEFINE A FUNCTION TO DETECT GAPS
# Define gap-checking function (from your earlier step)
def has_large_year_gaps(group):
    years = sorted(group['Year'].dropna().unique())  
    if len(years) < 3:
        return True  # Not enough data points to model reliably
    gaps = [b - a for a, b in zip(years, years[1:])]
    return max(gaps) > 5  # Flag if any gap is larger than 5 years
#For a given group (e.g. grouped by Local Authority Name), 
#It extracts all available years (ignores missing ones), 
#If there are fewer than 3 years, it flags it as insufficient. 
#Then it checks for gaps larger than 5 years (e.g. 2012 → 2020).
#Returns True if:
#Years are too few, or
#There's a big gap.

In [None]:
# STEP 4 – Impute Missing Values
# Container for storing imputed groups
imputed_rows = []

# Filter: Training only
df_train_long = df_long_format[df_long_format['Year'] <= 2021].copy()

# Loop through TRAINING GROUPS ONLY
grouped = df_train_long.groupby(['Local Authority Name', 'Area Code', 'PM2.5_Type'])

for (loc, code, pm_type), train_group in tqdm(grouped, desc="Imputing training groups"):
    full_group = df_long_format[
        (df_long_format['Local Authority Name'] == loc) &
        (df_long_format['Area Code'] == code) &
        (df_long_format['PM2.5_Type'] == pm_type)
    ].copy()

    train_non_missing = train_group.dropna(subset=['PM2.5'])

    if len(train_non_missing) < 3 or has_large_year_gaps(train_non_missing):
        # Median fallback
        median_val = train_non_missing['PM2.5'].median()
        full_group['PM2.5_imputed'] = full_group['PM2.5']
        full_group['Imputation method'] = full_group['PM2.5'].apply(
            lambda x: 'Original' if pd.notna(x) else 'Median'
        )
        full_group.loc[full_group['PM2.5'].isna(), 'PM2.5_imputed'] = median_val
        imputed_rows.append(full_group)
        continue

    # Prophet fit ONLY on training subset
    prophet_df = train_non_missing[['ds', 'PM2.5']].rename(columns={'PM2.5': 'y'})
    m = Prophet(daily_seasonality=False, yearly_seasonality=True)
    m.fit(prophet_df)

    # Forecast entire range: 2012–2023
    future = pd.DataFrame({'ds': pd.date_range(start='2012-01-01', end='2023-01-01', freq='YS')})
    forecast = m.predict(future)[['ds', 'yhat']]

    # Merge full group (including test years!) with forecast
    merged = pd.merge(full_group, forecast, on='ds', how='left')
    merged['PM2.5_imputed'] = merged['PM2.5']

    def get_method(row):
        if pd.notna(row['PM2.5']):
            return 'Original'
        elif pd.notna(row['yhat']):
            return 'Prophet'
        else:
            return 'Median (fallback)'

    merged['Imputation method'] = merged.apply(get_method, axis=1)
    merged.loc[merged['PM2.5'].isna() & merged['yhat'].notna(), 'PM2.5_imputed'] = merged['yhat']

    # Final fallback (again for full group)
    merged.loc[merged['PM2.5_imputed'].isna(), 'PM2.5_imputed'] = train_non_missing['PM2.5'].median()

    imputed_rows.append(merged.drop(columns=['yhat']))

# Combine
df_imputed = pd.concat(imputed_rows, ignore_index=True)


Imputing training groups:   0%|                                                                | 0/927 [00:00<?, ?it/s]05:25:46 - cmdstanpy - INFO - Chain [1] start processing
05:25:47 - cmdstanpy - INFO - Chain [1] done processing
Imputing training groups:   0%|                                                        | 1/927 [00:01<20:05,  1.30s/it]05:25:47 - cmdstanpy - INFO - Chain [1] start processing
05:25:47 - cmdstanpy - INFO - Chain [1] done processing
Imputing training groups:   0%|                                                        | 2/927 [00:01<11:02,  1.40it/s]05:25:48 - cmdstanpy - INFO - Chain [1] start processing
05:25:48 - cmdstanpy - INFO - Chain [1] done processing
Imputing training groups:   0%|▏                                                       | 3/927 [00:01<07:59,  1.93it/s]05:25:48 - cmdstanpy - INFO - Chain [1] start processing
05:25:48 - cmdstanpy - INFO - Chain [1] done processing
Imputing training groups:   0%|▏                                        

In [26]:
print(df_imputed['Imputation method'].value_counts())


Imputation method
Original    10824
Prophet       219
Median         81
Name: count, dtype: int64


In [27]:
# Pivot back to wide format
df_wide = df_imputed.pivot_table(
    index=['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year'],
    columns='PM2.5_Type',
    values='PM2.5_imputed'
).reset_index()

# Flatten column names (optional, if needed)
df_wide.columns.name = None  # Remove the index name from pivot


In [28]:
df_imputed.to_csv("Imputed_Environmental_Data.csv", index=False)
