In [16]:
#Import required libraries
import pandas as pd#For working with tabular data\data handling
import os #OS module = To check if file exist
from prophet import Prophet #Time series model to forecast missing values 
from tqdm import tqdm #Adds a  progress bar on group iteration/loop

# load the file path to your raw data
CLeaned_Environmental_data = os.path.join("C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Training_and_Testing_Dataset/Cleaned_Environmental_Data.csv")

# 📥 Load the data into a pandas DataFrame
df = pd.read_csv(CLeaned_Environmental_data)

#Preview the shape and head to confirm it loaded correctly
print("Dataset successfully loaded.")
print(f"Shape of dataset: {df.shape}")  # (rows, columns)
print(df.head())  # Display first 5 rows


Dataset successfully loaded.
Shape of dataset: (309, 39)
   Numeric Area Code  Area Code Local Authority Name  PM2.5 2012 (total)  \
0                1.0  E07000223                 Adur             10.6389   
1                2.0  E07000026            Allerdale              6.5756   
2                4.0  E07000032         Amber Valley             10.4766   
3                5.0  E07000224                 Arun             10.0845   
4                6.0  E07000170             Ashfield             11.1841   

   PM2.5 2012 (non-anthropogenic)  PM2.5 2012 (anthropogenic)  \
0                          2.2917                      8.3472   
1                          1.9619                      4.6137   
2                          2.0689                      8.4077   
3                          2.2713                      7.8132   
4                          2.0643                      9.1198   

   PM2.5 2013 (total)  PM2.5 2013 (non-anthropogenic)  \
0             10.6921                 

In [17]:
import pandas as pd
# STEP 1 – Identify Column Groups

pm25_total_cols = [col for col in df.columns if col.endswith("(total)")]
pm25_anthro_cols = [col for col in df.columns if col.endswith("(anthropogenic)")]
pm25_nonanthro_cols = [col for col in df.columns if col.endswith("(non-anthropogenic)")]

# Check counts (sanity check)
print(f"Total: {len(pm25_total_cols)}, Anthro: {len(pm25_anthro_cols)}, Non-Anthro: {len(pm25_nonanthro_cols)}")

# STEP 2 – Melt to Long Format by Category
# Metadata columns to keep
id_columns = ['Numeric Area Code', 'Area Code', 'Local Authority Name']

# Helper function to convert one group
def melt_pm25_group(df, cols, label):
    df_long = df.melt(
        id_vars=id_columns,
        value_vars=cols,
        var_name='year_label',
        value_name=label
    )
    # Extract year
    df_long['Year'] = df_long['year_label'].str.extract(r'(\d{4})').astype(int)
    return df_long.drop(columns=['year_label'])

# Apply for each type
df_total = melt_pm25_group(df, pm25_total_cols, 'PM2.5_Total')
df_anthro = melt_pm25_group(df, pm25_anthro_cols, 'PM2.5_Anthropogenic')
df_nonanthro = melt_pm25_group(df, pm25_nonanthro_cols, 'PM2.5_Non_Anthropogenic')

# STEP 3 – Merge all three types by keys
# Merge on all metadata + Year
df_merged = df_total.merge(df_anthro, on=id_columns + ['Year'], how='outer')
df_merged = df_merged.merge(df_nonanthro, on=id_columns + ['Year'], how='outer')

# Final check
print(df_merged.head())

Total: 12, Anthro: 12, Non-Anthro: 12
   Numeric Area Code  Area Code Local Authority Name  PM2.5_Total  Year  \
0                1.0  E07000223                 Adur      10.6389  2012   
1                1.0  E07000223                 Adur      10.6921  2013   
2                1.0  E07000223                 Adur       9.9463  2014   
3                1.0  E07000223                 Adur       9.4679  2015   
4                1.0  E07000223                 Adur      11.1224  2016   

   PM2.5_Anthropogenic  PM2.5_Non_Anthropogenic  
0               8.3472                   2.2917  
1               8.4336                   2.2585  
2               7.6801                   2.2662  
3               8.0684                   1.3994  
4              10.5668                   0.5557  


In [18]:
df_long_format = df_merged.melt(
    id_vars=['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year'],
    value_vars=['PM2.5_Total', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic'],
    var_name='PM2.5_Type',
    value_name='PM2.5'
)

# Add datetime
df_long_format['ds'] = pd.to_datetime(df_long_format['Year'].astype(str) + '-01-01')


In [19]:
# Split based on Year
train_df = df_merged[df_merged['Year'] <= 2021].copy()
test_df = df_merged[df_merged['Year'] > 2021].copy()

# Sanity check
print("Training set:", train_df['Year'].min(), "to", train_df['Year'].max())
print("Test set:", test_df['Year'].min(), "to", test_df['Year'].max())


Training set: 2012 to 2021
Test set: 2022 to 2023


In [20]:
# Create training data in long format similar to df_long_format but only for training years
df_train_long = train_df.melt(
    id_vars=['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year'],
    value_vars=['PM2.5_Total', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic'],
    var_name='PM2.5_Type',
    value_name='PM2.5'
)

# Add datetime column for Prophet or other purposes
df_train_long['ds'] = pd.to_datetime(df_train_long['Year'].astype(str) + '-01-01')


In [21]:
#Validation Function to check of a region has enough continuous yearly data for prophet.
def validate_region_for_prophet(group):
    """
    Check if a region has enough continuous yearly data for Prophet.
    """
    years = sorted(group['Year'].dropna().unique())
    expected_years = list(range(min(years), max(years) + 1))
    is_continuous = years == expected_years
    enough_data = len(years) >= 5  # Set threshold, e.g., 5+ years
    
    return is_continuous and enough_data


In [22]:
#Flags on training data indicating if prophet should be used
# 2. Generate flags for each group
group_keys = ['Local Authority Name', 'Area Code', 'PM2.5_Type']
validation_flags = []

# Loop through regions to assign flags
for keys, group in df_train_long.groupby(group_keys):
    flag = validate_region_for_prophet(group)
    validation_flags.append({
        'Local Authority Name': keys[0],
        'Area Code': keys[1],
        'PM2.5_Type': keys[2],
        'good_for_prophet': flag
    })

# Create a DataFrame of flags
df_flags = pd.DataFrame(validation_flags)

# Merge flags into the training long dataframe
df_train_long = df_train_long.merge(df_flags, on=group_keys, how='left')


In [23]:
# STEP 3: DEFINE A FUNCTION TO DETECT GAPS
# Define gap-checking function (from your earlier step)
def has_large_year_gaps(group):
    years = sorted(group['Year'].dropna().unique())  
    if len(years) < 3:
        return True  # Not enough data points to model reliably
    gaps = [b - a for a, b in zip(years, years[1:])]
    return max(gaps) > 5  # Flag if any gap is larger than 5 years
#For a given group (e.g. grouped by Local Authority Name), 
#It extracts all available years (ignores missing ones), 
#If there are fewer than 3 years, it flags it as insufficient. 
#Then it checks for gaps larger than 5 years (e.g. 2012 → 2020).
#Returns True if:
#Years are too few, or
#There's a big gap.

In [28]:
# -----------------------------
# 3. Conditional Imputation Loop on training set data
# -----------------------------
imputed_rows = []

for keys, group in df_train_long.groupby(group_keys):
    use_prophet = group['good_for_prophet'].iloc[0]

    if use_prophet:
        # Prophet imputation block (same as before)
        df_prophet = group[['Year', 'PM2.5']].copy()
    df_prophet = df_prophet.rename(columns={'Year': 'ds', 'PM2.5': 'y'})
    df_prophet['ds'] = pd.to_datetime(df_prophet['ds'], format='%Y')

    if df_prophet['y'].notna().sum() >= 2:
        m = Prophet(yearly_seasonality=False, daily_seasonality=False, weekly_seasonality=False)
        m.fit(df_prophet.dropna())
    
        future = m.make_future_dataframe(periods=0, freq='YE')
        forecast = m.predict(future)

        merged = df_prophet.merge(forecast[['ds', 'yhat']], on='ds', how='left')
        merged['y'] = merged['y'].fillna(merged['yhat'])
        group['PM2.5'] = merged['y'].values
    else:
        # Not enough data for Prophet, fallback to median imputation
        median_value = group['PM2.5'].median()
        group['PM2.5'] = group['PM2.5'].fillna(median_value)
        group['Imputation method'] = 'Median'

    imputed_rows.append(group)

# Combine all imputed rows
df_imputed = pd.concat(imputed_rows)


02:33:25 - cmdstanpy - INFO - Chain [1] start processing
02:33:25 - cmdstanpy - INFO - Chain [1] done processing
02:33:26 - cmdstanpy - INFO - Chain [1] start processing
02:33:26 - cmdstanpy - INFO - Chain [1] done processing
02:33:26 - cmdstanpy - INFO - Chain [1] start processing
02:33:26 - cmdstanpy - INFO - Chain [1] done processing
02:33:26 - cmdstanpy - INFO - Chain [1] start processing
02:33:26 - cmdstanpy - INFO - Chain [1] done processing
02:33:26 - cmdstanpy - INFO - Chain [1] start processing
02:33:26 - cmdstanpy - INFO - Chain [1] done processing
02:33:27 - cmdstanpy - INFO - Chain [1] start processing
02:33:27 - cmdstanpy - INFO - Chain [1] done processing
02:33:27 - cmdstanpy - INFO - Chain [1] start processing
02:33:27 - cmdstanpy - INFO - Chain [1] done processing
02:33:27 - cmdstanpy - INFO - Chain [1] start processing
02:33:27 - cmdstanpy - INFO - Chain [1] done processing
02:33:27 - cmdstanpy - INFO - Chain [1] start processing
02:33:27 - cmdstanpy - INFO - Chain [1]

In [29]:
print(df_imputed['Imputation method'].value_counts())


Imputation method
Median    90
Name: count, dtype: int64


In [31]:
# Pivot back to wide format
df_wide = df_imputed.pivot_table(
    index=['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year'],
    columns='PM2.5_Type',
    values='PM2.5'
).reset_index()

# Flatten column names (optional, if needed)
df_wide.columns.name = None  # Remove the index name from pivot


In [33]:
df_wide.to_csv("Imputed_training_Environmental_Data.csv", index=False)


In [34]:
#Step 6: Split (Again) into Train/Test Sets
# Reload the pivoted wide dataset (optional if already in memory)
df_wide = pd.read_csv("Imputed_training_Environmental_Data.csv")

# Split based on year
train_df = df_wide[df_wide['Year'] <= 2021].copy()
test_df = df_wide[df_wide['Year'] > 2021].copy()

print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")


Train set shape: (3004, 7)
Test set shape: (0, 7)


In [35]:
#Step 7: Min-Max Scaling (Training Set Only)
!pip install scikit-learn
from sklearn.preprocessing import MinMaxScaler

# Identify PM2.5 columns
pm_cols = ['PM2.5_Total', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic']

# Initialise scaler and fit on training set
scaler = MinMaxScaler()
train_scaled_pm25 = scaler.fit_transform(train_df[pm_cols])

# Replace original PM2.5 columns with scaled ones in train set
train_df_scaled = train_df.copy()
train_df_scaled[pm_cols] = train_scaled_pm25



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [16]:
#Step 8: Apply Same Scaling to Test Set
# Apply the same transformation (no fitting!)
#test_scaled_pm25 = scaler.transform(test_df[pm_cols])

#test_df_scaled = test_df.copy()
#test_df_scaled[pm_cols] = test_scaled_pm25


In [37]:
train_df_scaled.to_csv("Train_Scaled_PM25.csv", index=False)
#test_df_scaled.to_csv("Test_Scaled_PM25.csv", index=False)
