In [11]:
# STEP 0: Import necessary libraries
import pandas as pd                     # For data manipulation
import numpy as np                      # For numerical operations
from datetime import datetime           # To record timestamp in metadata
import json                             # To save metadata as a JSON file
import os #OS module = To check if file exist

# STEP 1: Load the cleaned, imputed dataset (from Rule 7)
prophet_imputed_environmental_data = os.path.join("C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Finial_Prophet_Imputed_Cleaned_Environmental_Data.csv")
df = pd.read_csv(prophet_imputed_environmental_data)

# STEP 2: Identify continuous PM2.5 columns for normalization
# We are normalizing these to a 0–1 range using Min-Max scaling
# STEP 2: Identify continuous PM2.5 columns for normalization (corrected)
pm25_cols = [
    'PM2.5_Anthropogenic',
    'PM2.5_Non_Anthropogenic',
    'PM2.5_Total'
]


In [12]:
# STEP 3: Initialize a dictionary to store scaling info
# This helps track min and max values for each column
scaling_info = {}


In [13]:
print(df.columns.tolist())
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces
# Optional debug step:
print("Available columns:")
print(df.columns.tolist())


['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic', 'PM2.5_Total', 'PM2.5_Anthropogenic_ImputationMethod', 'PM2.5_Non_Anthropogenic_ImputationMethod', 'PM2.5_Total_ImputationMethod']
Available columns:
['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic', 'PM2.5_Total', 'PM2.5_Anthropogenic_ImputationMethod', 'PM2.5_Non_Anthropogenic_ImputationMethod', 'PM2.5_Total_ImputationMethod']


In [14]:
# STEP 4: Apply Min-Max Scaling to each PM2.5 column
# We scale AFTER imputation, as required by Rule 9

for col in pm25_cols:
    min_val = df[col].min()
    max_val = df[col].max()

    # Save original min and max for metadata documentation
    scaling_info[col] = {'min': min_val, 'max': max_val}

    # Avoid division by zero in case min == max
    if max_val != min_val:
        df[col] = (df[col] - min_val) / (max_val - min_val)
    else:
        df[col] = 0.0  # If constant column, assign scaled value of 0


In [17]:
#STEP 5: Save the normalized DataFrame to a new CSV file
# This satisfies Rule 10 for reproducibility

output_filename = "Final_Environmental_Data_Normalized_For_ML.csv"
df.to_csv(output_filename, index=False)


In [16]:
#STEP 6: Generate metadata (Rule 12)
# Includes timestamp, row count, scaling info, imputation method counts

# Identify tracking columns used for imputation
imputed_cols = [
    'PM2.5_Anthropogenic_ImputationMethod',
    'PM2.5_Non_Anthropogenic_ImputationMethod',
    'PM2.5_Total_ImputationMethod'
]

# Count how many rows used Prophet or Median for each PM2.5 metric
imputation_summary = {}
for col in imputed_cols:
    imputation_summary[col] = dict(df[col].value_counts(dropna=False))

# Count total number of rows in the final dataset
rows_final = df.shape[0]

# Compile all metadata into a dictionary
metadata = {
    'Timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'Final_Row_Count': rows_final,
    'PM2.5_Columns_Scaled': pm25_cols,
    'Scaling_Parameters': scaling_info,
    'Imputation_Method_Counts': imputation_summary,
    'Note': (
        "Data has been imputed (Prophet or Median), normalized using Min-Max scaling, "
        "and exported for use in ML pipelines. Tracking columns were excluded from scaling."
    )
}

# Save metadata as a JSON file
with open("Final_Environmental_Data_Metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)

TypeError: Object of type int64 is not JSON serializable

In [None]:
# Final print summary for the user

print("✅ Rules 8–12 successfully applied.")
print(f"📁 Normalized dataset saved to: {output_filename}")
print("📄 Metadata saved to: Final_Environmental_Data_Metadata.json")