In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import numpy as np
import os
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score

print("--- Starting Project Evaluation ---")

# Step 1: Data Loading
# Two CSV files are read from fixed Google Drive paths: the daily city-level
# measurements and the stations table that is merged into them later on.
file_path = '/content/drive/MyDrive/Datasets/city_day.csv'
stations_file_path = '/content/drive/MyDrive/Datasets/stations.csv'

try:
    # Keep the try body limited to the calls that can raise FileNotFoundError.
    airpollution_df = pd.read_csv(file_path)
    stations_data = pd.read_csv(stations_file_path)
except FileNotFoundError as e:
    print(f"Error: {e}. Please check the file paths.")
    # Stop with a non-zero status. The bare `exit()` helper is injected by the
    # `site` module for interactive sessions and reports success (status 0),
    # so it is not a reliable way to abort a program on error.
    raise SystemExit(1) from e
else:
    print("1. Data loaded successfully.")

# Step 2: Data Preprocessing and Feature Engineering
print("2. Preprocessing and feature engineering in progress...")

# Merge with stations data to add more features.
# NOTE(review): a left merge on 'City' repeats each daily row once per matching
# row of stations_data. If that table holds several rows per city, the merged
# frame contains duplicated measurements — confirm this is intended.
airpollution_processed = pd.merge(airpollution_df, stations_data, on='City', how='left')

# Convert 'Datetime' column to datetime and set it as the index
airpollution_processed['Datetime'] = pd.to_datetime(airpollution_processed['Datetime'])
airpollution_processed = airpollution_processed.set_index('Datetime')

# Handle missing values using median for numerical columns.
# The filled column is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` operates on an intermediate object,
# emits a FutureWarning, and no longer updates the frame under pandas 3.0.
numeric_cols = [
    col for col in airpollution_processed.columns
    if airpollution_processed[col].dtype in ['float64', 'int64']
]
for col in numeric_cols:
    median_val = airpollution_processed[col].median()
    airpollution_processed[col] = airpollution_processed[col].fillna(median_val)

# Drop original categorical and non-feature columns BEFORE one-hot encoding City.
# Only the columns that are actually present are dropped.
cols_to_drop_before_encoding = ['AQI_Bucket', 'Station', 'State', 'Location']
existing_cols_to_drop_before_encoding = [
    col for col in cols_to_drop_before_encoding
    if col in airpollution_processed.columns
]
airpollution_processed = airpollution_processed.drop(columns=existing_cols_to_drop_before_encoding)

# One-hot encode the 'City' column to handle categorical data; the resulting
# indicator columns are named 'city_<value>' and the original column is removed.
if 'City' in airpollution_processed.columns:
    city_dummies = pd.get_dummies(airpollution_processed['City'], prefix='city', dtype=int)
    airpollution_processed = pd.concat([airpollution_processed, city_dummies], axis=1)
    airpollution_processed = airpollution_processed.drop(columns='City')

# Create new time-based features from the index
airpollution_processed['month'] = airpollution_processed.index.month
airpollution_processed['year'] = airpollution_processed.index.year
airpollution_processed['day_of_week'] = airpollution_processed.index.dayofweek

# All lag / rolling columns are collected here and attached with a single
# concat below. Inserting dozens of columns one at a time fragments the frame
# and makes pandas emit a PerformanceWarning for every further insert.
derived_features = {}

# 'AQI' is the prediction target. A rolling window that contains the current
# row's AQI leaks the target into the features (for example
# 3 * rolling_mean_3 - lag1 - lag2 reproduces it exactly), so rolling
# statistics of the target are computed on the series shifted by one row.
# NOTE(review): shift/rolling follow the row order of the whole frame; if rows
# of several cities are stacked, windows span city boundaries — confirm ordering.
target_col = 'AQI'

if target_col in airpollution_processed.columns:
    past_target = airpollution_processed[target_col].shift(1)
    derived_features['AQI_rolling_avg'] = (
        past_target.rolling(window=3, min_periods=1).mean().to_numpy()
    )

# Create Lag and Rolling features
lag_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI']
lag_steps = [1, 2, 7]
for col in lag_cols:
    if col not in airpollution_processed.columns:
        continue
    for step in lag_steps:
        derived_features[f'{col}_lag{step}'] = airpollution_processed[col].shift(step).to_numpy()

rolling_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI']
window_sizes = [3, 7, 30]
for col in rolling_cols:
    if col not in airpollution_processed.columns:
        continue
    source = airpollution_processed[col]
    if col == target_col:
        # Exclude the current row's target value from its own window.
        source = source.shift(1)
    for window in window_sizes:
        # Windows are counted in rows; min_periods=1 yields a value from the
        # first available observation (std of a single value is NaN).
        windowed = source.rolling(window=window, min_periods=1)
        derived_features[f'{col}_rolling_mean_{window}d'] = windowed.mean().to_numpy()
        derived_features[f'{col}_rolling_std_{window}d'] = windowed.std().to_numpy()

if derived_features:
    # Remove same-named columns first so the concat replaces rather than
    # duplicates them, then attach every new column in one operation.
    airpollution_processed = pd.concat(
        [
            airpollution_processed.drop(columns=list(derived_features), errors='ignore'),
            pd.DataFrame(derived_features, index=airpollution_processed.index),
        ],
        axis=1,
    )

# Handle missing values introduced by lagging and rolling calculations.
# The result is assigned back to the column; `fillna(..., inplace=True)` on a
# selected column is chained assignment and stops working in pandas 3.0.
for col in airpollution_processed.columns:
    if '_lag' in col or '_rolling_' in col:
        airpollution_processed[col] = airpollution_processed[col].fillna(
            airpollution_processed[col].mean()
        )

# Define X (features) and y (target): every numeric column except 'AQI'
# is a candidate feature, and 'AQI' itself is the target.
numeric_predictors = airpollution_processed.select_dtypes(include=np.number).drop(
    columns='AQI', errors='ignore'
)
target_values = airpollution_processed['AQI']

# Keep only rows where neither a feature nor the target is still NaN.
# Plain NumPy masks are combined so no index alignment is involved.
rows_without_nan = (
    numeric_predictors.notna().all(axis=1).to_numpy()
    & target_values.notna().to_numpy()
)
X = numeric_predictors.loc[rows_without_nan]
y = target_values.loc[rows_without_nan]

# Save the feature list for the frontend (column names, in training order)
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'model_features.pkl')
print("Features saved successfully as model_features.pkl")

# Step 3: Hold out 20% of the rows as a test set (seeded for reproducibility).
# NOTE(review): the split is random over rows although the features include
# lags and rolling statistics — confirm a time-ordered split is not required.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Train the XGBoost Model
print("\n--- Training XGBoost Model with New Features ---")

# Hyper-parameters for the regressor (presumably the result of earlier tuning)
best_xgb_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 9,
    'n_estimators': 300,
    'subsample': 0.7,
}

retrained_xgb_model = xgb.XGBRegressor(random_state=42, **best_xgb_params)
retrained_xgb_model.fit(X_train, y_train)

# Step 5: Model Evaluation on the held-out rows
tuned_xgb_predictions_new_features = retrained_xgb_model.predict(X_test)
tuned_xgb_mae_new_features = mean_absolute_error(
    y_test, tuned_xgb_predictions_new_features
)
tuned_xgb_r2_new_features = r2_score(
    y_test, tuned_xgb_predictions_new_features
)

# Output the results
print(f"XGBoost Model Mean Absolute Error (MAE): {tuned_xgb_mae_new_features}")
print(f"XGBoost R-squared Score (R2): {tuned_xgb_r2_new_features}")

# Step 6: Save the final trained model
joblib.dump(retrained_xgb_model, 'air_pollution_xgb_model.pkl')
print("\nModel saved successfully as as air_pollution_xgb_model.pkl")

--- Starting Project Evaluation ---
1. Data loaded successfully.
2. Preprocessing and feature engineering in progress...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  airpollution_processed[col].fillna(median_val, inplace=True)
  airpollution_processed[f'{col}_rolling_std_{window}d'] = airpollution_processed[col].rolling(window=window, min_periods=1).std()
  airpollution_processed[f'{col}_rolling_mean_{window}d'] = airpollution_processed[col].rolling(window=window, min_periods=1).mean()
  airpollution_processed[f'{col}_rolling_std_{window}d'] = airpollution_processed[col].rolling(window=window, min_periods=1).std()
  airpollution_processed[f'{col}_rolling_mean_{window}d'] = airpollution_processed[col].rolling(window=window, min_periods=1).mean()
  airpollution_processed[f'

Features saved successfully as model_features.pkl

--- Training XGBoost Model with New Features ---
XGBoost Model Mean Absolute Error (MAE): 4.1990624177579905
XGBoost R-squared Score (R2): 0.998366390470257

Model saved successfully as as air_pollution_xgb_model.pkl
