In [1]:
# Data-handling libraries used by the cells below (pandas for tables, numpy for numeric helpers).
import pandas as pd
import numpy as np


# Print a title banner so the cell output shows which notebook this is.
print("Delhi Air Quality Project - Model Training")

Delhi Air Quality Project - Model Training


In [6]:
# Raw AQI export: read it and echo its column index as a quick schema check.
aqi_df = pd.read_csv('../data/raw/aqi.csv')
print("Data Loaded Successfully - Here are the columns:", aqi_df.columns, sep="\n")

# Merged AQI + weather table used by the rest of the notebook.
# The `date` column is converted with pd.to_datetime right after loading
# (presumably it is stored as text in the CSV -- verify against the file).
df = pd.read_csv('../data/processed/final_delhi_aqi_weather_merged_final.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
print(" Data Loaded Successfully")

Data Loaded Successfully - Here are the columns:
Index(['date', 'state', 'area', 'number_of_monitoring_stations',
       'prominent_pollutants', 'aqi_value', 'air_quality_status', 'unit',
       'note'],
      dtype='object')
 Data Loaded Successfully


In [None]:
def create_ml_features(df):
    """
    Creating features for AQI prediction.

    Works on a copy of ``df`` (the input frame is not modified), adds lag,
    rolling-average and calendar features, and returns the enriched copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``AQI`` column and a datetime-typed ``date`` column.
        The lag and rolling features are purely positional (``shift`` /
        ``rolling``), so rows are assumed to be in chronological order with
        one row per day -- TODO confirm against the merged dataset.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns: ``yesterday_aqi``,
        ``day_before_yesterday_aqi``, ``last_3days_avg``, ``last_7days_avg``,
        ``last_14days_avg``, ``day_of_week``, ``is_weekend``, ``month``,
        ``month_sin``, ``month_cos``, ``day_of_year``, ``year_progress``.
        The leading rows hold NaN in the lag/rolling columns because the
        row-dropping step further down is currently disabled.

    Raises
    ------
    KeyError
        If ``AQI`` or ``date`` is missing from ``df``.
    AttributeError
        Raised by the pandas ``.dt`` accessor if ``date`` is not datetime-like.
    """
    # Fail early with a readable message instead of a bare KeyError mid-way.
    missing = [col for col in ('AQI', 'date') if col not in df.columns]
    if missing:
        raise KeyError(f"create_ml_features: missing required column(s): {missing}")

    features = df.copy()

    # ========== TIME-BASED FEATURES ==========
    # 1. Lag features (most important!)
    features['yesterday_aqi'] = features['AQI'].shift(1)
    features['day_before_yesterday_aqi'] = features['AQI'].shift(2)

    # 2. Rolling averages
    # The extra shift(1) keeps the current row's AQI out of its own window,
    # so each average only covers the preceding rows.
    features['last_3days_avg'] = features['AQI'].rolling(3).mean().shift(1)
    features['last_7days_avg'] = features['AQI'].rolling(7).mean().shift(1)
    features['last_14days_avg'] = features['AQI'].rolling(14).mean().shift(1)

    # 3. Day of week effects (Monday=0)
    features['day_of_week'] = features['date'].dt.dayofweek
    features['is_weekend'] = (features['day_of_week'] >= 5).astype(int)

    # 4. Monthly/seasonal patterns
    # sin/cos encoding keeps December (12) adjacent to January (1).
    features['month'] = features['date'].dt.month
    features['month_sin'] = np.sin(2 * np.pi * features['month']/12)
    features['month_cos'] = np.cos(2 * np.pi * features['month']/12)

    # 5. Year progress (0 to 1)
    # Divide by the real length of each row's year; a fixed 365 pushes
    # 31 December of a leap year above 1.
    features['day_of_year'] = features['date'].dt.dayofyear
    days_in_year = np.where(features['date'].dt.is_leap_year, 366, 365)
    features['year_progress'] = features['day_of_year'] / days_in_year

    # NOTE: everything below, down to the final return, is disabled
    # work-in-progress kept verbatim. It is not executed.
    #  WEATHER FEATURES 
#     if 'wind_speed' in features.columns:
#         features['yesterday_wind'] = features['wind_speed'].shift(1)
#         features['wind_category'] = pd.cut(features['wind_speed'], 
#                                           bins=[0, 2, 4, 6, 8, 20],
#                                           labels=[0, 1, 2, 3, 4])
    
#     if 'temperature_2m_mean' in features.columns:
#         features['yesterday_temp'] = features['temperature_2m_mean'].shift(1)
#         features['temp_change'] = features['temperature_2m_mean'].diff()
        
#     if 'precipitation_mm' in features.columns:
#         features['yesterday_rain'] = features['precipitation_mm'].shift(1)
#         features['rain_last_3days'] = features['precipitation_mm'].rolling(3).sum().shift(1)
#         features['had_rain_yesterday'] = (features['precipitation_mm'].shift(1) > 0).astype(int)
    
#     if 'humidity_percent' in features.columns:
#         features['yesterday_humidity'] = features['humidity_percent'].shift(1)
    
#     # ========== POLLUTANT FEATURES ==========
#     pollutant_cols = [col for col in features.columns if col.startswith('has_')]
#     for col in pollutant_cols:
#         features[f'yesterday_{col}'] = features[col].shift(1)
    
#     # ========== INTERACTION FEATURES ==========
#     # Cold + low wind = worst pollution
#     if 'temperature_2m_mean' in features.columns and 'wind_speed' in features.columns:
#         features['cold_low_wind'] = ((features['temperature_2m_mean'] < 15) & 
#                                      (features['wind_speed'] < 2)).astype(int)
    
#     # Weekend + bad air quality day before
#     features['weekend_after_bad_air'] = ((features['is_weekend'] == 1) & 
#                                         (features['yesterday_aqi'] > 200)).astype(int)
    
#     # ========== TARGET VARIABLE ==========
#     # Predict tomorrow's AQI
#     features['target_aqi'] = features['AQI'].shift(-1)
    
#     # Remove rows with NaN values (from shifting)
#     features = features.dropna()
    
#     print(f"✅ Created {len(features.columns)} features total")
#     print(f"   • Time features: {sum(['aqi' in col.lower() or 'day' in col.lower() or 'month' in col.lower() for col in features.columns])}")
#     print(f"   • Weather features: {sum(['temp' in col.lower() or 'wind' in col.lower() or 'rain' in col.lower() or 'humid' in col.lower() for col in features.columns])}")
#     print(f"   • Target samples: {len(features)}")

    # Hand the engineered frame back; without this the function returned None.
    return features

# # Create features
# features_df = create_ml_features(df)