<a href="https://colab.research.google.com/github/Kshitij-Tripathi87/Kshitij_AIH_2025_Screening/blob/main/Kshitij_AIH_2025_Screening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

class DelhiPM25Predictor:
    """Random-forest forecaster for the next day's PM2.5 reading in Delhi.

    Intended call order: ``load_and_clean_data`` ->
    ``create_essential_features`` -> ``train_and_predict``.
    """

    def __init__(self):
        # Fitted RandomForestRegressor; stays None until train_and_predict runs.
        self.model = None

        # Model input columns, in the order used for both fit and predict.
        self.important_features = [
            'PM2.5_lag1', 'PM10_lag1', 'AQI_lag1', 'PM2.5_rolling_3',
            'CO', 'NO2', 'month'
        ]

    def load_and_clean_data(self, file_path):
        """Read the CSV at *file_path* and return its Delhi rows sorted by date.

        Args:
            file_path: Anything ``pandas.read_csv`` accepts (path or buffer).
                The data must contain ``City`` and ``Date`` columns.

        Returns:
            A copy of the rows whose ``City`` equals ``'Delhi'``, with ``Date``
            parsed to datetimes and rows ordered by ascending ``Date``.
        """
        # Use the caller's path; a hard-coded location would make the
        # parameter meaningless.
        df = pd.read_csv(file_path)

        delhi_data = df[df['City'] == 'Delhi'].copy()
        delhi_data['Date'] = pd.to_datetime(delhi_data['Date'])
        delhi_data = delhi_data.sort_values('Date')

        return delhi_data

    def create_essential_features(self, data):
        """Build the training table from date-sorted daily measurements.

        Args:
            data: DataFrame with ``Date`` (datetime), ``PM2.5``, ``PM10``,
                ``NO2``, ``CO`` and ``AQI`` columns, ordered by date.

        Returns:
            A new DataFrame holding those columns plus ``month``,
            ``PM2.5_lag1``, ``PM10_lag1``, ``AQI_lag1`` and
            ``PM2.5_rolling_3``, with gaps forward- then backward-filled.
        """
        essential_cols = ['Date', 'PM2.5', 'PM10', 'NO2', 'CO', 'AQI']
        df = data[essential_cols].copy()

        df['month'] = df['Date'].dt.month

        # Previous-row values of the pollutants.
        df['PM2.5_lag1'] = df['PM2.5'].shift(1)
        df['PM10_lag1'] = df['PM10'].shift(1)
        df['AQI_lag1'] = df['AQI'].shift(1)

        # Shift before rolling so the window only covers the preceding rows.
        # Including the current row would leak the target (same-day PM2.5)
        # into its own feature.
        df['PM2.5_rolling_3'] = (
            df['PM2.5'].shift(1).rolling(window=3, min_periods=1).mean()
        )

        # Forward-fill gaps, then back-fill the leading NaNs left by shift().
        df_clean = df.ffill().bfill()

        return df_clean

    def _next_day_features(self, data, next_date):
        """Build the single feature row describing *next_date*.

        Lag features take the most recent observed value of their base column,
        the rolling feature averages the last three PM2.5 readings, and
        ``month`` comes from *next_date*. Any other feature (CO, NO2) is
        carried forward from the latest row, because the next day's reading
        is not available yet.
        NOTE(review): carrying CO/NO2 forward is a persistence assumption.
        """
        # Sort defensively and forward-fill so the latest row holds the most
        # recent known value of every column.
        history = data.sort_values('Date').ffill()
        lag_suffix = '_lag1'

        row = {}
        for name in self.important_features:
            base = name[:-len(lag_suffix)] if name.endswith(lag_suffix) else None
            if name == 'month':
                row[name] = next_date.month
            elif name == 'PM2.5_rolling_3' and 'PM2.5' in history.columns:
                row[name] = history['PM2.5'].tail(3).mean()
            elif base is not None and base in history.columns:
                row[name] = history[base].iloc[-1]
            else:
                row[name] = history[name].iloc[-1]

        # Column order must match the order used when fitting the model.
        return pd.DataFrame([row], columns=self.important_features)

    def train_and_predict(self, data):
        """Fit the forest on *data* and forecast PM2.5 for the following day.

        Args:
            data: Feature table such as the one returned by
                ``create_essential_features``; needs ``Date``, ``PM2.5`` and
                every column listed in ``self.important_features``.

        Returns:
            The predicted PM2.5 value for the day after the latest ``Date``.
            The prediction is also printed.

        Raises:
            ValueError: If no row has all features and the target present.
        """
        X = data[self.important_features]
        y = data['PM2.5']

        # Train only on rows where every feature and the target are present.
        valid_mask = ~(X.isnull().any(axis=1) | y.isnull())
        X_clean = X[valid_mask]
        y_clean = y[valid_mask]

        if X_clean.empty:
            raise ValueError(
                "No complete rows available to train the PM2.5 model"
            )

        self.model = RandomForestRegressor(
            n_estimators=30,
            max_depth=6,
            min_samples_split=5,
            random_state=42,
            n_jobs=-1
        )

        self.model.fit(X_clean, y_clean)

        last_date = data['Date'].max()
        next_date = last_date + pd.Timedelta(days=1)

        # Predict from features describing the next day. Reusing the last
        # training row would only re-predict a day that is already observed.
        next_features = self._next_day_features(data, next_date)
        prediction = self.model.predict(next_features)[0]

        print(f"Predicted PM2.5 for {next_date.date()}: {prediction:.2f} μg/m³")

        return prediction


def predict_next_day_pm25(file_path='/content/city_day.csv'):
    """Run the whole pipeline and return the next-day PM2.5 forecast.

    Args:
        file_path: CSV location handed to
            ``DelhiPM25Predictor.load_and_clean_data``. Defaults to the Colab
            upload path this notebook previously hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        The value returned by ``DelhiPM25Predictor.train_and_predict``.
    """
    predictor = DelhiPM25Predictor()

    # Load -> engineer features -> fit and forecast.
    data = predictor.load_and_clean_data(file_path)
    processed_data = predictor.create_essential_features(data)

    return predictor.train_and_predict(processed_data)


# Run the pipeline when this cell executes and keep the returned forecast
# in a module-level name for later inspection.
final_prediction = predict_next_day_pm25()

Predicted PM2.5 for 2020-07-02: 44.83 μg/m³
