In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
pollutants=['PM2.5 (ug/m3)','PM10 (ug/m3)','NO2 (ug/m3)','SO2 (ug/m3)','CO_ug (mg/m3)','ozone']

## Train

In [3]:
df = pd.read_csv('train_delhi_aqi_new.csv')

In [4]:
df.head()

Unnamed: 0,PM2_5,PM10,NO2,SO2,CO_ug,Ozone,Temp,RH,WS,BP,WD,AQI (PM2.5)
0,142.7,390.0,17.92,2.78,0.86922,39.24,23.098722,38.61,1.45,942.72473,243.4,195.947367
1,130.43,326.85,17.61,2.69,0.86922,30.93,23.098722,39.48,1.89,942.72473,250.31,189.48947
2,114.45,274.28,24.29,2.45,0.86922,26.15,23.098722,41.33,1.34,942.72473,237.67,181.078946
3,148.47,292.0,37.68,2.5,0.86922,22.16,23.098722,48.07,1.45,942.72473,234.44,198.984211
4,151.22,449.3,46.61,3.21,0.86922,21.95,23.098722,51.32,2.34,942.72473,238.67,200.820001


In [5]:
df.columns

Index(['PM2_5', 'PM10', 'NO2', 'SO2', 'CO_ug', 'Ozone', 'Temp', 'RH', 'WS',
       'BP', 'WD', 'AQI (PM2.5)'],
      dtype='object')

## Normalization

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)
normalized_df = pd.DataFrame(scaled_data, columns=df.columns)

In [7]:
normalized_df.head()

Unnamed: 0,PM2_5,PM10,NO2,SO2,CO_ug,Ozone,Temp,RH,WS,BP,WD,AQI (PM2.5)
0,0.142683,0.389994,0.035823,0.013851,0.0878,0.19616,0.385005,0.386039,0.030303,0.608074,0.676084,0.391793
1,0.130413,0.326843,0.035203,0.013401,0.0878,0.154608,0.385005,0.394739,0.039562,0.608074,0.69528,0.378875
2,0.114432,0.274273,0.048564,0.012201,0.0878,0.130707,0.385005,0.413241,0.027988,0.608074,0.660166,0.362052
3,0.148453,0.291993,0.075346,0.012451,0.0878,0.110756,0.385005,0.480648,0.030303,0.608074,0.651193,0.397868
4,0.151203,0.449294,0.093207,0.016001,0.0878,0.109705,0.385005,0.513151,0.049032,0.608074,0.662944,0.40154


In [8]:
X = normalized_df.drop(columns=['AQI (PM2.5)'])  # Features
y = normalized_df['AQI (PM2.5)']  # Target

## Gradient Boosting Regressor

In [9]:
from sklearn.metrics import mean_squared_error

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=42) 

# Training Gradient Boosting Regressor
gb_regressor.fit(X, y)

## Validation

In [11]:
df_v = pd.read_csv('val_delhi_aqi_new.csv')

In [12]:
scaled_data_v = scaler.fit_transform(df_v)
normalized_df_v = pd.DataFrame(scaled_data_v, columns=df.columns)

In [13]:
X_val = normalized_df_v.drop(columns=['AQI (PM2.5)'])
y_val = normalized_df_v['AQI (PM2.5)']

In [14]:
y_val.info

<bound method Series.info of 0         0.245274
1         0.260077
2         0.238423
3         0.288781
4         0.314201
            ...   
139804    0.468051
139805    0.531882
139806    0.455909
139807    0.455909
139808    0.744917
Name: AQI (PM2.5), Length: 139809, dtype: float64>

In [15]:
# Predictions
y_pred_val = gb_regressor.predict(X_val)

# Evaluating the model
val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)

print("Validation RMSE:", val_rmse)

Validation RMSE: 0.0014770710193058203


In [16]:
from sklearn.metrics import r2_score

# Calculate R-squared score
r2_v = r2_score(y_val, y_pred_val)
print("R-squared score:", r2_v)

R-squared score: 0.9999480430028876


## Test

In [17]:
df_t = pd.read_csv('test_delhi_aqi_new.csv')

In [18]:
scaled_data_t = scaler.fit_transform(df_t)
normalized_df_t = pd.DataFrame(scaled_data_t, columns=df.columns)

In [19]:
X_test = normalized_df_t.drop(columns=['AQI (PM2.5)'])
y_test = normalized_df_t['AQI (PM2.5)']

In [20]:
y_pred_test = gb_regressor.predict(X_test)

test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)

print("Test RMSE:", test_rmse)

Test RMSE: 0.0013332186033744402


In [21]:
from sklearn.metrics import r2_score

r2_t = r2_score(y_test, y_pred_test)
print("R-squared score:", r2_t)

R-squared score: 0.9998839755125422
