In [2]:
# Pin scikit-learn for this notebook. %pip (rather than !pip) installs into the
# environment of the running kernel instead of whatever `pip` the subshell finds.
%pip install scikit-learn==1.5.2

Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.3/13.3 MB 88.5 MB/s eta 0:00:00
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
Successfully installed scikit-learn-1.5.2


In [2]:
import pandas as pd

In [3]:
# Load Islamabad_AQI.csv once; `islamabad_data` and `df` are two names for the
# same DataFrame object, so in-place changes made through one are visible via the other.
islamabad_data = pd.read_csv('Islamabad_AQI.csv')
df = islamabad_data

In [4]:
# Breakpoint table for piecewise-linear AQI interpolation.
# Every pollutant shares the same four index bands; only the concentration
# ranges differ. Each resulting row is (c_low, c_high, i_low, i_high).
# NOTE(review): concentration units are not stated anywhere in this notebook -
# confirm they match the units of the CSV columns.
_index_bands = ((0, 50), (51, 100), (101, 150), (151, 200))
_concentration_bands = {
    "PM2.5": ((0, 12), (12.1, 35.4), (35.5, 55.4), (55.5, 150.4)),
    "NO2": ((0, 53), (54, 100), (101, 360), (361, 649)),
    "SO2": ((0, 35), (36, 75), (76, 185), (186, 304)),
}
aqi_breakpoints = {
    pollutant: [conc + index for conc, index in zip(bands, _index_bands)]
    for pollutant, bands in _concentration_bands.items()
}

In [5]:
def calculate_aqi(concentration, pollutant, breakpoints=None):
  """Convert a pollutant concentration into an AQI sub-index.

  Each breakpoint row is ``(c_low, c_high, i_low, i_high)``. A concentration
  inside ``[c_low, c_high]`` is mapped linearly onto ``[i_low, i_high]`` and
  rounded with the built-in ``round``.

  Consecutive rows are not contiguous (for example 12 -> 12.1), so a reading
  such as 12.05 matches no row. Such a reading is assigned the upper index of
  the row just below the gap instead of being reported as missing.

  Args:
    concentration: Measured concentration, in the units of the breakpoint table.
    pollutant: Key into the breakpoint table, e.g. ``"PM2.5"``.
    breakpoints: Optional mapping of pollutant -> list of rows sorted by
      ascending concentration. Defaults to the module-level ``aqi_breakpoints``.

  Returns:
    The AQI sub-index as an int, or None when the concentration is below the
    first row, above the last row, or NaN.

  Raises:
    KeyError: If ``pollutant`` is not a key of the breakpoint table.
  """
  table = aqi_breakpoints if breakpoints is None else breakpoints
  prev_c_high = prev_i_high = None
  for c_low, c_high, i_low, i_high in table[pollutant]:
    # Reading fell between the previous row and this one: cap it at the
    # previous row's top index rather than dropping it.
    if prev_c_high is not None and prev_c_high < concentration < c_low:
      return prev_i_high
    if c_low <= concentration <= c_high:
      aqi = ((i_high - i_low) / (c_high - c_low)) * (concentration - c_low) + i_low
      return round(aqi)
    prev_c_high, prev_i_high = c_high, i_high
  # NaN compares False against every bound and ends up here as well.
  return None

In [6]:
# Show every row that has at least one missing value.
rows_with_missing = df.isna().any(axis="columns")
df[rows_with_missing]

Unnamed: 0,Temperature,Humidity,NO2,SO2,PM2.5,Date
824,27.0,,9.3,13.77,19.3,2021-09-03
840,28.0,,10.1,22.7,34.8,2021-09-19
949,,91.7,7.72,19.66,57.83,2022-01-06
950,,90.1,7.45,18.83,44.67,2022-01-07


In [7]:
# Drop, in place, every row that contains any missing value. Because the frame
# is modified in place, the `islamabad_data` alias sees the same reduced frame.
df.dropna(axis=0, how="any", inplace=True)

In [8]:
# Inspect the frame after the rows with missing values were dropped.
df

Unnamed: 0,Temperature,Humidity,NO2,SO2,PM2.5,Date
0,29.32,33.23,18.61,18.54,32.12,2019-06-01
1,35.04,30.76,20.42,7.77,30.60,2019-06-02
2,25.95,36.43,11.54,11.06,30.18,2019-06-03
3,30.15,38.38,11.71,11.40,30.91,2019-06-04
4,25.87,36.55,11.97,11.57,30.74,2019-06-05
...,...,...,...,...,...,...
1393,18.00,63.00,3.81,16.90,19.70,2023-03-27
1394,18.33,64.00,3.82,17.11,22.00,2023-03-28
1395,17.33,53.33,4.15,16.56,27.33,2023-03-29
1396,16.00,81.00,3.80,15.33,18.90,2023-03-30


In [9]:
# Add one AQI sub-index column per pollutant (AQI_PM2.5, AQI_NO2, AQI_SO2) by
# running calculate_aqi over that pollutant's concentration column.
for pollutant in ("PM2.5", "NO2", "SO2"):
  df[f"AQI_{pollutant}"] = df[pollutant].apply(calculate_aqi, args=(pollutant,))

In [10]:
# The overall AQI of a row is the largest of its per-pollutant sub-indices.
# pandas' max skips missing values by default, so a sub-index that could not be
# computed is ignored rather than propagated.
sub_index_columns = ["AQI_PM2.5", "AQI_NO2", "AQI_SO2"]
df["AQI"] = df[sub_index_columns].max(axis="columns")

In [11]:
# Keep only the four model inputs and the AQI target; every other column
# (including PM2.5 and the per-pollutant sub-indices) is dropped from `df`.
model_columns = ['Temperature', 'Humidity', 'NO2', 'SO2', 'AQI']
df = df.loc[:, model_columns]

In [12]:
# Separate the regression target from the feature columns.
y = df['AQI']
X = df.drop(columns=['AQI'])

In [13]:
# prompt: normalize data

from sklearn.preprocessing import StandardScaler

# Standardize every feature column; X is rebound to the transformed array.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [15]:
from xgboost import XGBRegressor #Import the XGBRegressor instead of XGBClassifier
from sklearn.metrics import mean_squared_error
# Hyperparameters are collected in one place, then the regressor is built and
# fitted on the training split. The fit call stays last so the cell still
# displays the fitted estimator.
xgb_params = dict(n_estimators=500, max_depth=3, learning_rate=0.1)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

In [16]:
# Evaluate the fitted XGBoost model on both splits. The two results are labeled
# separately so the printed lines can be told apart; a test error well above the
# train error means the model fits the training rows better than unseen ones.
y_train_pred = xgb_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
print("Train Mean Squared Error:", train_mse)

y_test_pred = xgb_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print("Test Mean Squared Error:", test_mse)

# `mse` previously ended up holding the test-set error; keep that binding for
# any later cell that reads it.
mse = test_mse

Mean Squared Error: 108.15092934198493
Mean Squared Error: 371.11500959223565


In [17]:
# Write the fitted model to xgb_model.json through the estimator's own save_model method.
xgb_model.save_model("xgb_model.json")

In [25]:
import pickle

# Also keep a pickled copy of the fitted estimator object in xgb_model.pkl.
# NOTE(review): only the model is persisted here; the fitted `scaler` is not saved.
model_bytes = pickle.dumps(xgb_model)
with open("xgb_model.pkl", "wb") as f:
    f.write(model_bytes)