In [1]:
# Read air quality data for all cities, add a 'city' column, and combine everything into one dataset.
import pandas as pd

files = {
    "Islamabad": "islamabad_complete_data.xlsx",
    "Karachi": "karachi_complete_data.xlsx",
    "Lahore": "lahore_complete_data.xlsx",
    "Peshawar": "peshawar_complete_data.csv",
    "Quetta": "quetta_complete_data.csv"
}

dfs = []

for city, file in files.items():
    if file.endswith(".xlsx"):
        df = pd.read_excel(file)
    else:
        df = pd.read_csv(file)

    df['city'] = city
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)


In [2]:
df.columns


Index(['datetime', 'main.aqi', 'components.co', 'components.no',
       'components.no2', 'components.o3', 'components.so2', 'components.pm2_5',
       'components.pm10', 'components.nh3', 'temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'precipitation',
       'surface_pressure', 'wind_speed_10m', 'wind_direction_10m',
       'shortwave_radiation', 'city'],
      dtype='object')

In [3]:
df = df.rename(columns={
    'components.pm2_5': 'pm2_5',
    'components.pm10': 'pm10',
    'components.no2': 'no2',
    'components.o3': 'o3',
    'components.so2': 'so2',
    'components.co': 'co',
    'components.nh3': 'nh3',
    'temperature_2m': 'temperature',
    'relative_humidity_2m': 'humidity',
    'wind_speed_10m': 'wind_speed'
})


In [4]:
# Convert datetime column, extract date, and calculate daily average values for each city.
import pandas as pd

df['datetime'] = pd.to_datetime(df['datetime'])
df['date'] = df['datetime'].dt.date

df_daily = (
    df
    .groupby(['city', 'date'])
    .mean(numeric_only=True)
    .reset_index()
)

df_daily['date'] = pd.to_datetime(df_daily['date'])


In [5]:
# Sort properly
df_daily = df_daily.sort_values(['city', 'date'])

# Remove invalid values
df_daily['pm2_5'] = df_daily['pm2_5'].clip(lower=0)

# Handle missing values
df_daily.fillna(method='ffill', inplace=True)
df_daily.fillna(method='bfill', inplace=True)


  df_daily.fillna(method='ffill', inplace=True)
  df_daily.fillna(method='bfill', inplace=True)


In [6]:
# Create lag features for PM2.5 to use past 1, 3, and 7 days as input for prediction.
df_daily['pm2_5_lag1'] = df_daily.groupby('city')['pm2_5'].shift(1)
df_daily['pm2_5_lag3'] = df_daily.groupby('city')['pm2_5'].shift(3)
df_daily['pm2_5_lag7'] = df_daily.groupby('city')['pm2_5'].shift(7)


In [7]:
# Create 3-day and 7-day rolling average of PM2.5 for each city to capture trends.
df_daily['pm2_5_roll3'] = df_daily.groupby('city')['pm2_5'].rolling(3).mean().reset_index(0,drop=True)
df_daily['pm2_5_roll7'] = df_daily.groupby('city')['pm2_5'].rolling(7).mean().reset_index(0,drop=True)


In [8]:
df_daily.dropna(inplace=True)


In [9]:
def aqi_category(pm):
    if pm <= 50:
        return "Good"
    elif pm <= 100:
        return "Moderate"
    elif pm <= 150:
        return "Unhealthy for Sensitive"
    elif pm <= 200:
        return "Unhealthy"
    else:
        return "Very Unhealthy"

df_daily['AQI_Category'] = df_daily['pm2_5'].apply(aqi_category)


In [10]:
# Convert city names and AQI categories into numbers so the model can understand them.
from sklearn.preprocessing import LabelEncoder

le_city = LabelEncoder()
df_daily['city_encoded'] = le_city.fit_transform(df_daily['city'])

le_target = LabelEncoder()
df_daily['target'] = le_target.fit_transform(df_daily['AQI_Category'])


In [11]:
features = [
    'pm2_5_lag1', 'pm2_5_lag3', 'pm2_5_lag7',
    'pm2_5_roll3', 'pm2_5_roll7',
    'temperature', 'humidity', 'wind_speed',
    'pm10', 'no2', 'o3', 'so2', 'co',
    'city_encoded'
]

X = df_daily[features]
y = df_daily['target']


In [12]:
# Split the data into training (before 2024) and testing (2024 onwards) sets for model training and evaluation.
train = df_daily[df_daily['date'] < '2024-01-01']
test  = df_daily[df_daily['date'] >= '2024-01-01']

X_train, y_train = train[features], train['target']
X_test, y_test = test[features], test['target']


In [13]:
# Set up and train XGBoost model with 300 trees, learning rate 0.05, max depth 6, and subsampling for better accuracy.
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import classification_report

pred = model.predict(X_test)
print(classification_report(y_test, pred, target_names=le_target.classes_))


                         precision    recall  f1-score   support

                   Good       0.99      0.95      0.97       865
               Moderate       0.85      0.93      0.89       358
              Unhealthy       0.86      0.93      0.89        80
Unhealthy for Sensitive       0.89      0.86      0.88       181
         Very Unhealthy       0.98      0.98      0.98       194

               accuracy                           0.94      1678
              macro avg       0.91      0.93      0.92      1678
           weighted avg       0.94      0.94      0.94      1678



In [16]:
import joblib

joblib.dump(model, "aqi_model.pkl")
joblib.dump(le_city, "city_encoder.pkl")
joblib.dump(le_target, "aqi_encoder.pkl")


['aqi_encoder.pkl']