<a href="https://colab.research.google.com/github/KirZa-alt/Air-Quality-Level-Prediction-ML-/blob/main/Hackathon_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip "archive (2).zip" "*.csv" -d data



In [None]:
# Load the training dataset that the unzip step placed under ./data
import pandas as pd

DATA_PATH = "data/Training/concatenated_dataset_Aug_2021_to_July_2024.csv"
data = pd.read_csv(DATA_PATH)

# Preview the first rows
data.head()

In [None]:
# Data information: shape, summary statistics, dtypes, missing values, duplicates

print("rows/col of data:\n", data.shape)
print("\n Description of data:", data.describe())

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called on its own line instead of being passed to print().
print("\n info of data:")
data.info()

# Checking null values in data (count per column)
print("Checking null values:\n", data.isnull().sum())

# Checking duplicates: .sum() must be *called* to get the number of duplicated
# rows; without the parentheses the bound method object is printed instead.
print("Checking duplicates:\n", data.duplicated().sum())

In [None]:
import pandas as pd

# Parse the 'datetime' column; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')

# Report how many timestamps failed to parse. A bare expression in the middle
# of a cell is not displayed, so the count is printed explicitly.
print("Rows with unparseable datetime:", data['datetime'].isna().sum())

# Sort chronologically. A stable sort keeps the original relative order of rows
# that share a timestamp, so the row-based lag features computed later are
# reproducible between runs.
# NOTE(review): only 'datetime' is used as the sort key; if the dataset has a
# city/location column it probably belongs in the key as well - confirm.
data = data.sort_values(['datetime'], kind='stable').reset_index(drop=True)

# Verify: column dtypes ('datetime' should now be datetime64)
data.dtypes

In [None]:
# Feature engineering and preparation for ML

# Calendar features derived from the parsed timestamp
data['date'] = data['datetime'].dt.date
data['hour'] = data['datetime'].dt.hour
data['weekday'] = data['datetime'].dt.weekday
data['month'] = data['datetime'].dt.month

# Lag features: values from 1 and 3 rows earlier capture past pollution levels.
# NOTE(review): shifts are by row over the whole frame; if the data mixes
# several locations, the lags should be computed per location - confirm.
data['PM2.5_lag1'] = data['components_pm2_5'].shift(1)
data['PM2.5_lag3'] = data['components_pm2_5'].shift(3)
data['AQI_lag1']   = data['main_aqi'].shift(1)

# Rolling means over the three rows *before* the current one.
# shift(1) is applied first so the window excludes the current row: the
# AQI_Category label is derived from the current main_aqi, so a window that
# contains it would leak the target into the features.
data['PM2.5_roll3'] = data['components_pm2_5'].shift(1).rolling(3).mean()
data['AQI_roll3']   = data['main_aqi'].shift(1).rolling(3).mean()

# The first rows have no complete lag/rolling history; drop every row that
# contains a NaN (in any column) and renumber the index.
data = data.dropna().reset_index(drop=True)


In [None]:
# Create AQI category

def aqi_category(aqi):
    """Map a numeric AQI value to its category label.

    The value is compared against ascending inclusive upper bounds
    (50, 100, 150, 200); the first bound it does not exceed decides the label.
    Anything that passes none of the comparisons is "Very Unhealthy".

    NOTE(review): these thresholds presume a 0-500 style index; confirm that
    `main_aqi` in the dataset is actually on that scale.
    """
    upper_bounds = (
        (50, "Good"),
        (100, "Moderate"),
        (150, "Unhealthy (SG)"),
        (200, "Unhealthy"),
    )
    for limit, label in upper_bounds:
        if aqi <= limit:
            return label
    return "Very Unhealthy"

# Label every row with the category of its AQI value
data['AQI_Category'] = data['main_aqi'].map(aqi_category)


In [None]:
# Chronological hold-out split: rows before 2024-01-01 train the model,
# rows from 2024-01-01 onwards are kept for evaluation.
train = data.loc[data['datetime'] < '2024-01-01']
test = data.loc[data['datetime'] >= '2024-01-01']

# Model inputs: current PM2.5, its lag/rolling history, AQI history, calendar fields
features = [
    'components_pm2_5',
    'PM2.5_lag1',
    'PM2.5_lag3',
    'PM2.5_roll3',
    'AQI_lag1',
    'AQI_roll3',
    'hour',
    'weekday',
    'month',
]

# Feature matrices and category targets for both partitions
X_train, y_train = train[features], train['AQI_Category']
X_test, y_test = test[features], test['AQI_Category']

In [None]:
# Random forest baseline: fit on the training period, evaluate on the hold-out period

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the reported scores do not change from one run to the next.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predicted category labels for the hold-out rows
pred = model.predict(X_test)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
# XGBoost model

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# The model is trained on integer class ids, so the string categories are
# encoded. The encoder is fitted on the training labels only.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
# NOTE: transform() raises if the test period contains a category that never
# occurs in the training period.
y_test_encoded = le.transform(y_test)

model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    eval_metric='mlogloss'
)

model.fit(X_train, y_train_encoded)

# Predictions are encoded class ids (same encoding as y_test_encoded)
pred = model.predict(X_test)

print(accuracy_score(y_test_encoded, pred))

# Report per-class metrics under the original category names instead of the
# opaque integer codes. `labels` pins the row order to the encoder's classes,
# and zero_division=0 keeps classes absent from the test period from warning.
print(classification_report(
    y_test_encoded,
    pred,
    labels=list(range(len(le.classes_))),
    target_names=[str(name) for name in le.classes_],
    zero_division=0,
))

In [None]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, LSTM
# from tensorflow.keras.utils import to_categorical
# import numpy as np

# # Reshape: add time dimension (1 timestep for now)
# X_train_lstm = np.expand_dims(X_train.values, axis=1)  # shape: (samples, 1, features)
# X_test_lstm = np.expand_dims(X_test.values, axis=1)


# from tensorflow.keras.utils import to_categorical

# # Encode AQI categories
# le = LabelEncoder()
# y_train_encoded = le.fit_transform(y_train)
# y_test_encoded = le.transform(y_test)

# # Convert to categorical
# y_train_cat = to_categorical(y_train_encoded)
# y_test_cat = to_categorical(y_test_encoded)


# model = Sequential()
# model.add(LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
# model.add(Dense(y_train_cat.shape[1], activation='softmax'))  # number of classes

# model.compile(
#     optimizer='adam',
#     loss='categorical_crossentropy',
#     metrics=['accuracy']
# )

# history = model.fit(
#     X_train_lstm,
#     y_train_cat,
#     epochs=50,
#     batch_size=32,
#     validation_split=0.2
# )

# test_loss, test_acc = model.evaluate(X_test_lstm, y_test_cat)
# print("Test Accuracy:", test_acc)




In [None]:

# Iterative forecast seeded from the most recent observation.
# Relies on `model` and `le` being the fitted classifier and its LabelEncoder.
last_row = data[features].iloc[-1:].copy()

# Number of steps to forecast and the collected results
forecast_days = 3
forecast_results = []

# Working copy of the feature row that is rolled forward step by step
current_row = last_row.copy()

for i in range(forecast_days):
    # Predict the encoded class for the current feature row and decode it
    pred_encoded = model.predict(current_row)[0]
    pred_label = le.inverse_transform([pred_encoded])[0]

    forecast_results.append({
        'day': i+1,
        'predicted_AQI_Category': pred_label
    })

    # Roll the PM2.5 history forward for the next step.
    # NOTE(review): components_pm2_5, PM2.5_lag3, AQI_lag1, AQI_roll3 and the
    # calendar features (hour/weekday/month) are carried over unchanged; the
    # model only yields a category, so no numeric AQI is available to feed back.
    current_row['PM2.5_lag1'] = current_row['components_pm2_5']
    current_row['PM2.5_roll3'] = (current_row['PM2.5_roll3']*2 + current_row['components_pm2_5'])/3

# Build the result table once, after the loop. Passing the column names keeps
# the frame well-formed (and defined) even when forecast_days is 0.
forecast_df = pd.DataFrame(forecast_results, columns=['day', 'predicted_AQI_Category'])
forecast_df


In [None]:
import matplotlib.pyplot as plt

# Persist the forecast table (without the index column)
forecast_df.to_csv("aqi_forecast_next3days.csv", index=False)

# Line plot of the predicted category per forecast day
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(forecast_df['day'], forecast_df['predicted_AQI_Category'], marker='o')
ax.set_title("Next 3 Days AQI Forecast")
ax.set_xlabel("Day")
ax.set_ylabel("AQI Category")
ax.grid(True)
plt.show()


In [None]:
import joblib

# Target files for the two artifacts needed at inference time
model_path = "aqi_model.pkl"
encoder_path = "aqi_label_encoder.pkl"

# Serialize the fitted classifier
joblib.dump(model, model_path)

# Serialize the LabelEncoder that maps class ids back to category names
joblib.dump(le, encoder_path)


In [None]:
from google.colab import files

# Send the saved label encoder from the Colab runtime to the local machine
encoder_file = "aqi_label_encoder.pkl"
files.download(encoder_file)


In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Rebuild the encoder from the labels the classifier was actually trained on.
# LabelEncoder assigns ids by sorted label order, so fitting it on any other
# label list (e.g. one containing "Hazardous" but not "Unhealthy (SG)") would
# change the id -> name mapping and make decoded predictions wrong.
labels = sorted(y_train.unique())  # AQI categories present in the training data
le = LabelEncoder()
le.fit(labels)

# Save it
joblib.dump(le, "aqi_label_encoder.pkl")


In [None]:
from google.colab import files

# Upload all required files
# files.upload() is interactive; whatever it returns is kept in `uploaded`.
uploaded = files.upload()




# Show the forecast table (forecast_df is created by the forecasting cell)
forecast_df


In [141]:
from google.colab import files
# NOTE(review): this repeats the previous upload cell, and its execution count
# (141) shows it was run interactively out of sequence - confirm it is still needed.
uploaded = files.upload()  # then select your CSV/model files


Saving aqi_forecast_next3days.csv to aqi_forecast_next3days (1).csv
