In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Desktop/Data Science/Air quality/dataset/city_data_aqi.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Desktop/Data Science/Air quality/dataset/city_data_aqi.csv'

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
city_counts = df['City'].value_counts()
print(city_counts)

In [None]:
print(df['City'].unique())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.set_style("whitegrid")
plt.scatter(x = 'City', y ='AQI', data = df)
plt.xticks(rotation=90)
plt.title("AQI by City")
plt.ylabel("AQI")
plt.xlabel("City")
plt.show()

In [None]:
city_avg_aqi = df.groupby('City')['AQI'].mean().sort_values(ascending=False)

# Set plot style
sns.set_style("whitegrid")
plt.figure(figsize=(12, 8))

# Bar plot of average AQI per city
sns.barplot(x=city_avg_aqi.index, y=city_avg_aqi.values, palette="viridis")

plt.title("Average AQI by City", fontsize=16)
plt.xlabel("City", fontsize=12)
plt.ylabel("Average AQI", fontsize=14)
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
plt.tight_layout()
plt.show()

In [None]:
df = df.copy()

for col in df.columns:
    if df[col].isnull().any() and df[col].dtype != 'object':
        df[col] = df.groupby('City')[col].transform(lambda x : x.fillna(x.mean()))

##filling missing values by mean of same city

In [None]:
df.isnull().sum()

In [None]:
for col in ['PM10','NH3','Benzene','Toluene','Xylene']:
    df[col] = df[col].fillna(df[col].mean())
## for feature where city mean is zero fill them with total mean

In [None]:
df.info()

In [None]:
numeric_df = df.select_dtypes(include =['number'])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, annot = True, cmap = 'coolwarm', fmt = '.2f', linewidths = 0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
df = df.drop(columns = ['Xylene','Benzene','NO'])

In [None]:
df.info()

In [None]:
df.columns

In [None]:
X = df[['PM2.5', 'PM10', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3','Toluene']]
y = df['AQI']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

In [None]:
param_grids = {
    "Decision Tree": {
        "max_depth": [5, 10, 20, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5]
    },
    "Random Forest": {
        "n_estimators": [50, 100],
        "max_depth": [10, None],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 10],
        "subsample": [0.5, 0.7, 1.0]
    }
}
# Models dictionary
models = {
    "Decision Tree": DecisionTreeRegressor(),
    "XGBoost": XGBRegressor(),
    "Random Forest": RandomForestRegressor(),
}

best_models = {}

# Hyperparameter tuning
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    grid_search = RandomizedSearchCV(model, param_grids[model_name], cv=3, scoring='r2', n_jobs=-1, verbose=1, n_iter=10)
    grid_search.fit(X_train, y_train)
    
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}\n")

lr = LinearRegression()
lr.fit(X_train, y_train)
best_models["Linear Regression"] = lr

In [None]:
# Model Evaluation
mse_scores = []
r2_scores = []
model_names = []

for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)
    model_names.append(model_name)
    
    print(f"{model_name} (Tuned) - MSE: {mse:.4f}, R2: {r2:.4f}")

## i will be using xgbregressor beacuse its result is similar to random forest but random forest is computationaly very expensive


In [None]:
# Evaluate all models on test data
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"{name} → R²: {r2:.4f}, RMSE: {rmse:.2f}")

In [None]:
xgb_model = best_models["XGBoost"]

In [None]:
import pickle
with open("xgb_model.pkl", "wb") as file:
    pickle.dump(xgb_model, file)

In [None]:
import json

with open("model_columns.json", "w") as f:
    json.dump(list(X_train.columns), f)