**Project Title:**
Air_Quality_Monitoring_and_Health_Risk_Assessment

**Problem Statement:**
Air pollution in India is rising at an alarming rate, with pollutants like PM2.5, PM10, NO2, and SO2 posing major risks to human health. There is a need for effective prediction of air quality and its related health impacts to support timely preventive action.

**Project Description:**
This project analyzes Indian air quality data and applies machine learning to predict AQI levels while classifying them into health risk categories such as Good, Moderate, Poor, and Severe, providing insights for public health and policy decisions.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
import joblib

# Read the daily city-level air quality records and preview the first rows.
DATA_PATH = 'city_day.csv'
df = pd.read_csv(DATA_PATH)
df.head()




In [None]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# List the column labels available in the loaded frame.
df.columns

In [None]:
# Univariate analysis: number of records in each AQI category.
bucket_ax = sns.countplot(data=df, x='AQI_Bucket')
bucket_ax.set_title('Distribution of AQI Categories')
plt.show()

In [None]:
# Histogram (with KDE overlay) for every pollutant column and for AQI.
num_cols = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
    'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI',
]
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(15, 15))
axes = axes.flatten()

for idx, column in enumerate(num_cols):
    sns.histplot(df[column], kde=True, ax=axes[idx])
    axes[idx].set_title(column)

# The 5x3 grid has more cells than columns to plot; remove the empty ones.
for spare_ax in axes[len(num_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
# Bivariate analysis: relate PM2.5, City and NO2 to the AQI category.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
pm25_ax, city_ax, no2_ax = axes

sns.boxplot(data=df, x="AQI_Bucket", y="PM2.5", ax=pm25_ax)
pm25_ax.set_title("PM2.5 vs AQI_Bucket")

sns.countplot(data=df, x="City", hue="AQI_Bucket", ax=city_ax)
city_ax.set_title("AQI_Bucket Distribution across Cities")
# Rotate the city labels so they stay legible on the shared x-axis.
city_ax.tick_params(axis='x', rotation=90)

sns.boxplot(data=df, x="AQI_Bucket", y="NO2", ax=no2_ax)
no2_ax.set_title("NO2 vs AQI_Bucket")

plt.tight_layout()
plt.show()

In [None]:
# Restrict the correlation analysis to float64/int64 columns.
num_cols_corr = df.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = df[num_cols_corr].corr()

# Annotated heatmap of the pairwise correlations.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Pick the most important columns for visualization
selected_features = ["PM2.5", "PM10", "NO2", "AQI"]

# Plot a random subset to keep the pairplot fast. The size is capped at the
# number of available rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
sample_size = min(2000, len(df))

# Pairplot with AQI_Bucket as hue
sns.pairplot(
    df.sample(sample_size, random_state=42),
    vars=selected_features,
    hue="AQI_Bucket",
    diag_kind="kde",
    palette="Set2",
    plot_kws={"alpha": 0.5, "s": 15}
)
plt.show()

# Keep the original city names for the saved category dictionary. This must
# run before the City column is label-encoded, otherwise only the integer
# codes would be captured.
city_names = df['City'].unique().tolist()

In [None]:
# Data preprocessing
# Use separate encoders so city codes and AQI category codes can each be
# decoded independently later on.
city_le = LabelEncoder()
aqi_le = LabelEncoder()

# Encode cities on the full frame so the encoder knows every city that
# appears in the raw data.
df['City'] = city_le.fit_transform(df['City'])

# Rows without an AQI_Bucket carry no label to learn from. Feeding NaN to
# LabelEncoder either fails or yields a spurious extra "missing" class
# (depending on the scikit-learn version), so drop those rows before
# encoding the target. This is a no-op when the column has no missing values.
df = df.dropna(subset=['AQI_Bucket']).copy()
df['AQI_Bucket'] = aqi_le.fit_transform(df['AQI_Bucket'])

In [None]:
# Feature and target selection: everything except the label and the
# timestamp column is used as a predictor.
# NOTE(review): 'AQI' stays in X; if AQI_Bucket is derived directly from AQI
# this leaks the target into the features -- confirm whether that is intended.
X = df.drop(columns=['AQI_Bucket', 'Datetime'])
y = df['AQI_Bucket']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, features) of the training split.
X_train.shape

In [None]:
# (rows, features) of the test split.
X_test.shape

In [None]:
# Scale numeric features
# Work on explicit copies so the column assignments below modify independent
# frames and cannot trigger pandas' SettingWithCopy warning on the split data.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
num_cols_scale = X_train.select_dtypes(include=['float64', 'int64']).columns

# StandardScaler passes NaN through unchanged and LogisticRegression rejects
# NaN input, so fill missing readings first. Medians come from the training
# split only, so no test-set information leaks into preprocessing. This is a
# no-op when the data has no missing values.
train_medians = X_train[num_cols_scale].median()
X_train[num_cols_scale] = X_train[num_cols_scale].fillna(train_medians)
X_test[num_cols_scale] = X_test[num_cols_scale].fillna(train_medians)

# Fit the scaler on the training split and reuse its statistics for the test split.
X_train[num_cols_scale] = scaler.fit_transform(X_train[num_cols_scale])
X_test[num_cols_scale] = scaler.transform(X_test[num_cols_scale])

In [None]:
# Display the training features after scaling.
X_train

In [None]:
# Random forest with 100 trees and a fixed seed for reproducibility.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit on the training split, then predict labels for the held-out rows.
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

In [None]:
# Evaluate the random forest on the held-out split.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print("Accuracy Score:", rf_accuracy)
print("\nClassification Report:\n", rf_report)


In [None]:
# Confusion matrix of the random forest predictions (counts per class pair).
rf_cm = confusion_matrix(y_test, y_pred)

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax)
cm_ax.set_title('Confusion Matrix - Random Forest')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Horizontal bar chart of the fitted forest's per-feature importances.
features = X.columns
importances = rf_classifier.feature_importances_

imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
imp_ax.barh(features, importances)
imp_ax.set_xlabel('Importance')
imp_ax.set_ylabel('Feature')
imp_ax.set_title('Feature Importance - Random Forest')
plt.show()

In [None]:
# Logistic regression trained on the same split as the random forest.
logistic_model = LogisticRegression(max_iter=500, random_state=42)
logistic_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.predict(X_test)

# Report accuracy and per-class metrics on the held-out split.
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)
logistic_report = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Accuracy:", logistic_accuracy)
print("\nClassification Report (Logistic Regression):\n", logistic_report)

# Confusion matrix of the logistic regression predictions.
logistic_cm = confusion_matrix(y_test, y_pred_logistic)
lr_fig, lr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(logistic_cm, annot=True, fmt="d", cmap="Blues", ax=lr_ax)
lr_ax.set_title('Confusion Matrix - Logistic Regression')
lr_ax.set_xlabel('Predicted')
lr_ax.set_ylabel('Actual')
plt.show()


In [None]:
# Persist the fitted models and the preprocessing objects needed to reuse them.
feature_names = X.columns.tolist()
cat_dict = {'City': city_names}  # original (un-encoded) city names

# File name -> object, in the order the files are written.
artifacts = {
    'rf_aqi_model.pkl': rf_classifier,
    'logistic_aqi_model.pkl': logistic_model,
    'scaler.pkl': scaler,
    'label_encoder.pkl': city_le,        # encoder for the City column
    'aqi_label_encoder.pkl': aqi_le,     # encoder for the AQI_Bucket target
    'feature_names.pkl': feature_names,
    'cat_dict.pkl': cat_dict,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)