Importing the necessary libraries


In [1]:
import chardet
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

Loading the dataset

In [4]:
# Location of the raw CSV, defined once so the encoding probe and the actual
# load always read the same file.
# NOTE(review): this is a machine-specific absolute path — point it at your
# own copy of the dataset before running.
DATA_PATH = "/home/hgidea/Desktop/Coding/Python/hackthon/intel/archive/global air pollution dataset.csv"

# Let chardet guess the text encoding from the file's raw bytes.
with open(DATA_PATH, "rb") as f:
    encoding = chardet.detect(f.read())["encoding"]

# Parse the CSV using the detected encoding.
df = pd.read_csv(DATA_PATH, encoding=encoding)

Previewing the first rows of the dataset

In [None]:
# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)

Checking null values and dropping them

In [None]:

# Report missing values before discarding them so the drop is not silent.
null_counts = df.isna().sum()
print(null_counts[null_counts > 0])
print(f"Rows with at least one null: {df.isna().any(axis=1).sum()} of {len(df)}")

# Keep only the rows that have a value in every column.
df = df.dropna()

Plotting the distribution of AQI values within each AQI category

In [None]:
# Box plot of the overall AQI value within each AQI category; showmeans
# overlays each group's mean on its box.
ax = sns.boxplot(data=df, x="AQI Category", y="AQI Value", showmeans=True)
ax.set_xlabel("AQI Category")
ax.set_ylabel("Air Quality Index (AQI)")
ax.set_title("Distribution of AQI Across Categories")
# Slant the category labels and right-align them under their ticks.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()


 Encode categorical variables


In [None]:

# Columns whose category labels are converted to integer codes.
categorical_cols = [
    'AQI Category',
    'CO AQI Category',
    'Ozone AQI Category',
    'NO2 AQI Category',
    'PM2.5 AQI Category',
]

# One encoder per column, stored in `label_encoders` keyed by column name.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    # Overwrite the column in `df` with its fitted integer codes.
    df[col] = encoder.fit_transform(df[col])

In [None]:

# Names of every numeric-dtype column in `df`, as a plain list.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)


Plotting Correlation heatmap


In [None]:

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()



Split data into features and target


In [None]:

# Candidate predictor columns: the per-pollutant AQI values followed by the
# category columns.
# NOTE(review): 'AQI Category' looks like a label derived from the target
# 'AQI Value' — confirm it should be available to the model as a predictor.
pollutant_value_cols = ['PM2.5 AQI Value', 'CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value']
category_code_cols = ['AQI Category', 'CO AQI Category', 'Ozone AQI Category', 'NO2 AQI Category', 'PM2.5 AQI Category']
features_to_use = pollutant_value_cols + category_code_cols

# Feature matrix and regression target.
X = df.loc[:, features_to_use]
y = df.loc[:, 'AQI Value']

Create a pipeline for preprocessing


In [None]:

# Columns that get standardised by the numeric branch of the preprocessor.
numeric_features = ['PM2.5 AQI Value', 'Ozone AQI Value', 'CO AQI Value', 'NO2 AQI Value']

# Columns forwarded to the model unchanged.
# 'AQI Category' is deliberately NOT listed: it labels the band of the overall
# AQI and therefore appears to be derived directly from the target
# 'AQI Value' (NOTE(review): confirm against the dataset description). Using
# it as a predictor would leak the target into the features. Columns that are
# not listed in either group are discarded by the ColumnTransformer, whose
# `remainder` defaults to 'drop'.
categorical_features = ['CO AQI Category', 'Ozone AQI Category', 'NO2 AQI Category', 'PM2.5 AQI Category']


In [None]:

# Numeric branch: a one-step pipeline that standardises its columns.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Route the numeric columns through the scaler and pass the category columns
# through untouched.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', 'passthrough', categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [None]:

# Fit the preprocessor on X, then apply it to that same data.
X_processed = preprocessor.fit(X).transform(X)

Splitting into train and test

In [None]:

# Split the raw feature frame first, then fit the preprocessor on the
# training rows only, so the scaler's statistics never include test rows.
# The shuffle depends only on the number of rows and random_state, so each
# partition receives the same rows as a split of the pre-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling from the training rows, then reuse it for the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)



Training Model

In [None]:

# Baseline: a 100-tree random forest with a fixed seed.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(
    f'Mean Squared Error: {mse}',
    f'R-squared: {r2_score(y_test, y_pred)}',
    sep='\n',
)

In [None]:
# Hyperparameter search space for the RandomForestRegressor grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=['sqrt', 'log2', None],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# negative MSE and run on all available cores; any failing fit raises
# instead of being scored.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    error_score='raise',
)
grid_search.fit(X_train, y_train)


In [None]:
# Pull the winning hyperparameters and the corresponding estimator out of the search.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Predict on the held-out split with the tuned model and score the result.
y_pred = best_model.predict(X_test)
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

In [None]:

# Report the tuned hyperparameters and the tuned model's test metrics, one per line.
print(
    f'Best parameters: {best_params}',
    f'Best Model Mean Squared Error: {mse}',
    f'R-squared: {r2}',
    sep='\n',
)

In [None]:
# SHAP values for the tuned model on the test split.
# Feature names are listed numeric-first, then categorical — the same order
# in which the preprocessor's transformers are declared.
shap_feature_names = numeric_features + categorical_features
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=shap_feature_names)


In [None]:

# Residual analysis: actual minus predicted for every test row.
residuals = y_test.sub(y_pred)

In [None]:
# Residuals against predictions, with a dashed zero line spanning the
# predicted range as the reference for a perfect fit.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.hlines(0, xmin=y_pred.min(), xmax=y_pred.max(), colors='r', linestyles='dashed')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Histogram of the residuals with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Distribution of Residuals')
plt.show()


In [None]:
# Persist the tuned estimator to disk.
# NOTE(review): only `best_model` is written; the fitted `preprocessor` is not
# saved here — confirm that whatever loads this file can reproduce the scaling.
model_path = '/home/hgidea/Desktop/Coding/Python/hackthon/intel/best_random_forest_model (3).pkl'
joblib.dump(best_model, model_path)


In [None]:
# Pairwise scatter plots of the overall AQI and the four pollutant AQI
# values, with KDE curves on the diagonal.
pairplot_cols = ['AQI Value', 'CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value']
sns.pairplot(df.loc[:, pairplot_cols], diag_kind="kde")
plt.tight_layout()
plt.show()

In [None]:
# By this point df["AQI Category"] holds the integer codes written by the
# label-encoding step, so plotting the column directly would label the bars
# with opaque numbers. Map the codes back to their category names with the
# stored encoder before counting.
aqi_category_encoder = label_encoders["AQI Category"]
aqi_category_names = aqi_category_encoder.inverse_transform(df["AQI Category"])

plt.figure(figsize=(10, 6))
sns.countplot(
    x=aqi_category_names,
    # Keep the bars in encoder-code order, i.e. the order they had when the
    # integer codes were plotted.
    order=list(aqi_category_encoder.classes_),
    palette="Set2"
)
plt.xticks(rotation=45)
plt.xlabel("AQI Category")
plt.ylabel("Count")
plt.title("Distribution of AQI Categories")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()