In [None]:

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import seaborn as sns


In [None]:

# Build a reproducible synthetic air-pollution dataset.
# The global NumPy seed is fixed so every run yields the same samples;
# the draw order below (four uniform features, then Gaussian noise) determines
# the generated values and must not be rearranged.
np.random.seed(42)
data_size = 500

# Predictors, each sampled uniformly over a fixed range
temperature = np.random.uniform(low=15, high=35, size=data_size)
humidity = np.random.uniform(low=30, high=90, size=data_size)
wind_speed = np.random.uniform(low=0, high=20, size=data_size)
industrial_emissions = np.random.uniform(low=50, high=200, size=data_size)

# Zero-mean Gaussian noise with standard deviation 5
noise = np.random.normal(loc=0, scale=5, size=data_size)

# Response: Air Quality Index (AQI) as a linear combination of the predictors plus noise
aqi = (
    50
    + 0.4 * temperature
    - 0.3 * humidity
    + 0.5 * wind_speed
    + 0.7 * industrial_emissions
    + noise
)

# Assemble one table: four feature columns followed by the target column
columns = {
    "Temperature": temperature,
    "Humidity": humidity,
    "Wind Speed": wind_speed,
    "Industrial Emissions": industrial_emissions,
    "AQI": aqi,
}
data = pd.DataFrame(columns)

data.head()


In [None]:

# Check for missing values and normalize data
# A notebook cell renders only its final expression, so these diagnostics are
# printed explicitly; as bare mid-cell expressions they were computed and discarded.
print(data.isnull().sum())  # number of missing values per column
print(data.describe())      # summary statistics before scaling

# Normalize features
features = ["Temperature", "Humidity", "Wind Speed", "Industrial Emissions"]
target = "AQI"

# Z-score each feature column: subtract the column mean and divide by the column
# standard deviation (pandas' std() defaults to the sample estimate, ddof=1).
# The target column is left on its original scale.
# NOTE(review): the mean/std are taken over every row of `data`, i.e. before the
# train/test split performed later in the notebook, so held-out rows contribute to
# the scaling statistics. Confirm this is acceptable for the intended evaluation.
data[features] = (data[features] - data[features].mean()) / data[features].std()

data.head()


In [None]:

# Separate the response from the predictors, then hold out 20% of the rows for testing.
# The fixed random_state makes the partition reproducible across runs.
y = data[target]
X = data[features]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the (rows, columns) of each partition as the cell output
(X_train.shape, X_test.shape)


In [None]:

# Fit a 100-tree random forest regressor on the training partition.
# fit() returns the estimator itself, so construction and training are chained;
# the fixed random_state keeps the fitted forest reproducible.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Generate predictions for the held-out test features
y_pred = model.predict(X_test)


In [None]:

# Score the test-set predictions with two error metrics.
# MAE: mean of the absolute differences between actual and predicted values.
mae = mean_absolute_error(y_test, y_pred)

# RMSE: square root of the mean squared error.
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)

# Report both metrics rounded to two decimal places
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


In [None]:

# Scatter the predicted AQI against the actual AQI for the test set.
# Uses the explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)

# Dashed red y = x reference line spanning the range of the actual values;
# points on this line are predictions that match the actual value exactly.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r', linestyle='--', label='Perfect Prediction')

ax.set_xlabel("Actual AQI")
ax.set_ylabel("Predicted AQI")
ax.set_title("Actual vs Predicted AQI")
ax.legend()
plt.show()
