In [None]:

# !pip install pandas numpy matplotlib seaborn scikit-learn joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [None]:

# Read the raw measurements; the file is ';'-delimited rather than comma-delimited.
csv_path = 'PB_All_2000_2021.csv'
df = pd.read_csv(csv_path, sep=';')

print("✅ Dataset loaded successfully.")
print("Shape:", df.shape)


In [None]:

# Parse the day.month.year strings once, derive the calendar columns from the
# parsed values, then order the rows by station id and date.
parsed_dates = pd.to_datetime(df['date'], format='%d.%m.%Y')
df = (
    df.assign(
        date=parsed_dates,
        year=parsed_dates.dt.year,
        month=parsed_dates.dt.month,
    )
    .sort_values(by=['id', 'date'])
)


In [None]:

# Replace missing values in each numeric column with that column's median.
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train/test split performed later in the notebook.
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)
print("✅ Missing values handled via median imputation.")


In [None]:

# Target columns the model predicts jointly.
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
# Input columns: station id, year, and three measured quantities.
features = ['id', 'year', 'NH4', 'BSK5', 'Suspended']

# Design matrix and multi-column target taken from the cleaned frame.
X, y = df[features], df[pollutants]


In [None]:

# One-hot encode the station id. drop_first=True omits the first level, so that
# station is represented by all "id_*" columns being zero.
X_encoded = pd.get_dummies(
    X,
    columns=['id'],
    drop_first=True,
)


In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = len(X_train), len(X_test)
print("✅ Data split complete.")
print(f"Training samples: {n_train}, Testing samples: {n_test}")


In [None]:

# MultiOutputRegressor fits one copy of the base estimator per target column.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_forest)
model.fit(X_train, y_train)
print("✅ Model training complete!")


In [None]:

# Predict every target for the held-out rows; column order follows `pollutants`.
y_pred = model.predict(X_test)

print("\n🔍 Model Performance on Test Set:")
for col_idx, pollutant in enumerate(pollutants):
    # Pair the true and predicted values of the same target column.
    actual = y_test.iloc[:, col_idx]
    predicted = y_pred[:, col_idx]
    mse = mean_squared_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    print(f"{pollutant}:")
    print(f"   MSE: {mse:.2f}")
    print(f"   R² : {r2:.4f}")
    print()


In [None]:

# One actual-vs-predicted scatter plot with a fitted regression line per pollutant.
for col_idx, pollutant in enumerate(pollutants):
    actual = y_test.iloc[:, col_idx]
    predicted = y_pred[:, col_idx]
    score = r2_score(actual, predicted)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.regplot(
        x=actual,
        y=predicted,
        ax=ax,
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
    )
    ax.set_xlabel(f"Actual {pollutant}")
    ax.set_ylabel(f"Predicted {pollutant}")
    ax.set_title(f"{pollutant} Prediction (R² = {score:.2f})")
    ax.grid(True)
    fig.tight_layout()
    plt.show()


In [None]:

# Example prediction for a single station and year, using fixed NH4, BSK5 and
# Suspended readings and the model trained above.
station_id = 22
# NOTE(review): the data file name suggests coverage of 2000-2021, so 2024 would
# lie outside the years seen in training - confirm this is intended.
year_input = 2024

# An unseen station would be encoded as all zeros, which is exactly the encoding
# of the station dropped by drop_first=True, so the model would silently return
# that other station's prediction. Fail loudly instead. Labels are compared as
# strings because get_dummies builds its "id_<value>" column names from the
# string form of each value.
known_station_labels = {str(value) for value in df['id'].unique()}
if str(station_id) not in known_station_labels:
    raise ValueError(
        f"Station {station_id} was not present in the training data; "
        "cannot build a valid encoding for prediction."
    )

input_data = pd.DataFrame({
    'year': [year_input],
    'id': [station_id],
    'NH4': [0.5],
    'BSK5': [3.0],
    'Suspended': [10]
})

input_encoded = pd.get_dummies(input_data, columns=['id'])

# Align to the training layout in one step: dummy columns absent from this
# single row are added as 0, columns the model never saw (the dropped baseline
# level) are discarded, and the column order matches X_encoded.
input_encoded = input_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# predict returns one row per input row; take the only row.
predicted_pollutants = model.predict(input_encoded)[0]

print(f"\n📌 Predicted pollutant levels for Station {station_id} in {year_input}:")
for p, val in zip(pollutants, predicted_pollutants):
    print(f"  {p}: {val:.2f}")


In [None]:
import joblib

# Persist the fitted model together with the exact training column layout, so
# inputs can be aligned to the same columns when the model is loaded elsewhere.
artifacts = {
    'pollution_model.pkl': model,
    'model_columns.pkl': X_encoded.columns.tolist(),
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print('✅ Model and model_columns.pkl saved successfully!')

