# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

class WeatherPollutionPredictor:
    """Predict monthly temperature and PM2.5 from calendar features.

    Two independent random-forest regressors share one feature layout:
    twelve one-hot month indicators (``month_1`` .. ``month_12``) followed by
    a ``Year`` column. Each model has its own ``StandardScaler`` that is
    fitted on that model's training split in :meth:`train`.
    """

    # Column names of the wide-format weather table -> calendar month number.
    _MONTH_MAP = {
        'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
        'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12
    }

    def __init__(self, n_estimators=100, random_state=42):
        """Create the unfitted models and scalers.

        Args:
            n_estimators: Number of trees in each random forest.
            random_state: Seed used for both forests and for the train/test
                split in :meth:`train`, so repeated runs give the same
                models and the same held-out sets.
        """
        self.random_state = random_state
        self.weather_model = RandomForestRegressor(
            n_estimators=n_estimators, random_state=random_state
        )
        self.pollution_model = RandomForestRegressor(
            n_estimators=n_estimators, random_state=random_state
        )
        self.weather_scaler = StandardScaler()
        self.pollution_scaler = StandardScaler()

    @staticmethod
    def preprocess_weather_data(weather_df: pd.DataFrame) -> pd.DataFrame:
        """Reshape wide weather data (one column per month) to long format.

        Args:
            weather_df: Frame with a ``YEAR`` column and one column for each
                of ``JAN`` .. ``DEC``.

        Returns:
            A new frame with columns ``YEAR``, ``Month`` (1-12) and
            ``Temperature``, one row per year/month pair. Missing values in
            the input are kept as-is.
        """
        month_map = WeatherPollutionPredictor._MONTH_MAP
        weather_monthly = weather_df.melt(
            id_vars=['YEAR'],
            value_vars=list(month_map),
            var_name='Month',
            value_name='Temperature'
        )
        weather_monthly['Month'] = weather_monthly['Month'].map(month_map)
        return weather_monthly

    @staticmethod
    def preprocess_pollution_data(pollution_df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate pollution readings to monthly PM2.5 averages.

        The input frame is not modified. When a ``Timestamp`` column is
        present it is parsed, and any missing ``Year`` / ``Month`` column is
        derived from it; existing ``Year`` / ``Month`` columns are used
        unchanged.

        Args:
            pollution_df: Frame with a ``PM2.5`` column and either a
                ``Timestamp`` column or ``Year`` and ``Month`` columns.

        Returns:
            A frame with columns ``Year``, ``Month`` and ``PM2.5`` (the mean
            per year/month group).

        Raises:
            KeyError: If ``Year`` / ``Month`` can be neither found nor
                derived, or ``PM2.5`` is missing.
        """
        # Work on a copy so the caller's frame is never mutated.
        readings = pollution_df.copy()
        if 'Timestamp' in readings.columns:
            readings['Timestamp'] = pd.to_datetime(readings['Timestamp'])
            if 'Year' not in readings.columns:
                readings['Year'] = readings['Timestamp'].dt.year
            if 'Month' not in readings.columns:
                readings['Month'] = readings['Timestamp'].dt.month
        return readings.groupby(['Year', 'Month'])['PM2.5'].mean().reset_index()

    @staticmethod
    def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
        """Build the model feature matrix from a long-format frame.

        Args:
            df: Frame with a ``Month`` column and a ``Year`` (or ``YEAR``)
                column.

        Returns:
            A frame indexed like ``df`` with integer indicator columns
            ``month_1`` .. ``month_12`` followed by ``Year``.

        Raises:
            KeyError: If ``Month`` or both year columns are missing.
        """
        features = pd.DataFrame({
            f'month_{month}': (df['Month'] == month).astype(int)
            for month in range(1, 13)
        })
        # Weather data names the column 'YEAR'; both models expect 'Year'.
        features['Year'] = df['Year'] if 'Year' in df.columns else df['YEAR']
        return features

    def train(self, weather_df, pollution_df):
        """Fit both models and return the held-out test sets.

        Args:
            weather_df: Wide-format weather frame accepted by
                :meth:`preprocess_weather_data`.
            pollution_df: Pollution frame accepted by
                :meth:`preprocess_pollution_data`.

        Returns:
            ``((X_test_w, y_test_w), (X_test_p, y_test_p))`` - the unscaled
            20% test splits for the weather and pollution models, suitable
            for passing to :meth:`evaluate`.
        """
        # Rows without a target cannot be used for fitting or scoring
        # (the regressor rejects NaN in y), so drop them up front.
        weather_monthly = self.preprocess_weather_data(weather_df).dropna(
            subset=['Temperature']
        )
        pollution_monthly = self.preprocess_pollution_data(pollution_df).dropna(
            subset=['PM2.5']
        )

        weather_features = self.prepare_features(weather_monthly)
        pollution_features = self.prepare_features(pollution_monthly)

        # Seed the split so the returned test sets (and therefore the
        # metrics) are reproducible, matching the seeded forests.
        X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
            weather_features, weather_monthly['Temperature'],
            test_size=0.2, random_state=self.random_state
        )
        X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
            pollution_features, pollution_monthly['PM2.5'],
            test_size=0.2, random_state=self.random_state
        )

        # Scalers are fitted on the training split only to avoid leaking
        # test-set statistics into the model inputs.
        X_train_w_scaled = self.weather_scaler.fit_transform(X_train_w)
        self.weather_model.fit(X_train_w_scaled, y_train_w)

        X_train_p_scaled = self.pollution_scaler.fit_transform(X_train_p)
        self.pollution_model.fit(X_train_p_scaled, y_train_p)

        return (X_test_w, y_test_w), (X_test_p, y_test_p)

    def predict(self, year, month):
        """Predict temperature and PM2.5 for one year/month.

        Args:
            year: Calendar year.
            month: Calendar month number, 1-12.

        Returns:
            ``(temp_pred, pm25_pred)``.

        Raises:
            ValueError: If ``month`` is not in 1..12. Without this check an
                invalid month would yield an all-zero indicator row and a
                meaningless prediction.
        """
        if month not in range(1, 13):
            raise ValueError(
                f"month must be an integer from 1 to 12, got {month!r}"
            )

        # Reuse prepare_features so the column layout always matches the
        # one the scalers and models were fitted on.
        new_data = self.prepare_features(
            pd.DataFrame({'Month': [month], 'Year': [year]})
        )

        temp_pred = self.weather_model.predict(
            self.weather_scaler.transform(new_data)
        )[0]
        pm25_pred = self.pollution_model.predict(
            self.pollution_scaler.transform(new_data)
        )[0]

        return temp_pred, pm25_pred

    @staticmethod
    def _regression_metrics(y_true, y_pred):
        """Return ``{'R2': ..., 'RMSE': ...}`` for one set of predictions."""
        return {
            'R2': r2_score(y_true, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_true, y_pred))
        }

    def evaluate(self, X_test_w, y_test_w, X_test_p, y_test_p):
        """Score both fitted models on held-out data.

        Args:
            X_test_w: Unscaled weather features, as returned by :meth:`train`.
            y_test_w: True temperatures for ``X_test_w``.
            X_test_p: Unscaled pollution features, as returned by
                :meth:`train`.
            y_test_p: True PM2.5 values for ``X_test_p``.

        Returns:
            ``(weather_metrics, pollution_metrics)``, each a dict with keys
            ``'R2'`` and ``'RMSE'``.
        """
        weather_preds = self.weather_model.predict(
            self.weather_scaler.transform(X_test_w)
        )
        pollution_preds = self.pollution_model.predict(
            self.pollution_scaler.transform(X_test_p)
        )

        weather_metrics = self._regression_metrics(y_test_w, weather_preds)
        pollution_metrics = self._regression_metrics(y_test_p, pollution_preds)

        return weather_metrics, pollution_metrics