# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    """
    Read the PM2.5 CSV and keep only fully populated rows.

    file_path is handed straight to pandas.read_csv. Every row that has a
    missing value in any column is discarded; the surviving rows keep their
    original index labels.
    """
    raw = pd.read_csv(file_path)

    # Default dropna(): row-wise, drop when any column is missing.
    complete_rows = raw.dropna()

    return complete_rows

def engineer_features(df):
    """
    Create engineered features for air quality prediction.

    The input frame is copied first, so the caller's DataFrame is never
    modified; the enriched copy is returned.

    Required input columns: 'year', 'month', 'day', 'cbwd', 'TEMP', 'HUMI',
    'precipitation', 'PRES', 'season' and 'PM'.

    Added columns:
      - 'date', 'day_of_week', 'is_weekend': calendar features built from
        year/month/day (is_weekend is 1 for Saturday/Sunday, else 0).
      - 'wind_dir_numeric': integer code for 'cbwd' (NW=1, NE=2, SW=3, SE=4);
        any other value is encoded as 0 instead of NaN.
      - 'temp_humidity_index': TEMP * HUMI / 100.
      - 'precip_pressure_index': precipitation / (PRES / 1000).
      - 'temp_3hr_rolling_avg', 'pm25_24hr_rolling_avg': per-season rolling
        means over the last 3 / 24 rows (current row included).
    """
    # Copy so the new columns are not written into the caller's frame.
    df = df.copy()

    # 1. Temporal Features
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # 2. Wind-related Features
    # Create a numeric representation of wind direction
    wind_dir_map = {
        'NW': 1, 'NE': 2, 'SW': 3, 'SE': 4
    }
    # Directions missing from the map (for example a calm/variable code)
    # would otherwise turn into NaN *after* missing rows were already dropped
    # and then break estimators that reject NaN; give them a dedicated code.
    unknown_wind_dir = 0
    df['wind_dir_numeric'] = (
        df['cbwd'].map(wind_dir_map).fillna(unknown_wind_dir).astype('int64')
    )

    # 3. Interaction Features
    # Temperature and Humidity Interaction
    df['temp_humidity_index'] = df['TEMP'] * df['HUMI'] / 100

    # Precipitation and Pressure Interaction
    df['precip_pressure_index'] = df['precipitation'] / (df['PRES'] / 1000)

    # 4. Rolling Window Features
    # Short-term and medium-term trends. Windows count rows inside each
    # season group in the frame's current row order; the "3hr"/"24hr" names
    # assume hourly, chronologically ordered rows - TODO confirm.
    # groupby().rolling() returns a (season, original index) MultiIndex;
    # dropping the season level lets the assignment align on the row index.
    df['temp_3hr_rolling_avg'] = (
        df.groupby('season')['TEMP']
        .rolling(window=3, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)
    )
    # NOTE(review): this window ends at the current row, so the average
    # contains the row's own 'PM' value. prepare_data uses 'PM' as the target
    # and this column as a feature, which leaks the target into the inputs -
    # consider lagging the series before averaging.
    df['pm25_24hr_rolling_avg'] = (
        df.groupby('season')['PM']
        .rolling(window=24, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)
    )

    return df

def prepare_data(df):
    """
    Build the model inputs: run feature engineering on df, then return the
    selected feature columns as X and the 'PM' column as y.
    """
    enriched = engineer_features(df)

    # Raw measurements followed by the engineered columns.
    numeric_features = [
        'TEMP', 'DEWP', 'HUMI', 'PRES',
        'Iws', 'precipitation', 'Iprec',
        'wind_dir_numeric',
        'temp_humidity_index',
        'precip_pressure_index',
        'temp_3hr_rolling_avg',
        'pm25_24hr_rolling_avg',
    ]
    # NOTE(review): these lists only pick columns; create_preprocessing_pipeline
    # decides numeric vs. categorical from each column's dtype, so a
    # numerically coded 'season' would be scaled rather than one-hot
    # encoded - confirm against the data.
    categorical_features = ['season', 'cbwd']

    selected_columns = [*numeric_features, *categorical_features]

    return enriched[selected_columns], enriched['PM']

def create_preprocessing_pipeline(X):
    """
    Create a preprocessing pipeline with appropriate transformers.

    X is a DataFrame whose dtypes decide how each column is handled:
    columns pandas selects as 'number' are standardised with StandardScaler,
    and every remaining column is one-hot encoded (categories unseen during
    fit are ignored at transform time).

    Returns the unfitted ColumnTransformer.
    """
    # Split the columns into two complementary groups. Matching only exact
    # dtypes ('int64'/'float64' and 'object') would leave out int32, float32,
    # bool, category or string-dtype columns, and ColumnTransformer drops
    # unlisted columns by default (remainder='drop').
    numeric_features = X.select_dtypes(include='number').columns.tolist()
    categorical_features = X.select_dtypes(exclude='number').columns.tolist()

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    return preprocessor

def train_models(X, y):
    """
    Grid-search three regressors and score each on a held-out split.

    The data is split 80/20 (random_state=42). For every candidate model a
    pipeline of the shared preprocessor plus the regressor is tuned with
    5-fold GridSearchCV on negative mean absolute error, then evaluated on
    the test portion.

    Returns a dict keyed by model name; each value holds 'best_params',
    'mae', 'rmse' and 'r2'.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Column roles are derived from the training portion only.
    preprocessor = create_preprocessing_pipeline(X_train)

    # (display name, estimator, hyperparameter grid); the grid keys carry the
    # 'regressor__' prefix because they address the pipeline step of that name.
    candidates = [
        (
            'Random Forest',
            RandomForestRegressor(random_state=42),
            {
                'regressor__n_estimators': [50, 100, 200],
                'regressor__max_depth': [None, 10, 20],
                'regressor__min_samples_split': [2, 5, 10],
            },
        ),
        (
            'Gradient Boosting',
            GradientBoostingRegressor(random_state=42),
            {
                'regressor__n_estimators': [50, 100, 200],
                'regressor__learning_rate': [0.01, 0.1, 0.2],
                'regressor__max_depth': [3, 4, 5],
            },
        ),
        (
            'Support Vector Regression',
            SVR(),
            {
                'regressor__kernel': ['rbf', 'linear'],
                'regressor__C': [0.1, 1, 10],
                'regressor__epsilon': [0.1, 0.2, 0.3],
            },
        ),
    ]

    results = {}
    for name, regressor, grid in candidates:
        search = GridSearchCV(
            estimator=Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', regressor),
            ]),
            param_grid=dict(grid),
            cv=5,
            scoring='neg_mean_absolute_error',
        )
        search.fit(X_train, y_train)

        # predict() on the fitted search delegates to the best refit pipeline.
        predictions = search.predict(X_test)

        squared_error = mean_squared_error(y_test, predictions)
        results[name] = {
            'best_params': search.best_params_,
            'mae': mean_absolute_error(y_test, predictions),
            'rmse': np.sqrt(squared_error),
            'r2': r2_score(y_test, predictions),
        }

    return results

def visualize_results(df):
    """
    Show two exploratory plots for df: a correlation heatmap of its
    float64/int64 columns, and a box plot of 'PM' grouped by 'season'.
    """
    # Figure 1: pairwise correlations between the numeric columns.
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
    numeric_only = df.select_dtypes(include=['float64', 'int64'])
    sns.heatmap(
        numeric_only.corr(),
        annot=True,
        cmap='coolwarm',
        center=0,
        ax=heatmap_ax,
    )
    heatmap_ax.set_title('Feature Correlation Heatmap')
    heatmap_fig.tight_layout()
    plt.show()

    # Figure 2: spread of PM readings within each season.
    _, box_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='season', y='PM', data=df, ax=box_ax)
    box_ax.set_title('PM2.5 Distribution by Season')
    plt.show()

def main(file_path):
    """
    Run the whole workflow for the CSV at file_path: load, plot, build
    features, train the models, print every model's results, and return the
    results dict produced by train_models.
    """
    data = load_and_preprocess_data(file_path)

    # Exploratory plots are drawn from the cleaned data before feature
    # engineering runs.
    visualize_results(data)

    features, target = prepare_data(data)
    model_results = train_models(features, target)

    # One header line per model, then one "name: value" line per entry.
    for model_name, summary in model_results.items():
        print(f"\n{model_name} Results:")
        for metric, value in summary.items():
            print(f"{metric}: {value}")

    return model_results