In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load the run configuration. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform's locale default
# (which is not UTF-8 on every system).
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Make sure the output directory exists before anything is written to it
os.makedirs(config["predictions_dir"], exist_ok=True)

# Load the dataset; the 'Target' column is the regression label and every
# other column is kept as a feature.
data = pd.read_csv(config["dataset_path"])
X = data.drop('Target', axis=1)
y = data['Target']

# Hold out a test set. The held-out fraction and the random seed both come
# from the config file rather than being fixed here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=config["test_size"],
    random_state=config["random_state"]
)

# Candidate baseline regressors, keyed by the label that is later used for
# column names and output file names. The seeded estimators share one seed.
seed = config["random_state"]
models = dict(
    LR_Baseline=LinearRegression(),
    RF_Baseline=RandomForestRegressor(random_state=seed),
    XGB_Baseline=XGBRegressor(random_state=seed),
)

# Side-by-side table of test-set predictions. It starts with the true target
# values, re-indexed from 0; one column per model is added during evaluation.
combined_preds = y_test.reset_index(drop=True).to_frame(name='Actual')

# Per-model metric dictionaries, plus bookkeeping for the best model so far
# (ranked by mean cross-validated R², so the starting score is -infinity).
metrics = dict()
best_model_name, best_score = None, float('-inf')

# Shuffled K-fold splitter; the number of folds comes from the config
kf = KFold(
    n_splits=config["cv_folds"],
    shuffle=True,
    random_state=seed,
)

# Score every candidate with K-fold cross-validation on the training split,
# then refit it on the whole training split and evaluate it on the test split.
for model_name, model in models.items():
    # Per-fold validation scores, grouped by the metric label they are
    # reported under
    fold_scores = {'CV_MAE': [], 'CV_MSE': [], 'CV_R2': []}

    for fit_rows, held_rows in kf.split(X_train):
        # kf.split yields positional row indices, hence .iloc
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_held, y_held = X_train.iloc[held_rows], y_train.iloc[held_rows]

        # Fit on this fold's training rows, predict its held-out rows
        model.fit(X_fit, y_fit)
        held_pred = model.predict(X_held)

        fold_scores['CV_MAE'].append(mean_absolute_error(y_held, held_pred))
        fold_scores['CV_MSE'].append(mean_squared_error(y_held, held_pred))
        fold_scores['CV_R2'].append(r2_score(y_held, held_pred))

    # Average each metric across the folds
    cv_metrics = {label: np.mean(values) for label, values in fold_scores.items()}

    # Strictly-greater comparison: on a tie the earlier model is kept
    if cv_metrics['CV_R2'] > best_score:
        best_model_name, best_model, best_score = model_name, model, cv_metrics['CV_R2']

    # Refit on the full training split and predict the test split.
    # best_model (if set to this model) is the same object, so it carries
    # this full-training-split fit too.
    test_preds = model.fit(X_train, y_train).predict(X_test)

    # Record this model's predictions in the shared side-by-side table
    combined_preds[model_name] = test_preds

    # Also write this model's actual-vs-predicted pairs to its own file
    per_model_preds = pd.DataFrame({'Actual': y_test, 'Predicted': test_preds})
    per_model_preds.to_csv(
        f"{config['predictions_dir']}/{model_name}_predictions.csv",
        index=False,
    )

    # Cross-validation averages first, followed by the test-split scores
    metrics[model_name] = {
        **cv_metrics,
        'Test_MAE': mean_absolute_error(y_test, test_preds),
        'Test_MSE': mean_squared_error(y_test, test_preds),
        'Test_R2': r2_score(y_test, test_preds),
    }

# Write the side-by-side predictions table (actual values plus one column
# per model) into the predictions directory.
predictions_dir = config["predictions_dir"]
combined_preds.to_csv(
    os.path.join(predictions_dir, "Baseline_predictions.csv"),
    index=False,
)

# Metrics table: transposed so each model is a row and each metric a column
pd.DataFrame(metrics).T.to_csv(
    os.path.join(predictions_dir, "baseline_model_metrics.csv")
)

# best_model is only bound inside the evaluation loop, when a model's mean
# CV R² beats the running best. If that never happened (for example every
# score was NaN), fail with a clear message instead of a NameError or, in a
# re-run notebook, silently saving a stale model from an earlier execution.
if best_model_name is None:
    raise RuntimeError(
        "No model produced a comparable CV R² score; there is no best model to save."
    )

# Persist the winning estimator.
# NOTE: this file goes to the current working directory, not predictions_dir.
joblib.dump(best_model, f"{best_model_name}.pkl")

print("Execution complete. Outputs saved in:", config["predictions_dir"])
print(f"Best model: {best_model_name} (CV R²: {best_score:.3f})")

Execution complete. Outputs saved in: predictions
Best model: RF_Baseline (CV R²: 0.716)
