In [1]:
import os
import random
import pandas as pd
import numpy as np
import joblib
from typing import Tuple, List, Dict, Optional
from datetime import datetime

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
def set_seed(seed: int = 42):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {DEVICE}')

def create_directories(directories: List[str]):
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Directory '{directory}' created.")
        else:
            print(f"Directory '{directory}' already exists.")

MODEL_DIR = 'models'
RESULT_DIR = 'results'

create_directories([MODEL_DIR, RESULT_DIR])

def load_and_preprocess_data(
    data_path: str,
    selected_features: List[str],
    target_column: str
) -> Tuple[np.ndarray, np.ndarray, Pipeline]:

    try:
        data = pd.read_csv(data_path)
        print("Data loaded successfully.")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        raise

    missing_features = set(selected_features + [target_column]) - set(data.columns)
    if missing_features:
        raise KeyError(f"Missing columns in data: {missing_features}")

    features = data[selected_features]
    target = data[target_column].values
    print(f"Selected features: {selected_features}")
    print(f"Target column: {target_column}")

    preprocessing_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    X_processed = preprocessing_pipeline.fit_transform(features)
    print("Preprocessing completed using Pipeline.")

    return X_processed, target, preprocessing_pipeline

DATA_PATH = 'data/fr_DID_mean.csv'

SELECTED_FEATURES = [
    'Shape_GG', 'AvgStress', 'cross', 
    'PathLength', 'pShape_KNN', 'pAvgStress', 'pCrossNo', 
    'pMinAng', 'pContinu', 'pGeode', 
    '|V|', '|E|'
]

TARGET_COLUMN = 'accuracy'

# Load and preprocess data
X, y, preprocessing_pipeline = load_and_preprocess_data(DATA_PATH, SELECTED_FEATURES, TARGET_COLUMN)

Using device: cuda
Directory 'models' already exists.
Directory 'results' already exists.
Data loaded successfully.
Selected features: ['Shape_GG', 'AvgStress', 'cross', 'PathLength', 'pShape_KNN', 'pAvgStress', 'pCrossNo', 'pMinAng', 'pContinu', 'pGeode', '|V|', '|E|']
Target column: accuracy
Preprocessing completed using Pipeline.


In [3]:
class GraphDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
        
    def __len__(self) -> int:
        return len(self.X)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.X[idx], self.y[idx]

class SimpleNN(nn.Module):
    def __init__(self, input_size: int, output_size: int = 1):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, output_size)
        self.relu = nn.ReLU()
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

def evaluate_and_plot(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    model_name: str
):

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print(f"{model_name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    plt.figure(figsize=(8,6))
    plt.scatter(y_true, y_pred, alpha=0.7)
    plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'{model_name}: Actual vs Predicted')
    plt.tight_layout()
    plot_filename = f'{model_name}_actual_vs_predicted_{timestamp}.png'
    plt.savefig(os.path.join(RESULT_DIR, plot_filename))
    plt.close()
    print(f"Saved plot: {plot_filename}")
    
    residuals = y_true - y_pred
    plt.figure(figsize=(8,6))
    plt.scatter(y_pred, residuals, alpha=0.7)
    plt.axhline(0, color='r', linestyle='--')
    plt.xlabel('Predicted')
    plt.ylabel('Residuals')
    plt.title(f'{model_name}: Residuals vs Predicted')
    plt.tight_layout()
    plot_filename = f'{model_name}_residuals_{timestamp}.png'
    plt.savefig(os.path.join(RESULT_DIR, plot_filename))
    plt.close()
    print(f"Saved plot: {plot_filename}")

def train_nn_model(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    epochs: int = 200,
    batch_size: int = 64,
    learning_rate: float = 0.001,
    patience: int = 10
):

    train_dataset = GraphDataset(X_train, y_train)
    val_dataset = GraphDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = SimpleNN(input_size=X_train.shape[1]).to(DEVICE)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                     factor=0.5, patience=5, 
                                                     verbose=True)
    
    best_model_state = None
    best_val_loss = float('inf')
    epochs_no_improve = 0
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        avg_train_loss = running_loss / len(train_loader)
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        
        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        
        scheduler.step(avg_val_loss)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            best_model_state = model.state_dict().copy()
            print("Validation loss improved. Best model updated.")
        else:
            epochs_no_improve += 1
            print(f"No improvement in validation loss for {epochs_no_improve} epoch(s).")
            if epochs_no_improve >= patience:
                print("Early stopping triggered.")
                break

    model.load_state_dict(best_model_state)

    model_filename = f'best_nn_model_{timestamp}.pth'
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, model_filename))
    print(f"Best model saved to {os.path.join(MODEL_DIR, model_filename)}.")

    model.eval()
    test_dataset = GraphDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    y_pred = []
    y_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(DEVICE)
            outputs = model(inputs)
            y_pred.append(outputs.cpu().numpy())
            y_true.append(labels.numpy())
    y_pred = np.concatenate(y_pred, axis=0)
    y_true = np.concatenate(y_true, axis=0)
    evaluate_and_plot(y_true, y_pred, "Neural_Network")

    return model

def train_gradient_boosting(
    X_train: np.ndarray,
    y_train: np.ndarray,
    param_distributions: Optional[Dict] = None,
    n_iter: int = 100
) -> GradientBoostingRegressor:

    if param_distributions:
        print("Starting Randomized Search for Gradient Boosting...")
        gbr = GradientBoostingRegressor(random_state=42)
        randomized_search = RandomizedSearchCV(
            estimator=gbr,
            param_distributions=param_distributions,
            n_iter=n_iter,
            cv=5,
            scoring='neg_mean_squared_error',
            verbose=2,
            random_state=42,
            n_jobs=-1
        )
        randomized_search.fit(X_train, y_train)
        best_model = randomized_search.best_estimator_
        print(f"Best Gradient Boosting Params: {randomized_search.best_params_}")
    else:
        best_model = GradientBoostingRegressor(random_state=42)
        best_model.fit(X_train, y_train)
        print("Gradient Boosting model trained with default parameters.")

    return best_model

def run_experiment(
    X: np.ndarray,
    target: np.ndarray,
    model_type: str = 'nn',
    param_distributions: Optional[Dict] = None,
    n_iter: int = 100
):

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, target, test_size=0.2, random_state=42
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=42  # 0.25 x 0.8 = 0.2
    )
    
    if model_type == 'nn':
        print(f"\nRunning Neural Network with features: {SELECTED_FEATURES}")
        model = train_nn_model(X_train, y_train, X_val, y_val, X_test, y_test)
    elif model_type == 'boosting':
        print(f"\nRunning Gradient Boosting with features: {SELECTED_FEATURES}")
        best_model = train_gradient_boosting(X_train, y_train, param_distributions, n_iter)
        # Evaluation on test set
        y_pred = best_model.predict(X_test)
        evaluate_and_plot(y_test, y_pred, "Gradient_Boosting")
        # Save the trained model
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_path = os.path.join(MODEL_DIR, f'best_gradient_boosting_model_{timestamp}.pkl')
        joblib.dump(best_model, model_path)
        print(f"Gradient Boosting model saved to {model_path}.")
    else:
        print(f"Unsupported model type: {model_type}")

In [4]:
if __name__ == "__main__":
    print("Starting experiments...")

    PARAM_DISTRIBUTIONS = {
        'n_estimators': np.arange(100, 1001, 100),
        'learning_rate': np.linspace(0.01, 0.3, 30),
        'max_depth': np.arange(3, 11, 1),
        'min_samples_split': np.arange(2, 21, 2),
        'min_samples_leaf': np.arange(1, 21, 2),
        'subsample': np.linspace(0.5, 1.0, 6),
        'max_features': ['auto', 'sqrt', 'log2', None]
    }

    print("\nExperiment: Predicting Accuracy - Neural Network")
    run_experiment(X, y, model_type='nn')

    print("\nExperiment: Predicting Accuracy - Gradient Boosting")
    run_experiment(X, y, model_type='boosting', param_distributions=PARAM_DISTRIBUTIONS, n_iter=100)

    print("\nExperiments completed.")

Starting experiments...

Experiment: Predicting Accuracy - Neural Network

Running Neural Network with features: ['Shape_GG', 'AvgStress', 'cross', 'PathLength', 'pShape_KNN', 'pAvgStress', 'pCrossNo', 'pMinAng', 'pContinu', 'pGeode', '|V|', '|E|']
Epoch [1/200], Train Loss: 0.6918, Val Loss: 0.7057
Validation loss improved. Best model updated.
Epoch [2/200], Train Loss: 0.6639, Val Loss: 0.6973
Validation loss improved. Best model updated.
Epoch [3/200], Train Loss: 0.6560, Val Loss: 0.6998
No improvement in validation loss for 1 epoch(s).
Epoch [4/200], Train Loss: 0.6516, Val Loss: 0.7030
No improvement in validation loss for 2 epoch(s).
Epoch [5/200], Train Loss: 0.6479, Val Loss: 0.6893
Validation loss improved. Best model updated.
Epoch [6/200], Train Loss: 0.6417, Val Loss: 0.6934
No improvement in validation loss for 1 epoch(s).
Epoch [7/200], Train Loss: 0.6404, Val Loss: 0.7033
No improvement in validation loss for 2 epoch(s).
Epoch [8/200], Train Loss: 0.6391, Val Loss: 0.69

130 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\69418\anaconda3\envs\dl\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\69418\anaconda3\envs\dl\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\69418\anaconda3\envs\dl\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\69418\anaconda3\envs\dl\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_cons

Best Gradient Boosting Params: {'subsample': 0.8, 'n_estimators': 500, 'min_samples_split': 16, 'min_samples_leaf': 17, 'max_features': None, 'max_depth': 3, 'learning_rate': 0.01}
Gradient_Boosting - MSE: 0.6527, MAE: 0.6646, R²: 0.1061
Saved plot: Gradient_Boosting_actual_vs_predicted_20241023_111655.png
Saved plot: Gradient_Boosting_residuals_20241023_111655.png
Gradient Boosting model saved to models\best_gradient_boosting_model_20241023_111655.pkl.

Experiments completed.
