In [11]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from scipy.sparse import issparse
import time
from itertools import product

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and sort data
start_time = time.time()
df = pd.read_excel('ML Training Test Dataset.xlsx')  # Replace with your CSV file path
df = df.sort_values(by=['Match ID 18Char', 'Completion Date'])

# Identify features and target
target_column = "Match Length"
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = df.select_dtypes(include=["number"]).columns.tolist()
if target_column in numerical_cols:
    numerical_cols.remove(target_column)
feature_cols = categorical_cols + numerical_cols

df.describe()
print("Categorical columns:", len(categorical_cols))



Using device: cuda
Categorical columns:
Numerical columns:
Feature columns:


['Match ID 18Char',
 'Program Type',
 'Program',
 'Big Enrollment: Record Type',
 'Little ID',
 'Little Gender',
 'Little Participant: Race/Ethnicity',
 'Little County',
 'Little State',
 'Big ID',
 'Big Gender',
 'Big Race/Ethnicity',
 'Big Occupation',
 'Big Level of Education',
 'Big County',
 'Big State',
 'Big Contact: Marital Status',
 'Big Contact: Former Big/Little',
 'Completion Date',
 'Big Assessment Uploaded',
 'Big Days Interview to Acceptance',
 'Big Days Interview to Match',
 'Big Days Acceptance to Match',
 'Match Activation To Update Days',
 'Match Activation Date',
 'Little Age',
 'Little Mean Household Income',
 'Litte Median Household Income',
 'Big Age',
 'Big Mean Household Income',
 'Big Median Household Income',
 'green_flag_count',
 'red_flag_count',
 'Match closure Discussed',
 'Changing Match Type',
 'COVID impact',
 'Child/Family: Feels incompatible with volunteer',
 'Child/Family: Moved',
 'Child/Family: Lost contact with agency',
 'Child/Family: Lost conta

In [17]:

# Preprocessing
start_time = time.time()
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ], n_jobs=-1
)



X = df[feature_cols]
X_preprocessed = preprocessor.fit_transform(X)
if issparse(X_preprocessed):
    X_preprocessed = X_preprocessed.toarray()


cat_encoder = preprocessor.named_transformers_['cat']['onehot']
cat_feature_names = cat_encoder.get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(cat_feature_names)
df_transformed = pd.DataFrame(X_preprocessed, columns=all_feature_names, index=df.index)
df_transformed['Match ID 18Char'] = df['Match ID 18Char']
df_transformed['Match Length'] = df['Match Length']
print(f"Preprocessing time: {time.time() - start_time:.2f} seconds")

# Create sequences for LSTM
start_time = time.time()
grouped = df_transformed.groupby('Match ID 18Char')
sequences = []
targets = []
sequence_lengths = []
for match_id, group in grouped:
    sequence = group[all_feature_names].values
    sequences.append(sequence)
    targets.append(group['Match Length'].iloc[0])
    sequence_lengths.append(len(sequence))
max_len = max(sequence_lengths)
X_padded = np.array([np.pad(seq, ((0, max_len - len(seq)), (0, 0)), 'constant', constant_values=0) for seq in sequences])
y = np.array(targets, dtype='float32')
print(f"Sequence creation time: {time.time() - start_time:.2f} seconds")

# Split data
X_train_val, X_test, y_train_val, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device)




Preprocessing time: 30.07 seconds
Sequence creation time: 11.48 seconds


In [20]:
print(len(X_padded))


3264


In [21]:

# Hyperparameter grid
param_grid = {
    'hidden_size': [32, 64, 128, 256, 512],           
    'num_layers': [1, 2, 3],                        
    'dropout': [0.1, 0.2, 0.3, 0.5],                
    'learning_rate': [0.01, 0.001, 0.0005, 0.0001],
    'batch_size': [32, 64, 128, 256, 512]
}

param_combinations = list(product(*param_grid.values()))

# Define LSTM model
class LSTMRegressor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMRegressor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x, lengths):
        x_packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(x_packed)
        out = self.fc(h_n[-1])
        return out


# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=100, patience=10):
    best_val_loss = float('inf')
    patience_counter = 0
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch, lengths in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch, lengths)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_batch.size(0)
        train_loss /= len(train_loader.dataset)
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch, lengths in val_loader:
                outputs = model(X_batch, lengths)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(val_loader.dataset)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    return best_val_loss

# Data loader helper
def get_dataloaders(X_train, y_train, X_val, y_val, batch_size):
    train_dataset = TensorDataset(X_train, y_train, torch.tensor(sequence_lengths[:len(X_train)], dtype=torch.int64))
    val_dataset = TensorDataset(X_val, y_val, torch.tensor(sequence_lengths[len(X_train):len(X_train)+len(X_val)], dtype=torch.int64))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader



In [22]:
# Hyperparameter tuning
best_rmse = float('inf')
best_model_state = None
best_params = None
for params in param_combinations:
    hidden_size, num_layers, dropout, learning_rate, batch_size = params
    print(f"\nTesting: hidden_size={hidden_size}, num_layers={num_layers}, dropout={dropout}, lr={learning_rate}, batch_size={batch_size}")
    start_time = time.time()
    
    model = LSTMRegressor(input_size=X_train.shape[2], hidden_size=hidden_size, num_layers=num_layers, dropout=dropout).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    
    train_loader, val_loader = get_dataloaders(X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor, batch_size)
    val_loss = train_model(model, train_loader, val_loader, criterion, optimizer)
    val_rmse = np.sqrt(val_loss)
    
    print(f"Validation RMSE: {val_rmse:.4f}, Time: {time.time() - start_time:.2f} seconds")
    
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_model_state = model.state_dict()
        best_params = params

# Load and test the best model
best_hidden_size, best_num_layers, best_dropout, _, _ = best_params
best_model = LSTMRegressor(input_size=X_train.shape[2], hidden_size=best_hidden_size, num_layers=best_num_layers, dropout=best_dropout).to(device)
best_model.load_state_dict(best_model_state)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor, torch.tensor(sequence_lengths[len(X_train_val):], dtype=torch.int64))
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
best_model.eval()
y_pred_test = []
with torch.no_grad():
    for X_batch, _, lengths in test_loader:
        outputs = best_model(X_batch, lengths)
        y_pred_test.extend(outputs.cpu().numpy())
y_pred_test = np.array(y_pred_test).flatten()
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f"\nBest Parameters: hidden_size={best_hidden_size}, num_layers={best_num_layers}, dropout={best_dropout}, lr={best_params[3]}, batch_size={best_params[4]}")
print(f"Test RMSE: {test_rmse:.4f}")


Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.01, batch_size=32




Validation RMSE: 7.0212, Time: 7.23 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.01, batch_size=64
Validation RMSE: 7.1636, Time: 3.86 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.01, batch_size=128
Validation RMSE: 7.8211, Time: 4.04 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.01, batch_size=256
Validation RMSE: 8.4968, Time: 3.72 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.01, batch_size=512
Validation RMSE: 8.9889, Time: 4.17 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.001, batch_size=32
Validation RMSE: 7.2436, Time: 18.39 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.001, batch_size=64
Validation RMSE: 8.5129, Time: 10.10 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.001, batch_size=128
Validation RMSE: 11.9520, Time: 6.00 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.1, lr=0.001, batch_size=256
Validation RMSE: 16.2596, Ti



Validation RMSE: 6.9145, Time: 6.15 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.01, batch_size=64
Validation RMSE: 7.2953, Time: 4.80 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.01, batch_size=128
Validation RMSE: 8.1338, Time: 2.95 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.01, batch_size=256
Validation RMSE: 7.6542, Time: 4.90 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.01, batch_size=512
Validation RMSE: 8.9759, Time: 4.27 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.001, batch_size=32
Validation RMSE: 7.3058, Time: 16.55 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.001, batch_size=64
Validation RMSE: 8.3009, Time: 9.20 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.001, batch_size=128
Validation RMSE: 12.0472, Time: 6.08 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.2, lr=0.001, batch_size=256
Validation RMSE: 16.3163, Tim



Validation RMSE: 6.4922, Time: 7.95 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.01, batch_size=64
Validation RMSE: 7.3551, Time: 4.39 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.01, batch_size=128
Validation RMSE: 7.7416, Time: 5.03 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.01, batch_size=256
Validation RMSE: 8.3759, Time: 4.12 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.01, batch_size=512
Validation RMSE: 9.0579, Time: 4.48 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.001, batch_size=32
Validation RMSE: 7.4017, Time: 20.36 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.001, batch_size=64
Validation RMSE: 8.5018, Time: 9.89 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.001, batch_size=128
Validation RMSE: 11.9160, Time: 6.17 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.3, lr=0.001, batch_size=256
Validation RMSE: 16.1629, Tim



Validation RMSE: 6.7683, Time: 6.45 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.01, batch_size=64
Validation RMSE: 7.0812, Time: 6.27 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.01, batch_size=128
Validation RMSE: 7.8387, Time: 3.79 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.01, batch_size=256
Validation RMSE: 8.4497, Time: 3.44 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.01, batch_size=512
Validation RMSE: 8.5627, Time: 4.53 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.001, batch_size=32
Validation RMSE: 7.1152, Time: 18.11 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.001, batch_size=64
Validation RMSE: 8.4150, Time: 9.70 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.001, batch_size=128
Validation RMSE: 11.8466, Time: 6.06 seconds

Testing: hidden_size=32, num_layers=1, dropout=0.5, lr=0.001, batch_size=256
Validation RMSE: 16.2186, Tim