In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from scipy.sparse import issparse
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Masking, LSTM, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# Load the data 
df = pd.read_excel("ML Training Test Dataset.xlsx")
df = df.sort_values(by=['Match ID 18Char', 'Completion Date'])

df.head()

Unnamed: 0,Match ID 18Char,Completion Date,Program Type,Program,Big Enrollment: Record Type,Big Assessment Uploaded,Big Days Interview to Acceptance,Big Days Interview to Match,Big Days Acceptance to Match,Match Activation To Update Days,...,Volunteer: Deceased,Volunteer: Lost contact with child/family,Volunteer: Infraction of match rules/agency policies,Volunteer: Unrealistic expectations,Volunteer: Pregnancy,Volunteer: Changed workplace/school partnership,Agency: Challenges with program/partnership,Agency: Concern with Volunteer re: child safety,Event severity total,Match Length
22535,a1v2J0000027CWYQA2,42817,Site,YIP 2016,,,,,,7.0,...,0,0,0,0,0,0,0,0,0,5.5
22536,a1v2J0000027CWYQA2,42866,Site,YIP 2016,,,,,,7.0,...,0,0,0,0,0,0,0,0,0,5.5
22537,a1v2J0000027CWYQA2,42922,Site,YIP 2016,,,,,,7.0,...,0,0,0,0,0,0,0,0,0,5.5
22538,a1v2J0000027CWYQA2,42978,Site,YIP 2016,,,,,,7.0,...,0,0,0,0,0,0,0,0,0,5.5
22539,a1v2J0000027CWfQAM,42774,Site,YIP 2016,,,,,,16.0,...,0,0,0,0,0,0,0,0,0,8.5


In [16]:
# Column Defination

target_column_ = "Match Length"
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = df.select_dtypes(include=["number"]).columns.tolist()

# Remove the target column from numerical_cols (if it's included)
if target_column_ in numerical_cols:
    numerical_cols.remove(target_column_)

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

feature_cols = categorical_cols + numerical_cols



Categorical columns: ['Match ID 18Char', 'Program Type', 'Program', 'Big Enrollment: Record Type', 'Little ID', 'Little Gender', 'Little Participant: Race/Ethnicity', 'Little County', 'Little State', 'Big ID', 'Big Gender', 'Big Race/Ethnicity', 'Big Occupation', 'Big Level of Education', 'Big County', 'Big State', 'Big Contact: Marital Status', 'Big Contact: Former Big/Little']
Numerical columns: ['Completion Date', 'Big Assessment Uploaded', 'Big Days Interview to Acceptance', 'Big Days Interview to Match', 'Big Days Acceptance to Match', 'Match Activation To Update Days', 'Match Activation Date', 'Little Age', 'Little Mean Household Income', 'Litte Median Household Income', 'Big Age', 'Big Mean Household Income', 'Big Median Household Income', 'green_flag_count', 'red_flag_count', 'Match closure Discussed', 'Changing Match Type', 'COVID impact', 'Child/Family: Feels incompatible with volunteer', 'Child/Family: Moved', 'Child/Family: Lost contact with agency', 'Child/Family: Lost con

In [17]:
# Define Pipeline
# Define transformers

numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


X = df[feature_cols]
X_preprocessed = preprocessor.fit_transform(X)


if issparse(X_preprocessed):
    X_preprocessed = X_preprocessed.toarray()


cat_encoder = preprocessor.named_transformers_['cat']['onehot']
cat_feature_names = cat_encoder.get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(cat_feature_names)


df_transformed = pd.DataFrame(X_preprocessed, columns=all_feature_names, index=df.index)


df_transformed['Match ID 18Char'] = df['Match ID 18Char']
df_transformed['Completion Date'] = df['Completion Date']
df_transformed['Match Length'] = df['Match Length']

In [18]:
# Group the data by "Match ID 18Char," create sequences of checkups, and pad them to a uniform length.

grouped = df_transformed.groupby('Match ID 18Char')
sequences = []
targets = []

for match_id, group in grouped:

    sequence = group[all_feature_names].values  # Shape: (seq_len, num_features)
    sequences.append(sequence)

    target = group['Match Length'].iloc[0]
    targets.append(target)


max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len, dtype='float32', padding='post', value=0.0)
y = np.array(targets)

# X shape: (num_matches, max_len, num_features)
# y shape: (num_matches,)

In [19]:
# Split into train+validation and test

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)


In [20]:
#Define the model

def build_model(units=64, layers=1, dropout=0.2, learning_rate=0.001):
    model = Sequential()
    model.add(Masking(mask_value=0., input_shape=(max_len, X.shape[2])))
    for i in range(layers - 1):
        model.add(LSTM(units, return_sequences=True))
        model.add(Dropout(dropout))
    model.add(LSTM(units))
    model.add(Dropout(dropout))
    model.add(Dense(1))  # Single output for regression
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

In [21]:
# Hyperparameter grid

param_grid = [
    {'units': 32, 'layers': 1, 'dropout': 0.2, 'learning_rate': 0.001, 'batch_size': 32},
    {'units': 64, 'layers': 1, 'dropout': 0.2, 'learning_rate': 0.001, 'batch_size': 32},
    {'units': 32, 'layers': 2, 'dropout': 0.2, 'learning_rate': 0.001, 'batch_size': 32},
    {'units': 64, 'layers': 2, 'dropout': 0.2, 'learning_rate': 0.001, 'batch_size': 32},
    {'units': 32, 'layers': 1, 'dropout': 0.5, 'learning_rate': 0.001, 'batch_size': 32},
    {'units': 64, 'layers': 1, 'dropout': 0.5, 'learning_rate': 0.001, 'batch_size': 32},
    {'units': 32, 'layers': 2, 'dropout': 0.5, 'learning_rate': 0.01, 'batch_size': 64},
    {'units': 64, 'layers': 2, 'dropout': 0.5, 'learning_rate': 0.01, 'batch_size': 64}
]

# Train and evaluate models
best_rmse = float('inf')
best_model = None
best_params = None

for params in param_grid:
    model = build_model(units=params['units'], layers=params['layers'], 
                        dropout=params['dropout'], learning_rate=params['learning_rate'])
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                        epochs=100, batch_size=params['batch_size'], 
                        callbacks=[early_stopping], verbose=0)
    
    y_pred_val = model.predict(X_val, verbose=0)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    
    print(f"Params: {params}, Validation RMSE: {val_rmse:.4f}")
    
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_model = model
        best_params = params

Params: {'units': 32, 'layers': 1, 'dropout': 0.2, 'learning_rate': 0.001, 'batch_size': 32}, Validation RMSE: 19.5958
Params: {'units': 64, 'layers': 1, 'dropout': 0.2, 'learning_rate': 0.001, 'batch_size': 32}, Validation RMSE: 19.5960


KeyboardInterrupt: 

In [None]:
# Predict on test set with the best model

y_pred_test = best_model.predict(X_test, verbose=0)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"\nBest Parameters: {best_params}")
print(f"Test RMSE: {test_rmse:.4f}")