In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Read the source workbook into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the kernel.
df = pd.read_excel(io="ML Completed Subset.xlsx")


In [17]:
# Fill missing values, then group the rows by match.
#
# NOTE(review): both imputers below are fitted on every row of `df`, which
# happens before the train/test split performed later in the notebook, so rows
# that end up in the test set contribute to the imputed values. Consider
# fitting the imputers on training rows only.

# Columns treated as categorical (also reused later to pick the one-hot
# encoded features).
categorical_columns = [
    'Program Type',
    'Program',
    'Little Gender',
    'Little Participant: Race/Ethnicity',
    'Little County',
    'Little State',
    'Big Gender',
    'Big Race/Ethnicity',
    'Big Occupation',
    'Big Level of Education',
    'Big County',
    'Big State',
    'Big Contact: Marital Status',
    'Big Contact: Former Big/Little',
]

# Columns treated as numeric. 'Match Length' is not listed, so it is not
# imputed here.
numerical_columns = [
    # 'Little ...' / 'Big ...' columns
    'Little Age',
    'Little Mean Household Income',
    # Spelling 'Litte' kept as-is; presumably it matches the workbook header -- verify.
    'Litte Median Household Income',
    'Big Age',
    'Big Mean Household Income',
    'Big Median Household Income',
    # Update-level measurements and counters
    'Match Activation To Update Days',
    'green_flag_count',
    'red_flag_count',
    'Match closure Discussed',
    'Changing Match Type',
    'COVID impact',
    # 'Child/Family: ...' columns
    'Child/Family: Feels incompatible with volunteer',
    'Child/Family: Moved',
    'Child/Family: Lost contact with agency',
    'Child/Family: Lost contact with volunteer/agency',
    'Child/Family: Lost contact with volunteer',
    'Child/Family: Moved out of service',
    'Child/Family: Unrealistic expectations',
    'Child/Family: Time constraints',
    'Child/Family: Infraction of match rules/agency policies',
    'Child/Family: Moved within service area',
    # 'Child: ...' columns
    'Child: Graduated',
    'Child: Transportation Issues',
    'Child: Changed school/site',
    'Child: Lost interest',
    'Child: Family structure changed',
    'Child: Severity of challenges',
    # 'Volunteer: ...' columns
    'Volunteer: Transportation Issues',
    'Volunteer: Moved out of service area',
    'Volunteer: Moved within service area',
    'Volunteer: Lost contact with agency',
    'Volunteer: Lost contact with child/agency',
    'Volunteer: Feels incompatible with child/family',
    'Volunteer: Time constraint',
    'Volunteer: Deceased',
    'Volunteer: Lost contact with child/family',
    'Volunteer: Infraction of match rules/agency policies',
    'Volunteer: Unrealistic expectations',
    'Volunteer: Pregnancy',
    'Volunteer: Changed workplace/school partnership',
    # 'Agency: ...' columns
    'Agency: Challenges with program/partnership',
    'Agency: Concern with Volunteer re: child safety',
    'Event severity total',
]

# Categorical gaps: replace with each column's most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_filled = cat_imputer.fit_transform(df[categorical_columns])
df[categorical_columns] = categorical_filled

# Numeric gaps: k-nearest-neighbours imputation using 5 neighbours.
num_imputer = KNNImputer(n_neighbors=5)
numerical_filled = num_imputer.fit_transform(df[numerical_columns])
df[numerical_columns] = numerical_filled

# One group per match identifier; the following cells aggregate over these groups.
grouped = df.groupby('Match ID 18Char')

In [18]:
# Build one row per match: static attributes plus aggregates over the match's rows.

# Attributes the original author treats as constant within a match ("they're
# consistent per match"); GroupBy.first() keeps the first non-null value of
# each column per group.
static_features = [
    'Little Age',
    'Little Gender',
    'Little Participant: Race/Ethnicity',
    'Little County',
    'Little State',
    'Little Mean Household Income',
    'Litte Median Household Income',
    'Big Age',
    'Big Gender',
    'Big Race/Ethnicity',
    'Big Occupation',
    'Big Level of Education',
    'Big County',
    'Big State',
    'Big Contact: Marital Status',
    'Big Mean Household Income',
    'Big Median Household Income',
    'Big Contact: Former Big/Little',
    'Program Type',
    'Program',
]

# Static columns first, then the per-match aggregates in a fixed order
# (row count, flag/severity totals, mean and max of the update-day column).
agg_df = grouped[static_features].first().assign(
    number_checkups=grouped.size(),
    sum_green_flags=grouped['green_flag_count'].sum(),
    sum_red_flags=grouped['red_flag_count'].sum(),
    sum_event_severity=grouped['Event severity total'].sum(),
    avg_update_days=grouped['Match Activation To Update Days'].mean(),
    max_update_days=grouped['Match Activation To Update Days'].max(),
)

# Per-match totals for a subset of the flag columns. The list is deliberately
# partial; extend it with the remaining flag columns if they should be modelled.
flag_columns = [
    'Match closure Discussed',
    'Changing Match Type',
    'COVID impact',
    'Child/Family: Feels incompatible with volunteer',
    'Child: Lost interest',
]
# Output column name: 'sum_' + source name with ':' and ' ' each turned into '_'.
for col in flag_columns:
    agg_df['sum_' + col.translate(str.maketrans(': ', '__'))] = grouped[col].sum()



In [20]:
# Assemble the modelling table, hold out a test set, and define preprocessing.

# Target: first 'Match Length' value recorded for each match, stored on agg_df.
agg_df['Match Length'] = grouped['Match Length'].first()

# Features are every aggregated column except the target.
# NOTE(review): X keeps number_checkups, avg_update_days and max_update_days.
# If these are only known once a match has ended they may leak the target --
# confirm against the data definition.
y = agg_df['Match Length']
X = agg_df.drop(columns='Match Length')

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Route columns: names listed in categorical_columns are one-hot encoded,
# every other column of X is standardised.
is_categorical = X.columns.isin(categorical_columns)
cat_features = X.columns[is_categorical].tolist()
num_features = X.columns[~is_categorical].tolist()

preprocessor = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are encoded as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('num', StandardScaler(), num_features),
    ]
)


In [21]:
# Grid-search model definitions.
# Each entry is (label, estimator, hyper-parameter grid). Grid keys carry the
# 'regressor__' prefix because the estimator is the pipeline step named 'regressor'.
_model_specs = [
    # No hyper-parameters to tune: the empty grid yields a single candidate.
    ('LinearRegression', LinearRegression(), {}),
    (
        'RandomForestRegressor',
        RandomForestRegressor(random_state=42),
        {
            'regressor__n_estimators': [100, 200],
            'regressor__max_depth': [None, 10, 20],
            'regressor__min_samples_split': [2, 5],
        },
    ),
    (
        'GradientBoostingRegressor',
        GradientBoostingRegressor(random_state=42),
        {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.01, 0.1],
            'regressor__max_depth': [3, 5],
        },
    ),
    (
        'SVR',
        SVR(),
        {
            'regressor__C': [0.1, 1, 10],
            'regressor__epsilon': [0.01, 0.1],
            'regressor__kernel': ['linear', 'rbf'],
        },
    ),
    (
        'MLPRegressor',
        MLPRegressor(max_iter=1000, random_state=42),
        {
            'regressor__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'regressor__activation': ['relu', 'tanh'],
            'regressor__alpha': [0.0001, 0.001],
        },
    ),
]

# Mapping consumed by the grid-search loop: label -> {'model', 'params'}.
models = {
    label: {'model': estimator, 'params': grid}
    for label, estimator, grid in _model_specs
}

In [23]:
# Tune every candidate model with 5-fold cross-validation, then score the
# winner on the held-out test set.

best_models = {}
for name, model_info in models.items():
    # Preprocessing lives inside the pipeline, so the encoder and scaler are
    # re-fitted on the training part of each CV fold.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model_info['model']),
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=model_info['params'],
        scoring='neg_root_mean_squared_error',
        cv=5,
        n_jobs=-1,  # run on all available CPU cores
    )
    grid_search.fit(X_train, y_train)

    # The scorer returns negated RMSE (higher is better); flip the sign back.
    summary = {
        'best_estimator': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_score': -grid_search.best_score_,
    }
    best_models[name] = summary
    print(f"{name} - Best RMSE: {summary['best_score']:.4f}, "
          f"Best Params: {summary['best_params']}")

# Pick the model with the lowest cross-validated RMSE (first one wins on ties).
best_model_name, best_entry = min(
    best_models.items(),
    key=lambda entry: entry[1]['best_score'],
)
best_model = best_entry['best_estimator']
best_rmse = best_entry['best_score']

# Evaluate the refitted best estimator on the held-out test set.
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

print(
    f"\nBest Model: {best_model_name}",
    f"Cross-Validation RMSE: {best_rmse:.4f}",
    f"Test RMSE: {test_rmse:.4f}",
    sep="\n",
)

LinearRegression - Best RMSE: 2.9415, Best Params: {}
RandomForestRegressor - Best RMSE: 2.7296, Best Params: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
GradientBoostingRegressor - Best RMSE: 2.7455, Best Params: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 100}
SVR - Best RMSE: 2.7966, Best Params: {'regressor__C': 10, 'regressor__epsilon': 0.1, 'regressor__kernel': 'linear'}
MLPRegressor - Best RMSE: 3.6566, Best Params: {'regressor__activation': 'relu', 'regressor__alpha': 0.001, 'regressor__hidden_layer_sizes': (100,)}

Best Model: RandomForestRegressor
Cross-Validation RMSE: 2.7296
Test RMSE: 2.8893


