In [28]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy.stats import loguniform
import joblib

In [29]:
# Load reduced dataset
train_data = pd.read_csv('../data/reduced_dataset/reduced_train_data.csv')
test_data = pd.read_csv('../data/reduced_dataset/reduced_test_data.csv')

# Split to X_train, X_test, y_train, y_test
X_train = train_data[train_data.columns[:-1]]
X_test = test_data[test_data.columns[:-1]]

y_train = train_data['goal']
y_test = test_data['goal']

# Prepare the train and test data
X_train_scaled = X_train.to_numpy()
X_test_scaled = X_test.to_numpy()

### Random Forest model

In [30]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

In [31]:
# GridSearchCV
# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [None, 10, 20],          # Maximum depth of each tree
    'min_samples_split': [2, 5]           # Minimum number of samples required to split a node
}

# Set up GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=rf_model,                # The model to optimize
    param_grid=param_grid,       # The grid of parameters to try
    cv=5,                        # Use 5-fold cross-validation
    scoring='accuracy',          # Evaluate model performance using accuracy
    n_jobs=-1                    # Use all available CPU cores for parallel processing
)

# Run the grid search
grid_search_rf.fit(X_train_scaled, y_train)

# Print the best parameters and best score found
print("Best Parameters (GridSearchCV):", grid_search_rf.best_params_)
print("Best Score:", grid_search_rf.best_score_)

Best Parameters (GridSearchCV): {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.8056737588652483


In [32]:
# RandomizedSearchCV
# Define distributions instead of fixed lists for random sampling
param_dist = {
    'n_estimators': randint(100, 500),    # Randomly select from 100 to 499 trees
    'max_depth': [None, 10, 20, 30],      # Choose from these fixed depth values
    'min_samples_split': randint(2, 10)   # Randomly select split threshold between 2 and 9
}

# Set up RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    estimator=rf_model,                    # The model to optimize
    param_distributions=param_dist, # Parameter distributions to sample from
    n_iter=20,                       # Try 20 random combinations
    cv=5,                            # 5-fold cross-validation
    scoring='accuracy',             # Use accuracy as the scoring metric
    n_jobs=-1,                       # Use all CPU cores
    random_state=42                 # Seed for reproducibility
)

# Run the random search
random_search_rf.fit(X_train, y_train)

# Print the best parameters and best score found
print("Best Parameters (RandomizedSearchCV):", random_search_rf.best_params_)
print("Best Score:", random_search_rf.best_score_)

Best Parameters (RandomizedSearchCV): {'max_depth': None, 'min_samples_split': 6, 'n_estimators': 202}
Best Score: 0.8225177304964539


### Logistic Regression model

In [33]:
# Initialize the Logistic Regression model
lr_model = LogisticRegression()

In [34]:
# GridSearchCV
# Define grid of hyperparameters to search
param_grid = {
    'C': [0.01, 0.1, 1, 10],                 # Inverse of regularization strength (smaller values = stronger regularization)
    'penalty': ['l2'],                       # Type of regularization (L2 = Ridge)
    'solver': ['lbfgs', 'liblinear']         # Optimization algorithm (lbfgs for multi-class, liblinear for binary/small data)
}

# Set up GridSearchCV
grid_search_lr = GridSearchCV(
    estimator=lr_model,          # The model to optimize
    param_grid=param_grid,       # The grid of parameters to try
    cv=5,                        # Use 5-fold cross-validation
    scoring='accuracy',          # Evaluate model performance using accuracy
    n_jobs=-1                    # Use all available CPU cores for parallel processing
)

# Run the grid search
grid_search_lr.fit(X_train_scaled, y_train)

# Print the best parameters and best score found
print("Best Parameters (GridSearchCV):", grid_search_lr.best_params_)
print("Best Score:", grid_search_lr.best_score_)

Best Parameters (GridSearchCV): {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8398049645390072


In [35]:
# RandomizedSearchCV
# Define hyperparameter distributions
param_dist = {
    'C': loguniform(1e-3, 1e3),               # Sample C from a log-uniform distribution between 0.001 and 1000
    'penalty': ['l2'],                        # Only L2 supported by lbfgs
    'solver': ['lbfgs']                       # lbfgs supports multi-class problems and L2 penalty
}

# Set up RandomizedSearchCV
random_search_lr = RandomizedSearchCV(
    estimator=lr_model,                     # Model to optimize
    param_distributions=param_dist,  # Distributions for random sampling
    n_iter=20,                        # Number of random combinations to try
    cv=5,                             # 5-fold cross-validation
    scoring='accuracy',              # Use accuracy for evaluation
    n_jobs=-1,                        # Use all available cores
    random_state=42                  # Set random seed for reproducibility
)

# Run the random search
random_search_lr.fit(X_train, y_train)

# Print best parameters and score
print("Best Parameters (RandomizedSearchCV):", random_search_lr.best_params_)
print("Best Score:", random_search_lr.best_score_)

Best Parameters (RandomizedSearchCV): {'C': 0.066904211664988, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8398049645390072


### Decision Tree

In [36]:
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)

In [37]:
# GridSearchCV
# Define the grid of hyperparameters
param_grid = {
    'max_depth': [None, 5, 10, 20],             # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],            # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 4],              # Minimum number of samples required at a leaf node
    'criterion': ['gini', 'entropy']            # Splitting strategy: 'gini' or 'entropy'
}

# Set up GridSearchCV
grid_search_dt = GridSearchCV(
    estimator=dt_model,              # Model to optimize
    param_grid=param_grid,     # Grid of hyperparameters to search
    cv=5,                      # 5-fold cross-validation
    scoring='accuracy',        # Use accuracy for scoring
    n_jobs=-1                  # Use all CPU cores
)

# Fit the model
grid_search_dt.fit(X_train, y_train)

# Results
print("Best Parameters (GridSearchCV):", grid_search_dt.best_params_)
print("Best Cross-Validated Accuracy:", grid_search_dt.best_score_)

Best Parameters (GridSearchCV): {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Cross-Validated Accuracy: 0.7678191489361702


In [38]:
# RandomizedSearchCV 
# Define hyperparameter distributions for random search
param_dist = {
    'max_depth': [None, 5, 10, 20, 30],               # Varying tree depth options
    'min_samples_split': randint(2, 11),              # Random integers from 2 to 10
    'min_samples_leaf': randint(1, 5),                # Random integers from 1 to 4
    'criterion': ['gini', 'entropy']                  # Splitting criteria
}

# Set up RandomizedSearchCV
random_search_dt = RandomizedSearchCV(
    estimator=dt_model,                     # Model to optimize
    param_distributions=param_dist,  # Distributions to sample from
    n_iter=20,                        # Number of random combinations to try
    cv=5,                             # 5-fold cross-validation
    scoring='accuracy',              # Accuracy scoring
    n_jobs=-1,                        # Use all CPU cores
    random_state=42                  # For reproducibility
)

# Fit the model
random_search_dt.fit(X_train, y_train)

# Results
print("Best Parameters (RandomizedSearchCV):", random_search_dt.best_params_)
print("Best Cross-Validated Accuracy:", random_search_dt.best_score_)

Best Parameters (RandomizedSearchCV): {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 6}
Best Cross-Validated Accuracy: 0.7507978723404255


### Support Vector Machine (SVM)

In [39]:
svm_model = SVC(kernel='rbf', random_state=42)

In [40]:
# GridSearchCV 
# Define the grid of hyperparameters
param_grid = {
    'C': [0.1, 1, 10],                      # Regularization parameter (higher = less regularization)
    'kernel': ['linear', 'rbf'],            # Type of kernel: linear or RBF (radial basis function)
    'gamma': ['scale', 'auto']              # Kernel coefficient (only used with 'rbf')
}

# Set up GridSearchCV
grid_search_svm = GridSearchCV(
    estimator=svm_model,              # Model to optimize
    param_grid=param_grid,      # Grid of hyperparameters
    cv=5,                       # 5-fold cross-validation
    scoring='accuracy',         # Accuracy as performance metric
    n_jobs=-1                   # Use all CPU cores
)

# Fit the model
grid_search_svm.fit(X_train_scaled, y_train)

# Output best parameters and score
print("Best Parameters (GridSearchCV):", grid_search_svm.best_params_)
print("Best Cross-Validated Accuracy:", grid_search_svm.best_score_)

Best Parameters (GridSearchCV): {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-Validated Accuracy: 0.8311170212765958


In [41]:
# RandomizedSearchCV
# Define distributions for random search
param_dist = {
    'C': loguniform(1e-2, 1e2),              # Log-uniform sampling for C between 0.01 and 100
    'kernel': ['linear', 'rbf', 'poly'],     # Try multiple kernel types
    'gamma': ['scale', 'auto']              # Kernel coefficient
}

# Set up RandomizedSearchCV
random_search_svm = RandomizedSearchCV(
    estimator=svm_model,                     # Model to optimize
    param_distributions=param_dist,   # Parameter distributions
    n_iter=20,                         # Number of random combinations to try
    cv=5,                              # 5-fold cross-validation
    scoring='accuracy',               # Accuracy metric
    n_jobs=-1,                         # Use all CPU cores
    random_state=42                   # For reproducibility
)

# Fit the model
random_search_svm.fit(X_train_scaled, y_train)

# Output results
print("Best Parameters (RandomizedSearchCV):", random_search_svm.best_params_)
print("Best Cross-Validated Accuracy:", random_search_svm.best_score_)

Best Parameters (RandomizedSearchCV): {'C': 0.314891164795686, 'gamma': 'scale', 'kernel': 'poly'}
Best Cross-Validated Accuracy: 0.8311170212765957


## Comparison

In [42]:
def binary_classification_metrics(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    confusion_matrix = np.array([[TN, FP],
                   [FN, TP]])
    
    return [confusion_matrix, accuracy, precision, recall, f1_score]


In [43]:
# Evaluate Logistic Regression on test set
# GridSearchCV
gs_best_model_lr = grid_search_lr.best_estimator_
y_pred = gs_best_model_lr.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("Logistic Regression Accuracy (GridSearchCV):", accuracy)

# RandomizedSearchCV 
rs_best_model_lr = random_search_lr.best_estimator_
y_pred = rs_best_model_lr.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("Logistic Regression Accuracy (RandomizedSearchCV):", accuracy)

Logistic Regression Accuracy (GridSearchCV): 0.9
Logistic Regression Accuracy (RandomizedSearchCV): 0.9




In [44]:
# Evaluate Decision Tree on test set
# GridSearchCV
gs_best_model_dt = grid_search_dt.best_estimator_
y_pred = gs_best_model_dt.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("Decision Tree Accuracy (GridSearchCV):", accuracy)

# RandomizedSearchCV 
rs_best_model_dt = random_search_dt.best_estimator_
y_pred = rs_best_model_dt.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("Decision Tree Accuracy (RandomizedSearchCV):", accuracy)

Decision Tree Accuracy (GridSearchCV): 0.85
Decision Tree Accuracy (RandomizedSearchCV): 0.7833333333333333




In [45]:
# Evaluate Random Forest on test set
# GridSearchCV
gs_best_model_rf = grid_search_rf.best_estimator_
y_pred = gs_best_model_rf.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("Random Forest Accuracy (GridSearchCV):", accuracy)

# RandomizedSearchCV 
rs_best_model_rf = random_search_rf.best_estimator_
y_pred = rs_best_model_rf.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("Random Forest Accuracy (RandomizedSearchCV):", accuracy)

Random Forest Accuracy (GridSearchCV): 0.8666666666666667
Random Forest Accuracy (RandomizedSearchCV): 0.8833333333333333




In [46]:
# Evaluate SVM on test set
# GridSearchCV
gs_best_model_svm = grid_search_svm.best_estimator_
y_pred = gs_best_model_svm.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("SVM Accuracy (GridSearchCV):", accuracy)

# RandomizedSearchCV 
rs_best_model_svm = random_search_svm.best_estimator_
y_pred = rs_best_model_svm.predict(X_test_scaled)
_, accuracy, _, _, _ = binary_classification_metrics(y_test, y_pred)
print("SVM Accuracy (RandomizedSearchCV):", accuracy)

SVM Accuracy (GridSearchCV): 0.85
SVM Accuracy (RandomizedSearchCV): 0.8833333333333333


### Table of comparison
| Model       | Base model accuracy | Grid Search | RandomizedSearch |
|-------------|---------------------|----------------------------|----------------------------------|
| Logistic Regression    | 0.88                | 0.9                         | 0.9                             |
| Random Forest     | 0.83                | 0.87                       | 0.88                             |
| SVM     | 0.9                | 0.85                       | 0.88                             |
| Decision Tree     | 0.85                | 0.85                       | 0.78                             |


### Save best models with the heighest accuracy 

In [47]:
joblib.dump(rs_best_model_rf, '../models/best_models/best_random_forest_model.pkl')
joblib.dump(gs_best_model_lr, '../models/best_models/best_logistic_regression_model.pkl')

['../models/best_models/best_logistic_regression_model.pkl']