# MLR Modeling
Multinomial Logistic Regression

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from scipy.stats import loguniform

In [None]:
# Load the processed HR dataset; the first CSV column becomes the row index.
processed_csv = '../data/processed/HRDataset_p_v4.csv'
df = pd.read_csv(processed_csv, index_col=0)

# Preview the first rows (rendered as the cell output).
df.head()

In [5]:
# Columns routed through the categorical (impute + one-hot) branch.
# Order matters: it fixes the column order of the feature matrix.
categorical_features = [
    'GenderID', 'FromDiversityJobFairID', 'State', 'CitizenDesc',
    'HispanicLatino', 'RaceDesc', 'Department', 'ManagerName',
    'RecruitmentSource', 'HireYear', 'HireMonth',
]

# Columns routed through the numeric (impute + scale) branch.
numeric_features = [
    'Salary', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
    'DaysLateLast30', 'Absences', 'Age', 'NumberOfColleagues',
]

# Target column the classifier is trained to predict.
label = 'PerfScoreID'

In [None]:
# Sanity check: (rows, columns) of the frame restricted to the model inputs.
model_input_columns = categorical_features + numeric_features
df[model_input_columns].shape

### Split the Data

In [7]:
# Feature matrix (categorical columns first, then numeric) and target vector
feature_columns = categorical_features + numeric_features
X, y = df[feature_columns], df[label]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE: the split is not stratified (stratify=y is left disabled).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Encode and Scale 

In [8]:
# Create a ColumnTransformer for preprocessing
#   - numeric columns: mean imputation followed by standardization
#   - categorical columns: most-frequent imputation followed by one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # we don't have missing numerical values
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': a category that appears only in a test
            # row or a validation fold is encoded as all zeros instead of
            # raising at transform time (the default is to raise).
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

## MLR Modeling

In [10]:
# Base model: shared preprocessing followed by an L-BFGS logistic regression.
# multi_class is not passed explicitly (the 'multinomial' setting stays disabled).
base_classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', base_classifier),
])

### No Optimization  (Baseline Model)

In [None]:
# Fit the model without optimization (the pipeline's preset hyperparameters)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate baseline performance on the held-out test split.
# Precision, recall and F1 use the 'weighted' average across classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Label the output as the baseline run so it cannot be mistaken for the
# grid-search results printed further down.
print(f'Baseline Accuracy: {accuracy:.4f}')
print(f'Baseline Precision: {precision:.4f}')
print(f'Baseline Recall: {recall:.4f}')
print(f'Baseline F1 Score: {f1:.4f}')


### Grid Search Optimization

In [None]:
# Exhaustive search over three regularization strengths (L2 penalty only)
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l2'],
}

# 5-fold cross-validation on the training split, selecting by accuracy
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Predict on the held-out test split with the search's refitted best pipeline
y_pred_grid = grid_search.predict(X_test)

# Test-set metrics (precision/recall/F1 use the 'weighted' average)
accuracy_grid = accuracy_score(y_test, y_pred_grid)
precision_grid = precision_score(y_test, y_pred_grid, average='weighted')
recall_grid = recall_score(y_test, y_pred_grid, average='weighted')
f1_grid = f1_score(y_test, y_pred_grid, average='weighted')

# Report
print("Grid Search Optimization:")
print(f'Best Parameters: {grid_search.best_params_}')
print(f"Classification Report:\n{classification_report(y_test, y_pred_grid)}\n")
print(f'Grid Search Accuracy: {accuracy_grid:.4f}')
print(f'Grid Search Precision: {precision_grid:.4f}')
print(f'Grid Search Recall: {recall_grid:.4f}')
print(f'Grid Search F1 Score: {f1_grid:.4f}')


### Random Search Optimization

In [None]:
# C is drawn log-uniformly from [1e-3, 1e3]; the penalty is fixed to L2
param_dist = {
    'classifier__C': loguniform(1e-3, 1e3),
    'classifier__penalty': ['l2'],
}

# 50 sampled candidates, each scored by 5-fold CV accuracy; seeded for repeatability
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Predict on the held-out test split with the search's refitted best pipeline
y_pred_random = random_search.predict(X_test)

# Test-set metrics (precision/recall/F1 use the 'weighted' average)
accuracy_random = accuracy_score(y_test, y_pred_random)
precision_random = precision_score(y_test, y_pred_random, average='weighted')
recall_random = recall_score(y_test, y_pred_random, average='weighted')
f1_random = f1_score(y_test, y_pred_random, average='weighted')

# Report
print("Random Search Optimization:")
print(f'Best Parameters: {random_search.best_params_}')
print(f"Classification Report:\n{classification_report(y_test, y_pred_random)}\n")
print(f'Random Search Accuracy: {accuracy_random:.4f}')
print(f'Random Search Precision: {precision_random:.4f}')
print(f'Random Search Recall: {recall_random:.4f}')
print(f'Random Search F1 Score: {f1_random:.4f}')


### Bayesian Optimization (using `skopt`)

In [None]:
# Define the parameter space for Bayesian optimization.
# 'classifier__multi_class' is intentionally not part of the space: the
# parameter is deprecated in recent scikit-learn releases, and with the lbfgs
# solver a multi-class target is already fitted as a multinomial model by
# default (the base pipeline leaves it unset as well).
param_space = {
    'classifier__C': (1e-6, 1e+6, 'log-uniform'),
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs'],
    'classifier__max_iter': (100, 3000),
}

# Define the cross-validation strategy (shuffled, seeded stratified 5-fold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Set up the Bayesian search
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    n_iter=32,  # Number of parameter settings that are sampled
    cv=cv,
    n_jobs=-1,
    scoring='accuracy',  # Metric to optimize
    random_state=42
)

# Fit the Bayesian search on the training split only
bayes_search.fit(X_train, y_train)

# Make predictions with the best model
y_pred_bayes = bayes_search.best_estimator_.predict(X_test)

# Evaluate the Bayesian optimized model on the held-out test split
bayes_accuracy = accuracy_score(y_test, y_pred_bayes)
bayes_precision = precision_score(y_test, y_pred_bayes, average='weighted')
recall_kwargs_note = None  # placeholder removed below; see metrics
bayes_recall = recall_score(y_test, y_pred_bayes, average='weighted')
bayes_f1 = f1_score(y_test, y_pred_bayes, average='weighted')
del recall_kwargs_note

# Report the selected hyperparameters too, matching the grid/random search cells
print("Bayesian Optimization:")
print(f'Best Parameters: {bayes_search.best_params_}')
print(f'Bayesian Optimization Accuracy: {bayes_accuracy:.4f}')
print(f'Bayesian Optimization Precision: {bayes_precision:.4f}')
print(f'Bayesian Optimization Recall: {bayes_recall:.4f}')
print(f'Bayesian Optimization F1 Score: {bayes_f1:.4f}')


### Optuna

In [None]:
import optuna
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial for the logistic-regression pipeline.

    Samples ``C`` (on a log scale) and ``max_iter``, wraps a classifier with
    those values in a pipeline around the shared ``preprocessor``, and returns
    the pipeline's mean 5-fold cross-validated accuracy on the training split.

    The test split is deliberately not used here: scoring trials on ``X_test``
    would tune the hyperparameters against the same data that is later used to
    report performance, which makes the reported metrics optimistic.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        Mean cross-validated accuracy as a float.
    """
    # Local import so the function does not depend on the notebook's import cell
    from sklearn.model_selection import cross_val_score

    # Suggest values for hyperparameters
    # (suggest_float with log=True replaces the deprecated suggest_loguniform)
    C = trial.suggest_float('C', 1e-6, 1e6, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 3000)

    # multi_class is left at its default: the parameter is deprecated in recent
    # scikit-learn releases, and lbfgs already fits a multinomial model for a
    # multi-class target.
    classifier = LogisticRegression(
        C=C,
        max_iter=max_iter,
        solver='lbfgs',
        random_state=42
    )

    # Create the candidate pipeline for this trial
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    # cross_val_score fits clones of the pipeline, so the shared preprocessor
    # object is not refitted in place by the trials.
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='accuracy'
    )
    return float(fold_scores.mean())

# Create the study and optimize (50 trials, maximizing the objective's score)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best parameters and best score
print(f'Best parameters: {study.best_params}')
print(f'Best accuracy: {study.best_value}')

# Retrain the model with the best parameters.
# multi_class is left at its default: the parameter is deprecated in recent
# scikit-learn releases, and lbfgs already fits a multinomial model for a
# multi-class target (the base pipeline leaves it unset as well).
best_params = study.best_params
best_classifier = LogisticRegression(
    C=best_params['C'],
    max_iter=best_params['max_iter'],
    solver='lbfgs',
    random_state=42
)
best_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_classifier)
])
best_model.fit(X_train, y_train)
y_pred_optuna = best_model.predict(X_test)

# Evaluate the Optuna optimized model on the held-out test split
optuna_accuracy = accuracy_score(y_test, y_pred_optuna)
optuna_precision = precision_score(y_test, y_pred_optuna, average='weighted')
optuna_recall = recall_score(y_test, y_pred_optuna, average='weighted')
optuna_f1 = f1_score(y_test, y_pred_optuna, average='weighted')

print(f'Optuna Optimization Accuracy: {optuna_accuracy:.4f}')
print(f'Optuna Optimization Precision: {optuna_precision:.4f}')
print(f'Optuna Optimization Recall: {optuna_recall:.4f}')
print(f'Optuna Optimization F1 Score: {optuna_f1:.4f}')


## Results

In [None]:
# Side-by-side test-set metrics for every approach evaluated above.
# The Bayesian (skopt) run is included so that all five methods are compared;
# each list must keep the same order as 'Method'.
results = pd.DataFrame({
    'Method': ['Baseline', 'Grid Search', 'Random Search', 'Bayesian Optimization', 'Optuna'],
    'Accuracy': [accuracy, accuracy_grid, accuracy_random, bayes_accuracy, optuna_accuracy],
    'Precision': [precision, precision_grid, precision_random, bayes_precision, optuna_precision],
    'Recall': [recall, recall_grid, recall_random, bayes_recall, optuna_recall],
    'F1 Score': [f1, f1_grid, f1_random, bayes_f1, optuna_f1]
})

print(results)