# Modeling

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
# Remove the column limit so DataFrame displays show every column.
pd.set_option("display.max_columns", None)

In [None]:
# Relative path to the processed HR dataset.
file_path = '../data/processed/HRDataset_p_v4.csv'
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(file_path, index_col=0)

# Preview the first rows of the loaded data.
df.head()

In [None]:
# Columns routed to the categorical branch of the preprocessing step
# (most-frequent imputation followed by one-hot encoding).
categorical_features = [
    'GenderID', 'FromDiversityJobFairID', 'State', 'CitizenDesc',
    'HispanicLatino', 'RaceDesc', 'Department', 'ManagerName',
    'RecruitmentSource', 'HireYear', 'HireMonth',
]

# Columns routed to the numeric branch of the preprocessing step
# (mean imputation followed by standard scaling).
numeric_features = [
    'Salary', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
    'DaysLateLast30', 'Absences', 'Age', 'NumberOfColleagues',
]

# Target column predicted by the classifiers.
label = 'PerfScoreID'

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

### Split the Data

In [None]:
# Feature matrix: categorical columns first, then numeric ones; target is the label column.
feature_columns = categorical_features + numeric_features
X, y = df[feature_columns], df[label]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so class proportions can differ
# between the training and test sets - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Encode and Scale

In [None]:
# Numeric branch: mean imputation, then standardisation. The original author
# notes that the numeric columns have no missing values, so the imputer acts
# only as a safeguard.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fitting are ignored at transform time
# instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; columns not listed are dropped
# (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

## Model Training

In [None]:
from sklearn.base import clone

# One preprocessing + classifier pipeline per model, keyed by model name.
#
# Each pipeline gets its own unfitted copy of `preprocessor`. Pipeline.fit
# fits its steps in place, so sharing one transformer instance would mean
# that fitting one pipeline (or calling `preprocessor.fit_transform` directly
# in a later cell) silently refits the transformer inside every other
# pipeline that holds the same object.
classifiers = {
    'RandomForest': Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        # class_weight='balanced' reweights classes inversely to their frequency.
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ]),
    'SVC': Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', SVC(random_state=42)),
    ]),
}

In [None]:
from scipy import stats

# Exhaustive grids for GridSearchCV. Keys use the `classifier__` prefix so the
# values are routed to the 'classifier' step of each pipeline.
param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [100, 200, 300, 500, 700],  # Different numbers of trees
        'classifier__max_depth': [10, 20, 30, None],            # Different tree depths
        'classifier__min_samples_split': [2, 5, 10],            # Split criteria
        'classifier__min_samples_leaf': [1, 2, 4],              # Minimum samples per leaf
        'classifier__bootstrap': [True, False],                 # Bootstrap or not
    },
    'SVC': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto'],
    },
}

# Search space for RandomizedSearchCV.
#
# The ranges are expressed as scipy.stats distributions rather than lists
# pre-drawn with np.random: pre-drawn lists depend on the unseeded global
# NumPy state (so the search space changed on every run) and contain
# duplicates. With distributions, all sampling is driven by the search
# object's own `random_state`, which makes the search reproducible.
#   stats.randint(low, high)  -> integers in [low, high)
#   stats.uniform(loc, scale) -> floats in [loc, loc + scale]
param_distributions = {
    'RandomForest': {
        'classifier__n_estimators': stats.randint(50, 600),      # Number of trees
        'classifier__max_depth': stats.randint(10, 40),          # Tree depth
        'classifier__min_samples_split': stats.randint(2, 10),   # Split criteria
        'classifier__min_samples_leaf': stats.randint(1, 5),     # Minimum samples per leaf
        'classifier__bootstrap': [True, False],                  # Bootstrap or not
    },
    'SVC': {
        'classifier__C': stats.uniform(0.01, 10 - 0.01),         # C in [0.01, 10]
        'classifier__kernel': ['linear', 'rbf'],                 # Different kernels
        'classifier__gamma': stats.uniform(0.001, 1 - 0.001),    # gamma in [0.001, 1]
        'classifier__tol': stats.uniform(1e-5, 1e-3 - 1e-5),     # tol in [1e-5, 1e-3]
    },
}



In [None]:
# Per-class sample counts for each split. np.bincount returns one count per
# integer value from 0 up to the largest label present.
for split_name, split_labels in (("training", y_train), ("test", y_test)):
    print(f"Distribution in {split_name} set:")
    print(np.bincount(split_labels))


### No Tuning

In [None]:
# Baseline: fit every pipeline with its default hyperparameters and report
# test-set accuracy plus the per-class classification report.
for name, model in classifiers.items():
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = np.mean(y_pred == y_test)
    # zero_division=1 reports 1 (instead of warning) for metrics with a zero denominator.
    report = classification_report(y_test, y_pred, zero_division=1)

    print(f"{name} Accuracy: {accuracy}")
    print(f"{name} Classification Report:")
    print(report)

### Grid Search

In [None]:
# Grid search optimisation: exhaustively evaluate each model's parameter grid
# with 2-fold cross-validation, then score the refitted best pipeline on the
# held-out test set.
for name, model in classifiers.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='accuracy',
        cv=2,
        verbose=1,
        n_jobs=-1,
        error_score='raise',  # Fail loudly if any parameter combination cannot be fitted
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the pipeline refitted on the full training set
    # with the best parameter combination.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Plot the confusion matrix as a heatmap. The labels (sorted union of the
    # true and predicted classes, i.e. what confusion_matrix uses by default)
    # are passed explicitly so the axis ticks show the real class values
    # instead of positional indices 0..k-1.
    class_labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    ax = sns.heatmap(
        cm,
        annot=True,
        cmap="Blues",
        fmt="d",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    ax.set_title(name + " Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    plt.show()

    print(f"{name} Accuracy: {np.mean(y_pred == y_test)}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=1))
    # Report the selected hyperparameters themselves, not the estimator repr.
    print('Best hyperparameters are: ' + str(grid_search.best_params_))

### Random Search

In [None]:


# One RandomizedSearchCV per model: 20 sampled candidates, 3-fold CV, fixed seed.
random_search_rf, random_search_svc = (
    RandomizedSearchCV(
        estimator=classifiers[model_key],
        param_distributions=param_distributions[model_key],
        n_iter=20,
        cv=3,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )
    for model_key in ('RandomForest', 'SVC')
)
searches = (('RandomForest', random_search_rf), ('SVC', random_search_svc))

# Run both searches on the training data.
for model_key, search in searches:
    print(f"Training {model_key} with RandomizedSearchCV...")
    search.fit(X_train, y_train)

# Report the winning parameter sets.
for model_key, search in searches:
    print(f"Best parameters for {model_key}: {search.best_params_}")

# Predicting through the search object uses its refitted best estimator.
rf_predictions = random_search_rf.predict(X_test)
svc_predictions = random_search_svc.predict(X_test)

# Test-set evaluation for each tuned model.
for model_key, predictions in (('RandomForest', rf_predictions), ('SVC', svc_predictions)):
    print(f"{model_key} Classification Report (RandomizedSearch):")
    print(classification_report(y_test, predictions, zero_division=1))
    print(f"Accuracy for {model_key}: {accuracy_score(y_test, predictions)}")


In [None]:
import optuna  # Used below but not imported in the notebook's import cell
from sklearn.model_selection import KFold, cross_val_score

# Initialize KFold cross-validation (fixed seed -> identical folds for every trial)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)


# Define the objective function for Optuna
def objective(trial, model_name):
    """Return the mean 5-fold CV accuracy for one Optuna trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        model_name: 'RandomForest' to tune a RandomForestClassifier; any
            other value tunes an SVC.

    Returns:
        Mean accuracy over the folds of `kfold` on the training data.
    """
    if model_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 700, step=50)
        max_depth = trial.suggest_int('max_depth', 10, 40, step=5)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        # NOTE(review): unlike the pipelines in `classifiers`, this forest does
        # not set class_weight='balanced' - confirm whether that is intended.
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )
    else:  # SVC
        C = trial.suggest_float('C', 0.1, 10, log=True)
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        tol = trial.suggest_float('tol', 1e-5, 1e-3, log=True)

        model = SVC(C=C, gamma=gamma, kernel=kernel, tol=tol, random_state=42)

    # Perform k-fold cross-validation. cross_val_score clones the whole
    # pipeline for every fold, so preprocessing is fitted on each training
    # fold only and the shared global `preprocessor` is never refitted on a
    # fold subset (which would silently alter pipelines that were fitted earlier).
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='accuracy',
        cv=kfold,
        error_score='raise'  # Propagate fit errors, as the manual loop did
    )

    return np.mean(scores)


# List to store results
results = []

# Optimize hyperparameters for both models
for model_name in ['RandomForest', 'SVC']:
    print(f"Starting Optuna optimization for {model_name}...")
    study = optuna.create_study(direction='maximize')
    # The lambda is invoked only inside optimize(), while `model_name` still
    # holds the current loop value.
    study.optimize(lambda trial: objective(trial, model_name), n_trials=30)

    # Print the best parameters and best score
    print(f"Best hyperparameters for {model_name}: {study.best_params}")
    print(f"Best cross-validated score for {model_name}: {study.best_value}")

    # Train the best model on the entire training set and evaluate
    if model_name == 'RandomForest':
        best_model = RandomForestClassifier(**study.best_params, random_state=42)
    else:
        best_model = SVC(**study.best_params, random_state=42)

    # Fit the model on the full training data
    X_train_processed = preprocessor.fit_transform(X_train)
    best_model.fit(X_train_processed, y_train)

    # Predict on the test set
    X_test_processed = preprocessor.transform(X_test)
    y_pred = best_model.predict(X_test_processed)

    # Plot the confusion matrix as a heatmap. Labels are the sorted union of
    # true and predicted classes (confusion_matrix's default), passed
    # explicitly so the ticks show class values rather than indices 0..k-1.
    class_labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    ax = sns.heatmap(
        cm,
        annot=True,
        cmap="Blues",
        fmt="d",
        xticklabels=class_labels,
        yticklabels=class_labels
    )
    ax.set_title(f"{model_name} Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    plt.show()

    # Print metrics
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, zero_division=1)

    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Classification Report:\n{class_report}")

    # Store the results
    results.append({
        'Model': model_name,
        'Best Hyperparameters': study.best_params,
        'Accuracy': accuracy,
        'Classification Report': class_report
    })

# Convert results to a DataFrame for display
df_results = pd.DataFrame(results)
print(df_results[['Model', 'Best Hyperparameters', 'Accuracy']])
