# Neural Network

# Modeling


## Import Libraries

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from scipy.stats import loguniform


In [None]:
# Load the processed HR dataset; the first CSV column becomes the index.
# Forward slashes are used because the backslash-separated literal contained
# invalid escape sequences ('\\d', '\\p', '\\H') and only resolved on Windows;
# this form works on Windows and POSIX alike.
df = pd.read_csv('../data/processed/HRDataset_p_v4.csv', index_col=0)
df.head()

In [3]:
# Columns handled as categories (most-frequent imputation followed by one-hot
# encoding in the preprocessing step further down).
categorical_features = [
    'GenderID', 'FromDiversityJobFairID', 'State', 'CitizenDesc',
    'HispanicLatino', 'RaceDesc', 'Department', 'ManagerName',
    'RecruitmentSource', 'HireYear', 'HireMonth',
]
# Columns handled as numbers (mean imputation followed by standardization).
numeric_features = [
    'Salary', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
    'DaysLateLast30', 'Absences', 'Age', 'NumberOfColleagues',
]
# Target column used as `y` by every model in this notebook.
label = 'PerfScoreID'

In [None]:
# (rows, columns) of the frame restricted to the selected feature columns.
df.loc[:, categorical_features + numeric_features].shape

### Split the Data

In [131]:
# Feature matrix (categorical columns first, then numeric) and target vector.
feature_columns = categorical_features + numeric_features
X, y = df[feature_columns], df[label]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Encode and Scale 

In [132]:
# Create a ColumnTransformer for preprocessing:
#   * numeric columns     -> mean imputation, then standardization
#   * categorical columns -> most-frequent imputation, then one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # we don't have missing numerical values
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': a category that was not present when the
            # encoder was fitted (possible for any level that lands only in the
            # test split or in a validation fold) is encoded as all zeros
            # instead of raising a ValueError at transform time, which is what
            # the default handle_unknown='error' does.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

## Neural Network

In [133]:
# Baseline pipeline: the shared preprocessing step followed by an MLP for which
# only the iteration cap and the random seed are set explicitly.
baseline_classifier = MLPClassifier(max_iter=1000, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', baseline_classifier),
])

### Custom Hyperparameters

In [None]:
# Pipeline with explicitly chosen MLP hyperparameters (one hidden layer of 100
# units, ReLU activation, alpha=0.0001), trained on the training split.
custom_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(hidden_layer_sizes=(100,), activation='relu', alpha=0.0001,
                                 max_iter=1000, random_state=42))
])

custom_model.fit(X_train, y_train)
y_pred_custom = custom_model.predict(X_test)

# Evaluate the custom model on the held-out test set; precision, recall and F1
# are averaged over classes weighted by class support.
accuracy_custom = accuracy_score(y_test, y_pred_custom)
precision_custom = precision_score(y_test, y_pred_custom, average='weighted')
recall_custom = recall_score(y_test, y_pred_custom, average='weighted')
f1_custom = f1_score(y_test, y_pred_custom, average='weighted')

# Labelled "Custom" (previously "Baseline") so this output cannot be confused
# with the report of the un-tuned `model` pipeline, which prints "Baseline ...".
print(f'Custom Accuracy: {accuracy_custom:.4f}')
print(f'Custom Precision: {precision_custom:.4f}')
print(f'Custom Recall: {recall_custom:.4f}')
print(f'Custom F1 Score: {f1_custom:.4f}')

### No Optimization

In [None]:
# Train the un-tuned pipeline and predict the held-out test rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Baseline test-set metrics; precision, recall and F1 use support-weighted
# averaging over the classes.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

print(f"Classification Report:\n{classification_report(y_test, y_pred)}\n")
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'Baseline {metric_name}: {metric_value:.4f}')


### Grid Search

In [None]:
# Grid Search parameters.
# 'classifier__learning_rate' is deliberately not part of the grid:
# scikit-learn only uses that setting when solver='sgd', and this pipeline
# keeps the default solver, so searching over 'constant'/'adaptive' produced
# pairs of identical fits and merely doubled the number of models trained.
param_grid = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'classifier__activation': ['relu'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
}

# 5-fold stratified, shuffled and seeded splitter, reused by the later searches.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by cross-validated accuracy on the training split;
# calling predict on the fitted search object uses the refitted best estimator.
grid_search = GridSearchCV(model, param_grid, cv=stratified_kfold, scoring='accuracy')
grid_search.fit(X_train, y_train)
y_pred_grid = grid_search.predict(X_test)

# Evaluate Grid Search performance on the held-out test set
# (precision, recall and F1 are support-weighted averages).
accuracy_grid = accuracy_score(y_test, y_pred_grid)
precision_grid = precision_score(y_test, y_pred_grid, average='weighted')
recall_grid = recall_score(y_test, y_pred_grid, average='weighted')
f1_grid = f1_score(y_test, y_pred_grid, average='weighted')

print(f'Grid Search Accuracy: {accuracy_grid:.4f}')
print(f'Grid Search Precision: {precision_grid:.4f}')
print(f'Grid Search Recall: {recall_grid:.4f}')
print(f'Grid Search F1 Score: {f1_grid:.4f}')


In [None]:
# Show the winning grid-search configuration and its per-class test metrics.
print("Grid Search Optimization:")
print(f'Best Parameters: {grid_search.best_params_}')
grid_report = classification_report(y_test, y_pred_grid)
print(f"Classification Report:\n{grid_report}\n")

### Random Search

In [None]:
# Random Search parameters: list entries are sampled as discrete choices,
# alpha is drawn from a log-uniform distribution over [1e-4, 1e-1].
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; the pipeline does not set a solver, so this entry may have no
# effect on the fitted models -- confirm whether it is intended.
param_dist = dict(
    classifier__hidden_layer_sizes=[(50,), (100,), (50, 50), (32, 8)],
    classifier__activation=['relu', 'tanh'],
    classifier__alpha=loguniform(1e-4, 1e-1),
    classifier__learning_rate=['constant', 'adaptive'],
)

# 50 sampled configurations, each scored by cross-validated accuracy on the
# training split with the shared stratified splitter.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=stratified_kfold,
    random_state=42,
)
random_search.fit(X_train, y_train)
y_pred_random = random_search.predict(X_test)

# Test-set metrics of the refitted best estimator (support-weighted averages).
weighted = {'average': 'weighted'}
accuracy_random = accuracy_score(y_test, y_pred_random)
precision_random = precision_score(y_test, y_pred_random, **weighted)
recall_random = recall_score(y_test, y_pred_random, **weighted)
f1_random = f1_score(y_test, y_pred_random, **weighted)

for metric_name, metric_value in (
    ('Accuracy', accuracy_random),
    ('Precision', precision_random),
    ('Recall', recall_random),
    ('F1 Score', f1_random),
):
    print(f'Random Search {metric_name}: {metric_value:.4f}')


In [None]:
# Show the winning random-search configuration and its per-class test metrics.
print("Random Search Optimization:")
print(f'Best Parameters: {random_search.best_params_}')
random_report = classification_report(y_test, y_pred_random)
print(f"Classification Report:\n{random_report}\n")

### Optuna

In [None]:
import optuna

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Return the mean cross-validated accuracy of one sampled MLP setup.

    The score is computed with 5-fold stratified cross-validation on the
    training split only. The held-out test set is never touched here, so it
    does not leak into the hyperparameter choice and the final test metrics
    stay comparable with the grid and random searches, which are also
    selected by cross-validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: Mean accuracy over the validation folds (the study maximizes it).
    """
    # Function-scope import: the notebook's import cell does not provide it.
    from sklearn.model_selection import cross_val_score

    # Suggest values for hyperparameters
    hidden_layer_sizes = trial.suggest_categorical(
        'hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 50), (100, 100)]
    )
    activation = trial.suggest_categorical('activation', ['tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    alpha = trial.suggest_float('alpha', 1e-6, 1e-2, log=True)
    learning_rate = trial.suggest_categorical('learning_rate', ['constant', 'adaptive'])
    max_iter = trial.suggest_int('max_iter', 100, 3000)

    # Build the MLP with the suggested parameters
    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=max_iter,
        random_state=42
    )

    # Candidate pipeline for this trial; cross_val_score fits clones of it,
    # so the shared preprocessor object is not refitted in place.
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    # Same splitter configuration as the grid/random searches.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv, scoring='accuracy')
    return float(fold_scores.mean())

# Create the study and optimize.
# The sampler is seeded so repeated runs explore the same trial sequence, in
# line with the fixed random_state=42 used by the split, the models and the
# other searches in this notebook (TPE is also Optuna's default sampler).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Print the best parameters and best score
print(f'Best parameters: {study.best_params}')
print(f'Best accuracy: {study.best_value}')

# Retrain the model with the best parameters on the full training split.
# The keys read here are the parameter names registered in `objective`.
best_params = study.best_params
best_classifier = MLPClassifier(
    hidden_layer_sizes=best_params['hidden_layer_sizes'],
    activation=best_params['activation'],
    solver=best_params['solver'],
    alpha=best_params['alpha'],
    learning_rate=best_params['learning_rate'],
    max_iter=best_params['max_iter'],
    random_state=42
)
best_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_classifier)
])
best_model.fit(X_train, y_train)
y_pred_optuna = best_model.predict(X_test)

# Evaluate the Optuna optimized model on the held-out test set
# (precision, recall and F1 are support-weighted averages).
optuna_accuracy = accuracy_score(y_test, y_pred_optuna)
optuna_precision = precision_score(y_test, y_pred_optuna, average='weighted')
optuna_recall = recall_score(y_test, y_pred_optuna, average='weighted')
optuna_f1 = f1_score(y_test, y_pred_optuna, average='weighted')

print(f'Optuna Optimization Accuracy: {optuna_accuracy:.4f}')
print(f'Optuna Optimization Precision: {optuna_precision:.4f}')
print(f'Optuna Optimization Recall: {optuna_recall:.4f}')
print(f'Optuna Optimization F1 Score: {optuna_f1:.4f}')


## Results Summary

In [None]:
# One row per tuning approach, holding the test-set metrics computed above.
summary_rows = [
    ('Baseline', accuracy, precision, recall, f1),
    ('Grid Search', accuracy_grid, precision_grid, recall_grid, f1_grid),
    ('Random Search', accuracy_random, precision_random, recall_random, f1_random),
    ('Optuna', optuna_accuracy, optuna_precision, optuna_recall, optuna_f1),
]
results = pd.DataFrame(
    summary_rows,
    columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print(results)
