In [None]:
# Using Logistic Regression Classifier for Employee Attrition Prediction

# Importing necessary libraries 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load data (using the engineered dataset)
train_df = pd.read_csv("data/processed/train_engineered.csv")
val_df = pd.read_csv("data/processed/validation_engineered.csv")

# Separate features and target variable
x_train = train_df.drop("Attrition", axis=1)
y_train = train_df["Attrition"]

x_val = val_df.drop("Attrition", axis=1)
y_val = val_df["Attrition"]


def _fit_and_evaluate(features_train, target_train, features_val, target_val,
                      random_state=42, max_iter=1000):
    """Fit a LogisticRegression on the training split and score it on validation.

    Parameters
    ----------
    features_train, target_train : training features and labels passed to ``fit``.
    features_val, target_val : validation features and labels used for scoring.
    random_state, max_iter : forwarded unchanged to ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(model, predictions, accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` is the text produced by
        ``classification_report``.
    """
    model = LogisticRegression(random_state=random_state, max_iter=max_iter)
    model.fit(features_train, target_train)

    predictions = model.predict(features_val)
    accuracy = accuracy_score(target_val, predictions)
    report = classification_report(target_val, predictions)
    return model, predictions, accuracy, report


def _print_results(heading, model, accuracy, report):
    """Print the heading, accuracy and classification report for one model.

    If the fitted model used every allowed iteration, an extra note is printed.
    scikit-learn only emits a generic ConvergenceWarning, which does not say
    which of the two models in this cell hit the limit; the note makes that
    visible next to the affected metrics.
    """
    print(heading)
    print(f"Accuracy: {100*accuracy:.2f}%")
    # n_iter_ holds the number of iterations the solver actually ran.
    if model.n_iter_.max() >= model.max_iter:
        print(
            f"NOTE: solver used all {model.max_iter} iterations "
            "(likely did not converge); interpret these metrics with caution."
        )
    print("Classification Report:\n", report)


### Logistic Regression WITHOUT Feature Scaling ###

# Train and evaluate on the raw (unscaled) features
(
    log_reg_no_scaling,
    predictions_no_scaling,
    accuracy_no_scaling,
    report_no_scaling,
) = _fit_and_evaluate(x_train, y_train, x_val, y_val)

_print_results(
    "Logistic Regression WITHOUT Feature Scaling",
    log_reg_no_scaling,
    accuracy_no_scaling,
    report_no_scaling,
)


### Logistic Regression WITH Feature Scaling ###

# Fit the scaler on the training features only, then apply the same
# transform to the validation features so validation statistics never
# influence the scaling parameters.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# Train and evaluate on the standardized features
(
    log_reg_scaled,
    predictions_scaled,
    accuracy_scaled,
    report_scaled,
) = _fit_and_evaluate(x_train_scaled, y_train, x_val_scaled, y_val)

_print_results(
    "\nLogistic Regression WITH Feature Scaling",
    log_reg_scaled,
    accuracy_scaled,
    report_scaled,
)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression WITHOUT Feature Scaling
Accuracy: 74.07%
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.77      0.76      5681
           1       0.74      0.71      0.72      5192

    accuracy                           0.74     10873
   macro avg       0.74      0.74      0.74     10873
weighted avg       0.74      0.74      0.74     10873


Logistic Regression WITH Feature Scaling
Accuracy: 74.95%
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.76      0.76      5681
           1       0.74      0.73      0.74      5192

    accuracy                           0.75     10873
   macro avg       0.75      0.75      0.75     10873
weighted avg       0.75      0.75      0.75     10873

