In [2]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [3]:
# Load data
df = pd.read_csv('clean_employee_data.csv')

In [4]:
# Encode Categorical Columns (turn text into numbers)
# use LabelEncoder for the target 'attrition' (Yes/No -> 1/0)
le = LabelEncoder()
df['attrition'] = le.fit_transform(df['attrition'])

# We use get_dummies for other text columns (BusinessTravel, Department, etc.)
df_encoded = pd.get_dummies(df, drop_first=True)

In [5]:
# Split data into X (Features) and y (Target)
X = df_encoded.drop('attrition', axis=1)
y = df_encoded['attrition']

In [6]:
# Apply SMOTE
# this generates synthetic data to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [7]:
# Verification
print("Original Class Distribution:")
print(y.value_counts())
print("\nNew SMOTE Class Distribution:")
print(y_resampled.value_counts())

Original Class Distribution:
attrition
0    1233
1     237
Name: count, dtype: int64

New SMOTE Class Distribution:
attrition
1    1233
0    1233
Name: count, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the balanced data (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train logistic regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log)

# Train random forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Compare results
print(f"Logistic Regression Accuracy: {acc_log:.2%}")
print(f"Random Forest Accuracy:     {acc_rf:.2%}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 79.76%
Random Forest Accuracy:     91.90%


In [9]:
import joblib

# Save the Random forest model (Beat logistic regression)
joblib.dump(rf, 'model.pkl')

print("Success! Model saved as 'model.pkl'")

Success! Model saved as 'model.pkl'
