In [24]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
)
import matplotlib.pyplot as plt


In [25]:
df = pd.read_csv("../cleaned_hr_data.csv")
df.head()

print(df.dtypes)
#getdummies() for cat Variables
pd.get_dummies(df['BusinessTravel'])
# Dummies als Integer erstellen (statt Boolean)
df = pd.concat([df, pd.get_dummies(df['BusinessTravel'], prefix='BusinessTravel').astype(int)], axis=1)
df.pop('BusinessTravel')

df = pd.concat([df, pd.get_dummies(df['Department'], prefix='Department').astype(int)], axis=1)
df.pop('Department')

df = pd.concat([df, pd.get_dummies(df['JobRole'], prefix='JobRole').astype(int)], axis=1)
df.pop('JobRole')

df = pd.concat([df, pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus').astype(int)], axis=1)
df.pop('MaritalStatus')
print(df.dtypes)

print(df.shape)
print(df.columns)


Age                          int64
Attrition                    int64
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                       int64
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
OverTime                     int64
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany      


**Feature selection according to Data_Analysis.ipynb:**

selected_features = [
    "OverTime",
    "TotalWorkingYears",
    "JobLevel",
    "YearsInCurrentRole",
    "MonthlyIncome",
    "Age",
    "YearsWithCurrManager",
    "YearsAtCompany",
    "StockOptionLevel",
    "JobInvolvement",
    "JobSatisfaction",
    "WorkLifeBalance",
    "EnvironmentSatisfaction",
    "RelationshipSatisfaction",
    "BusinessTravel",
    "MaritalStatus",
    "Gender"
]

In [57]:
selected_features = [
    "OverTime",
    "TotalWorkingYears",
    "JobLevel",
    "YearsInCurrentRole",
    "MonthlyIncome",
    "Age",
    "YearsWithCurrManager",
    "YearsAtCompany",
    "StockOptionLevel",
    "JobInvolvement",
    "JobSatisfaction",
    "WorkLifeBalance",
    "EnvironmentSatisfaction",
    "RelationshipSatisfaction",
    "BusinessTravel_frequently",  # Note: corrected spelling
    "BusinessTravel_rarely",
    "BusinessTravel_no",
    "MaritalStatus_Divorced",
    "MaritalStatus_Married",
    "MaritalStatus_Single",
    "Gender"
]

#df_selected = df[selected_features].copy()

y = df['Attrition']
X = df[selected_features].copy()
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=10000, solver='lbfgs', multi_class='multinomial')
print(y_train.shape)
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)

train_score = log_reg.score(X_train_scaled, y_train)
test_score = log_reg.score(X_test_scaled, y_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

classes = log_reg.classes_
# Bei binÃ¤rer Klassifikation: coef_ hat Shape (1, n_features)
coef_df = pd.DataFrame(
    log_reg.coef_,
    columns=X.columns,
    index=[f"Coefficients_Class_{classes[1]}"]  # Nur eine Zeile
)

print("\n=== KOEFFIZIENTEN ===")
print(coef_df.T.sort_values(by=f"Coefficients_Class_{classes[1]}", key=abs, ascending=False))

# Odds Ratios
odds_ratio_df = pd.DataFrame(
    np.exp(log_reg.coef_),
    columns=X.columns,
    index=[f"Odds_Ratio_Class_{classes[1]}"]
)

print("\n=== ODDS RATIOS ===")
print(odds_ratio_df.T.sort_values(by=f"Odds_Ratio_Class_{classes[1]}", ascending=False))





(1470,)
(1176,)
0.8843537414965986
[[248   7]
 [ 27  12]]
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       255
           1       0.63      0.31      0.41        39

    accuracy                           0.88       294
   macro avg       0.77      0.64      0.67       294
weighted avg       0.87      0.88      0.87       294


=== KOEFFIZIENTEN ===
                           Coefficients_Class_1
OverTime                               0.413798
YearsAtCompany                         0.322003
YearsInCurrentRole                    -0.278337
JobSatisfaction                       -0.215866
YearsWithCurrManager                  -0.190283
JobInvolvement                        -0.180138
EnvironmentSatisfaction               -0.178979
MaritalStatus_Single                   0.161381
BusinessTravel_no                     -0.137298
BusinessTravel_frequently              0.132160
TotalWorkingYears                     -0.127916
MaritalStatus_Di

EVALUATE

In [63]:
print(df['Attrition'].value_counts())
print(f'Simple Model, which predicts 1 every Time has an Accuracy of: {237/(1233+237)}')
print(f'Simple Model, which predicts 0 every Time has an Accuracy of: {1233/(1233+237)}')
print(f'log_reg accuracy is 0,884 > 0,839. In Fact, the log_reg detects 97% of employees with risk of Attrition where the baseline model would detect 0!')

print('Strongest positive predictors (encourage Attrition): Overtime (middle effect), YearsAtCompany(middle effect) ')

print('Strongest negative predictors (reduces risk of Attrition): YearsInCurrentRole (middle/low effect), JobSatisfaction(middle/low effect)')

print('The odds for Attrition is 51% higher of the employee does OverTime (OverTime =yes)')
print('The odds for Attrition is 37% higher depending on the years at the Company')

Attrition
0    1233
1     237
Name: count, dtype: int64
Simple Model, which predicts 1 every Time has an Accuracy of: 0.16122448979591836
Simple Model, which predicts 0 every Time has an Accuracy of: 0.8387755102040816
log_reg accuracy is 0,884 > 0,839. In Fact, the log_reg detects 97% of employees with risk of Attrition where the baseline model would detect 0!
Strongest positive predictors (encourage Attrition): Overtime (middle effect), YearsAtCompany(middle effect) 
Strongest negative predictors (reduces risk of Attrition): YearsInCurrentRole (middle/low effect), JobSatisfaction(middle/low effect)
The odds for Attrition is 51% higher of the employee does OverTime (OverTime =yes)
The odds for Attrition is 37% higher depending on the years at the Company
