In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [5]:
# Read the Kaggle employee-attrition CSVs (train first, then test) into
# DataFrames.
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/employee-attrition-dataset/train.csv',
        '/kaggle/input/employee-attrition-dataset/test.csv',
    )
)

In [9]:
# Show (rows, columns) for the train frame and then the test frame, one per line.
print(training_data.shape, testing_data.shape, sep='\n')

(59598, 24)
(14900, 24)


In [10]:
# Display the column labels of the training frame (cell output).
training_data.columns


Index(['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role',
       'Monthly Income', 'Work-Life Balance', 'Job Satisfaction',
       'Performance Rating', 'Number of Promotions', 'Overtime',
       'Distance from Home', 'Education Level', 'Marital Status',
       'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure',
       'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities',
       'Company Reputation', 'Employee Recognition', 'Attrition'],
      dtype='object')

In [11]:
# Count missing values per column in the training frame; the resulting Series
# is shown as the cell output.
training_data.isnull().sum()

Employee ID                 0
Age                         0
Gender                      0
Years at Company            0
Job Role                    0
Monthly Income              0
Work-Life Balance           0
Job Satisfaction            0
Performance Rating          0
Number of Promotions        0
Overtime                    0
Distance from Home          0
Education Level             0
Marital Status              0
Number of Dependents        0
Job Level                   0
Company Size                0
Company Tenure              0
Remote Work                 0
Leadership Opportunities    0
Innovation Opportunities    0
Company Reputation          0
Employee Recognition        0
Attrition                   0
dtype: int64

In [12]:
# Pull the label column out as y, and build the feature frame X from every
# remaining column except the 'Employee ID' column.
y = training_data['Attrition']
X = training_data.drop(['Employee ID', 'Attrition'], axis=1)


In [13]:
# Identify categorical and numerical columns.
# Numeric dtypes form the numerical group; every other column is treated as
# categorical. The categorical list is derived as the complement of the
# numeric one (rather than include=['object']) so that each feature lands in
# exactly one list. A column whose dtype is neither object nor numeric
# (e.g. bool or category) would otherwise appear in neither list, and a
# ColumnTransformer left at its default remainder='drop' discards such
# columns without warning.
# When every column is object or numeric, both lists are the same as before.
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

In [14]:
# Preprocessing for numerical data: a single-step pipeline whose only stage,
# named 'scaler', is a StandardScaler.
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

# Preprocessing for categorical data: one-hot encode each column, dropping the
# first level of every feature (drop='first').
#
# handle_unknown='ignore' makes transform() tolerate a category that was not
# present when the encoder was fitted: instead of raising, the row gets all
# zeros for that feature and scikit-learn emits a warning. Because drop='first'
# is also set, that all-zero encoding coincides with the dropped baseline
# level. The encoding of the data the encoder is fitted on is unaffected.
# Combining drop with handle_unknown='ignore' requires scikit-learn >= 1.0.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

In [15]:
# Build one transformer that sends the numerical columns through
# numerical_pipeline ('num') and the categorical columns through
# categorical_pipeline ('cat').
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])


In [16]:
# Apply preprocessing to the features: fit the ColumnTransformer on X and
# return the transformed feature matrix in a single call.
# NOTE(review): X is the full training frame, and the train/validation split
# is performed later on X_processed, so the preprocessing statistics are
# fitted on rows that end up in the validation set. Consider splitting X first
# and fitting the preprocessor on the training portion only — confirm whether
# this matters for the reported validation metrics.
X_processed = preprocessor.fit_transform(X)


In [17]:
# Push the test features through the already-fitted preprocessor. Only
# transform() is called, so nothing is re-fitted on the test rows. The same
# two columns removed from the training features are removed here as well.
X_test_processed = preprocessor.transform(
    testing_data.drop(['Employee ID', 'Attrition'], axis=1)
)

In [18]:
# Hold out 20% of the preprocessed training rows (and their labels) as a
# validation set; random_state is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)


In [19]:
# Create the logistic regression classifier (it is fitted in a later cell).
# Only random_state is specified; all other hyperparameters are left at their
# scikit-learn defaults.
model = LogisticRegression(random_state=42)

In [20]:
# Fit the classifier on the training portion of the split. The call is left as
# a bare expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

In [21]:
# Predict class labels for the validation features; y_pred is compared against
# y_val by the metric cells that follow.
y_pred = model.predict(X_val)

In [23]:

# Fraction of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
# Bare expression so the notebook shows the value as the cell output.
accuracy

0.7469798657718121

In [27]:
# NOTE(review): notebook convention is to keep imports in the first import
# cell; this one is left here so the cell still runs on its own.
from sklearn.metrics import precision_score, recall_score, f1_score

# Score the validation predictions with 'Left' as the positive class, applying
# the three metric functions in order: precision, recall, F1.
precision, recall, f1 = (
    metric(y_val, y_pred, pos_label='Left')
    for metric in (precision_score, recall_score, f1_score)
)

# Report each metric on its own line, rounded to two decimals.
print(
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'F1-Score: {f1:.2f}',
    sep='\n',
)


Precision: 0.73
Recall: 0.74
F1-Score: 0.73
