# In [None]:
#  Employee Retention Prediction using Logistic Regression

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load the HR dataset; the path is resolved relative to the working directory.
df = pd.read_csv("HR_comma_sep.csv")
print(" Dataset loaded successfully!\n")
print(df.head())


# Exploratory Data Analysis (EDA)
print("\nBasic Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nCorrelation with Employee Retention (left):")
# Correlate numeric columns only. The frame also holds text columns (e.g.
# 'salary' with values such as 'low'/'medium'/'high'), and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently dropping them.
numeric_df = df.select_dtypes(include="number")
print(numeric_df.corr()['left'].sort_values(ascending=False))

# From EDA, key variables impacting retention usually include:
# - satisfaction_level
# - average_montly_hours
# - promotion_last_5years
# - salary


# Salary vs. attrition: one bar per salary level, ordered low -> high,
# with the bar height summarising the 'left' column for that level.
salary_levels = ['low', 'medium', 'high']
salary_fig, salary_ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=df,
    x='salary',
    y='left',
    order=salary_levels,
    palette='coolwarm',
    ax=salary_ax,
)
salary_ax.set_title("Impact of Salary on Employee Retention")
salary_ax.set_xlabel("Salary Level")
salary_ax.set_ylabel("Proportion who left")
plt.show()

# Department vs. attrition: mean of 'left' per value of the 'sales' column
# (which holds the department), sorted from highest to lowest leave rate.
leave_rate_by_dept = df.groupby('sales')['left'].mean()
dept_retention = leave_rate_by_dept.sort_values(ascending=False)

dept_fig, dept_ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=dept_retention.index,
    y=dept_retention.values,
    palette='viridis',
    ax=dept_ax,
)
dept_ax.set_title("Impact of Department on Employee Retention")
dept_ax.set_xlabel("Department")
dept_ax.set_ylabel("Proportion who left")
# Department names are long; tilt them so they do not overlap.
plt.xticks(rotation=45)
plt.show()

# Build Logistic Regression Model
feature_columns = ['satisfaction_level', 'average_montly_hours', 'promotion_last_5years', 'salary']
# .copy() gives an independent frame, so the salary encoding below does not
# assign into a selection derived from df (avoids SettingWithCopyWarning).
X = df[feature_columns].copy()
y = df['left']

# Encode salary with an explicit ordinal mapping. LabelEncoder is intended for
# target labels and numbers categories alphabetically (high=0, low=1,
# medium=2), which destroys the natural low < medium < high ordering that a
# linear model would otherwise be able to exploit.
salary_order = {'low': 0, 'medium': 1, 'high': 2}
encoded_salary = X['salary'].map(salary_order)
if encoded_salary.isna().any():
    # Fail loudly on levels outside the mapping rather than feeding NaN to
    # the estimator.
    unknown_levels = sorted(
        X.loc[encoded_salary.isna(), 'salary'].astype(str).unique()
    )
    raise ValueError(f"Unexpected salary level(s): {unknown_levels}")
X['salary'] = encoded_salary.astype(int)


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# max_iter is raised above the library default to give the solver more
# iterations on these unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


# Measure Model Accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Model Accuracy: {accuracy * 100:.2f}%")
