# HR Employee Attrition

## Load and Explore Data

In [23]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Read the HR attrition CSV (path is relative to the working directory) into the raw frame
df = pd.read_csv(filepath_or_buffer="HR-Employee-Attrition.csv")

In [24]:
# Data-quality snapshot of the raw frame, gathered into one dict for display.

# Count of rows that are exact duplicates of an earlier row
duplicate_rows = df.duplicated().sum()

# Per-column count of null entries
missing_values = df.isna().sum()

# describe() summary statistics
summary_statistics = df.describe()

# Number of distinct values in each column
unique_values = df.nunique()

# dtype of each column
data_types = df.dtypes

# Assemble everything as plain dicts; only columns that actually contain
# nulls are listed under "missing_values".
data_overview = dict(
    duplicate_rows=duplicate_rows,
    missing_values=missing_values.loc[missing_values.gt(0)].to_dict(),
    summary_statistics=summary_statistics.to_dict(),
    unique_values=unique_values.to_dict(),
    data_types=data_types.to_dict(),
)

data_overview


{'duplicate_rows': 0,
 'missing_values': {},
 'summary_statistics': {'Age': {'count': 1470.0,
   'mean': 36.923809523809524,
   'std': 9.135373489136734,
   'min': 18.0,
   '25%': 30.0,
   '50%': 36.0,
   '75%': 43.0,
   'max': 60.0},
  'DailyRate': {'count': 1470.0,
   'mean': 802.4857142857143,
   'std': 403.50909994352827,
   'min': 102.0,
   '25%': 465.0,
   '50%': 802.0,
   '75%': 1157.0,
   'max': 1499.0},
  'DistanceFromHome': {'count': 1470.0,
   'mean': 9.19251700680272,
   'std': 8.106864435666084,
   'min': 1.0,
   '25%': 2.0,
   '50%': 7.0,
   '75%': 14.0,
   'max': 29.0},
  'Education': {'count': 1470.0,
   'mean': 2.912925170068027,
   'std': 1.0241649445978729,
   'min': 1.0,
   '25%': 2.0,
   '50%': 3.0,
   '75%': 4.0,
   'max': 5.0},
  'EmployeeCount': {'count': 1470.0,
   'mean': 1.0,
   'std': 0.0,
   'min': 1.0,
   '25%': 1.0,
   '50%': 1.0,
   '75%': 1.0,
   'max': 1.0},
  'EmployeeNumber': {'count': 1470.0,
   'mean': 1024.865306122449,
   'std': 602.024334847475,

###  Initial Data Analysis Summary
    
    Total Records: 1,470 rows
    Duplicates: No duplicate records
    Missing Values: No missing values
    

        Potential Issues:
        EmployeeCount and StandardHours are constant, so they carry no information.
        Over18 is also constant.
        EmployeeNumber is unique for each employee (an identifier, not a feature).

In [25]:
# Columns excluded from the analysis: the notebook's initial summary flags
# them as constant or as a per-employee identifier.
columns_to_drop = [
    "EmployeeCount",
    "StandardHours",
    "Over18",
    "EmployeeNumber",
]
# drop() returns a new frame; the raw df is left untouched
df_cleaned = df.drop(labels=columns_to_drop, axis="columns")

## Exploratory Data Analysis (EDA)

In [26]:
# Attrition distribution: donut chart (hole=0.3) of the share of each Attrition value.
# Reads from df_cleaned, the frame every other EDA cell plots from. Attrition is
# not one of the dropped columns, so the chart itself is unchanged.
fig = px.pie(df_cleaned, names="Attrition", title="Employee Attrition Distribution", hole=0.3)

# Show Plot
fig.show()



### Numeric Feature Analysis
    1️⃣ Age Distribution
    2️⃣ Monthly Income Distribution 
    3️⃣ Years at Company 

In [27]:
# Age Distribution: histogram of the Age column (nbins=30).
# text_auto=True writes each bar's value on the bar.
fig_age = px.histogram(
    data_frame=df_cleaned,
    x="Age",
    nbins=30,
    text_auto=True,
    color_discrete_sequence=["#636EFA"],
    labels={"Age": "Employee Age"},
    title="Age Distribution of Employees",
)
fig_age.show()

In [28]:
# Monthly Income Distribution: histogram of the MonthlyIncome column (nbins=30).
# text_auto=True writes each bar's value on the bar.
fig_income = px.histogram(
    data_frame=df_cleaned,
    x="MonthlyIncome",
    nbins=30,
    text_auto=True,
    color_discrete_sequence=["#EF553B"],
    labels={"MonthlyIncome": "Monthly Income ($)"},
    title="Monthly Income Distribution",
)
fig_income.show()

In [29]:
# Years at Company Distribution: histogram of the YearsAtCompany column (nbins=30).
# text_auto=True writes each bar's value on the bar.
fig_years = px.histogram(
    data_frame=df_cleaned,
    x="YearsAtCompany",
    nbins=30,
    text_auto=True,
    color_discrete_sequence=["#00CC96"],
    labels={"YearsAtCompany": "Years at Company"},
    title="Years at Company Distribution",
)
fig_years.show()

### Categorical Feature Analysis
    1️⃣ Attrition by Department 
    2️⃣ Attrition by Job Role 
    3️⃣ Attrition by Marital Status 
    4️⃣ Attrition by Overtime Work 

In [30]:
# Attrition by Department
# Count rows per (Department, Attrition) pair, then pivot Attrition into
# columns so px.bar draws one bar per attrition status within each department.
# fill_value=0 makes a pair that never occurs an explicit zero instead of NaN.
fig_dept = px.bar(
    df_cleaned.groupby("Department")["Attrition"].value_counts().unstack(fill_value=0),
    title="Attrition by Department",
    labels={"value": "Frequency", "Department": "Department"},
    barmode="group",
    color_discrete_sequence=["#636EFA", "#EF553B"],
    text_auto=True,
)
fig_dept.show()

In [31]:
# Attrition by Job Role
# Count rows per (JobRole, Attrition) pair, then pivot Attrition into
# columns so px.bar draws one bar per attrition status within each role.
# fill_value=0 makes a pair that never occurs an explicit zero instead of NaN.
fig_role = px.bar(
    df_cleaned.groupby("JobRole")["Attrition"].value_counts().unstack(fill_value=0),
    title="Attrition by Job Role",
    labels={"value": "Frequency", "JobRole": "Job Role"},
    barmode="group",
    color_discrete_sequence=["#636EFA", "#EF553B"],
    text_auto=True,
)
fig_role.show()

In [32]:
# Attrition by Marital Status
# Count rows per (MaritalStatus, Attrition) pair, then pivot Attrition into
# columns so px.bar draws one bar per attrition status within each group.
# fill_value=0 makes a pair that never occurs an explicit zero instead of NaN.
fig_marital = px.bar(
    df_cleaned.groupby("MaritalStatus")["Attrition"].value_counts().unstack(fill_value=0),
    title="Attrition by Marital Status",
    labels={"value": "frequency", "MaritalStatus": "Marital Status"},
    barmode="group",
    color_discrete_sequence=["#636EFA", "#EF553B"],
    text_auto=True,
)
fig_marital.show()

In [33]:
# Attrition by Overtime Work
# Count rows per (OverTime, Attrition) pair, then pivot Attrition into
# columns so px.bar draws one bar per attrition status within each group.
# fill_value=0 makes a pair that never occurs an explicit zero instead of NaN.
fig_overtime = px.bar(
    df_cleaned.groupby("OverTime")["Attrition"].value_counts().unstack(fill_value=0),
    title="Attrition by Overtime Work",
    labels={"value": "frequency", "OverTime": "Overtime"},
    barmode="group",
    color_discrete_sequence=["#636EFA", "#EF553B"],
    text_auto=True,
)
fig_overtime.show()


### Correlation Analysis
    1️⃣ Correlation Heatmap: shows relationships between numeric features
    2️⃣ Attrition vs. Monthly Income, Job Satisfaction, and Work-Life Balance 

In [34]:

import plotly.figure_factory as ff
import numpy as np

# Select only important numeric columns for attrition
important_features = [
    "Age", "MonthlyIncome", "TotalWorkingYears", "YearsAtCompany",
    "JobSatisfaction", "WorkLifeBalance"
]

df_selected = df_cleaned[important_features]

# Compute correlation matrix (DataFrame.corr() default method)
correlation_matrix = df_selected.corr()

# Cell annotations as fixed two-decimal strings. Formatting with ":.2f"
# (rather than rounding and then str()) keeps every label the same width,
# e.g. "1.00" and "0.50" instead of "1.0" and "0.5".
z_text = np.array(
    [[f"{value:.2f}" for value in row] for row in correlation_matrix.values]
)

# Create heatmap; annotation_text supplies the label drawn in each cell
fig_corr = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    annotation_text=z_text,
    colorscale="Blues",
    showscale=True
)

# Set title
fig_corr.update_layout(title_text="Correlation Heatmap (Key Features)")

# Show figure
fig_corr.show()



In [35]:
# Attrition vs. Monthly Income: one MonthlyIncome box per Attrition value,
# with the boxes colored by Attrition as well.
fig_income_attrition = px.box(
    data_frame=df_cleaned,
    x="Attrition",
    y="MonthlyIncome",
    color="Attrition",
    color_discrete_sequence=["#636EFA", "#EF553B"],
    labels={"MonthlyIncome": "Monthly Income ($)", "Attrition": "Attrition Status"},
    title="Attrition vs. Monthly Income",
)
fig_income_attrition.show()

In [36]:
# Attrition vs. Job Satisfaction: counts per JobSatisfaction level, drawn as
# side-by-side bars (barmode="group") split by Attrition.
fig_job_satisfaction = px.histogram(
    data_frame=df_cleaned,
    x="JobSatisfaction",
    color="Attrition",
    barmode="group",
    text_auto=True,
    color_discrete_sequence=["#636EFA", "#EF553B"],
    labels={"JobSatisfaction": "Job Satisfaction Level", "Attrition": "Attrition Status"},
    title="Attrition vs. Job Satisfaction",
)
fig_job_satisfaction.show()

In [37]:
# Attrition vs. Work-Life Balance: counts per WorkLifeBalance level, drawn as
# side-by-side bars (barmode="group") split by Attrition.
# NOTE(review): the two colors are listed in the opposite order from the other
# Attrition-colored figures in this notebook; confirm this is intentional.
fig_work_life = px.histogram(
    data_frame=df_cleaned,
    x="WorkLifeBalance",
    color="Attrition",
    barmode="group",
    text_auto=True,
    color_discrete_sequence=["#EF553B", "#636EFA"],
    labels={"WorkLifeBalance": "Work-Life Balance Level", "Attrition": "Attrition Status"},
    title="Attrition vs. Work-Life Balance",
)
fig_work_life.show()


### Predictive Model for Employee Attrition

In [38]:
# Integer-encode every object-dtype column on a copy of the cleaned frame
# (presumably this includes the Attrition target; verify against df.dtypes).
# Each fitted encoder is kept in label_encoders, keyed by column name.
df_encoded = df_cleaned.copy()
label_encoders = {}

object_columns = df_encoded.select_dtypes(include=["object"]).columns
for col in object_columns:
    label_encoders[col] = LabelEncoder()
    df_encoded[col] = label_encoders[col].fit_transform(df_encoded[col])

In [39]:
# Separate the target column from the predictors
y = df_encoded.loc[:, "Attrition"]  # Target variable
X = df_encoded.drop(labels="Attrition", axis="columns")  # Features: every other column

In [40]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [41]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# Fit a logistic regression (library-default settings) on the scaled training data.
# The fit() call is left as the cell's final expression so the estimator is displayed.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

In [43]:
# Predict class labels for the scaled held-out test features
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Score the test-set predictions: overall accuracy (printed to two decimals)
# followed by the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_true=y_test, y_pred=y_pred))


(0.8945578231292517,
 '              precision    recall  f1-score   support\n\n           0       0.91      0.98      0.94       255\n           1       0.70      0.36      0.47        39\n\n    accuracy                           0.89       294\n   macro avg       0.80      0.67      0.71       294\nweighted avg       0.88      0.89      0.88       294\n')

### ✅ Model Results
    Accuracy: 89.5%
    Precision & Recall:
    Employees Staying (0): 91% precision, 98% recall
    Employees Leaving (1): 70% precision, 36% recall
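    🔹 Interpretation: The model predicts employees who stay well but struggles to identify employees who leave: with 36% recall it misses roughly two-thirds of actual leavers. The test set is imbalanced (39 leavers vs. 255 stayers), so the high overall accuracy is driven mainly by the majority class.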