# Employee Attrition Prediction Analysis

## Objective
Predict employee attrition using machine learning models and identify key factors contributing to employee turnover.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading and Exploration

In [None]:
# Load data
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nAttrition distribution:")
print(df['Attrition'].value_counts())
print(f"\nAttrition percentage:")
print(df['Attrition'].value_counts(normalize=True) * 100)

## 2. Data Cleaning and Preprocessing

In [None]:
# Check for missing values and duplicates
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicates: {df.duplicated().sum()}")

# Remove duplicates
df = df.drop_duplicates()

# Encode target variable
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# Drop non-predictive columns
df = df.drop(['EmployeeCount', 'EmployeeNumber', 'StandardHours'], axis=1, errors='ignore')

# One-hot encode categorical variables
categorical_cols = ['Department', 'Gender', 'OverTime', 'BusinessTravel', 'EducationField', 'JobRole', 'MaritalStatus']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"\nFinal dataset shape: {df.shape}")

## 3. Exploratory Data Analysis

In [None]:
# Correlation with satisfaction metrics
satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance']
corr_data = df[satisfaction_cols + ['Attrition']].corr()['Attrition'].drop('Attrition')

plt.figure(figsize=(10, 6))
corr_data.plot(kind='bar')
plt.title('Correlation between Satisfaction Metrics and Attrition')
plt.ylabel('Correlation with Attrition')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("Correlation with Attrition:")
print(corr_data.sort_values())

In [None]:
# Check class imbalance
plt.figure(figsize=(8, 6))
df['Attrition'].value_counts().plot(kind='bar')
plt.title('Class Distribution - Attrition')
plt.xlabel('Attrition (0=No, 1=Yes)')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

imbalance_ratio = df['Attrition'].value_counts()[0] / df['Attrition'].value_counts()[1]
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

## 4. Model Training and Evaluation

In [None]:
# Prepare features and target
X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Scale numerical features
numerical_cols = X.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

In [None]:
# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"Original training set: {y_train.value_counts()}")
print(f"Balanced training set: {pd.Series(y_train_balanced).value_counts()}")

In [None]:
# Train models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Random Forest (Balanced)': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
}

results = {}

# Train on original data
for name, model in models.items():
    if 'Balanced' not in name:
        model.fit(X_train, y_train)
    else:
        model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {'model': model, 'accuracy': accuracy, 'predictions': y_pred}
    
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

# Train with SMOTE
rf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
rf_smote.fit(X_train_balanced, y_train_balanced)
y_pred_smote = rf_smote.predict(X_test)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
results['Random Forest (SMOTE)'] = {'model': rf_smote, 'accuracy': accuracy_smote, 'predictions': y_pred_smote}

print(f"\nRandom Forest (SMOTE) Accuracy: {accuracy_smote:.4f}")
print(classification_report(y_test, y_pred_smote))

## 5. Feature Importance Analysis

In [None]:
# Feature importance from best Random Forest model
best_rf = results['Random Forest (SMOTE)']['model']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 15 features
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(15)
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 15 Feature Importance - Random Forest')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

## 6. Save Cleaned Dataset

In [None]:
# Save cleaned dataset
df.to_csv('cleaned_employee_attrition.csv', index=False)
print("Cleaned dataset saved as 'cleaned_employee_attrition.csv'")

# Save feature importance
feature_importance.to_csv('feature_importance.csv', index=False)
print("Feature importance saved as 'feature_importance.csv'")

## Summary

### Key Findings:
1. **Class Imbalance**: Dataset shows significant imbalance with ~84% No attrition vs ~16% Yes attrition
2. **SMOTE Effectiveness**: SMOTE oversampling improved model performance for minority class
3. **Top Predictors**: Most important features for predicting attrition
4. **Model Performance**: Random Forest with SMOTE achieved best balanced performance

### Recommendations:
1. Focus on top contributing factors identified in feature importance
2. Implement retention strategies targeting high-risk employee profiles
3. Monitor satisfaction metrics as early warning indicators