# Employee Attrition Prediction Analysis
This notebook analyzes the factors influencing employee turnover and builds a Random Forest model to predict attrition using the IBM HR Analytics dataset.

## 1. Setup and System Configuration

In [65]:
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from pptx import Presentation
from pptx.util import Inches, Pt

# Notebook-wide settings: input file, output root, and the random seed
CONFIG = dict(
    DATA_PATH="../data/IBM-HR-Employee-Attrition.csv",
    RESULT_DIR="../result/",
    RANDOM_STATE=42,
)

# Output locations, kept as plain strings because later cells join onto them
FIG_DIR = os.path.join(CONFIG["RESULT_DIR"], "figures")
SLD_DIR = os.path.join(CONFIG["RESULT_DIR"], "slides")

# Make sure both output folders exist before anything is written to them
for out_dir in (FIG_DIR, SLD_DIR):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

## 2. Data Load and Feature Engineering
We load the dataset and define a preprocessing pipeline: numeric columns are standardized and categorical columns are one-hot encoded. We set `verbose_feature_names_out=False` so that the output feature names are not prefixed with the transformer names (`num__`, `cat__`).

In [66]:
# Helper to clean labels
def clean_feature_label(name):
    """Turn a raw feature name into a human-readable chart label.

    Underscores become spaces, camelCase boundaries are split, a standalone
    ``Yes``/``No`` token is wrapped in parentheses, and the result is
    title-cased.

    Args:
        name: Feature name as a string, e.g. ``"OverTime_Yes"``.

    Returns:
        The cleaned label, e.g. ``"Over Time (Yes)"``.
    """
    name = name.replace('_', ' ')
    name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)
    # Wrap only whole Yes/No tokens. A plain substring replace of ' No' also
    # hits words that merely start with "No" (e.g. "Non-Travel" would turn
    # into "(No)N-Travel").
    name = re.sub(r' (Yes|No)(?= |$)', r' (\1)', name)
    return name.strip().title()

# Load data
df = pd.read_csv(CONFIG["DATA_PATH"])
X = df.drop(columns="Attrition")
# Binary target: 1 where Attrition == "Yes", 0 for every other value
y = df["Attrition"].eq("Yes").astype(int)

# Preprocessing
# Partition the columns into numeric / non-numeric so that every column lands
# in exactly one transformer. Selecting by a fixed dtype list (int64, float64,
# object) would leave any other dtype unmatched, and ColumnTransformer drops
# unmatched columns by default (remainder='drop').
# NOTE(review): identifier-like or constant columns, if the dataset has any,
# are fed to the model like any other feature — confirm that is intended.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        # Standardize numeric features
        ('num', StandardScaler(), num_cols),
        # One-hot encode the rest; categories unseen during fit encode as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    # Keep output feature names free of the "num__" / "cat__" prefixes
    verbose_feature_names_out=False
)

## 3. Model Validation and Performance Calculation
We split the data into training (80%) and testing (20%) sets, stratified on the attrition label, to validate the model's predictive power. After training the Random Forest, we calculate accuracy, precision, recall, and the $F_1$-score on the test set.

In [67]:
# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=CONFIG["RANDOM_STATE"],
)

# Fit the preprocessing on the training rows only, then reuse it on the test rows
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Random Forest with default hyperparameters and a fixed seed
model = RandomForestClassifier(random_state=CONFIG["RANDOM_STATE"])
model.fit(X_train_proc, y_train)

# Score the predictions on the held-out rows
y_pred = model.predict(X_test_proc)
scorers = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
)
metrics = {label: scorer(y_test, y_pred) for label, scorer in scorers}

# Ten largest feature importances, relabelled for display
feature_names = preprocessor.get_feature_names_out()
importance = pd.Series(model.feature_importances_, index=feature_names)
top_10 = importance.sort_values(ascending=False).iloc[:10]
top_10.index = list(map(clean_feature_label, top_10.index))

## 4. Performance Visualization
We save the charts as high-resolution (300 dpi) PNG files. The top-10 feature chart is embedded in the PowerPoint slide in Section 5.

In [68]:
# Horizontal bar chart of the ten most important features, written to FIG_DIR
feat_img_path = os.path.join(FIG_DIR, "top_10_features.png")

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_10.values,
    y=top_10.index,
    hue=top_10.index,
    palette="Blues_r",
    legend=False,
    ax=ax,
)
ax.set_title("Top 10 Attrition Drivers", fontsize=16, fontweight='bold')
ax.set_xlabel("Predictive Importance Score")
fig.tight_layout()
fig.savefig(feat_img_path, dpi=300)
# Close the figure so it is neither shown inline nor kept in memory
plt.close(fig)

In [69]:
# Save Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (Stayed, Left) order so it always
# lines up with the tick labels, even if one class is missing from the
# test labels or the predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Stayed', 'Left'], yticklabels=['Stayed', 'Left'], ax=ax)
ax.set_title("Confusion Matrix: Actual vs Predicted", fontsize=14)
# confusion_matrix puts true labels on rows and predictions on columns
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
cm_img_path = os.path.join(FIG_DIR, "confusion_matrix.png")
# Save to the path defined on the line above (the previous code passed an
# undefined name here, which raised NameError)
fig.savefig(cm_img_path, dpi=300)
plt.close(fig)

## 5. Create End-to-End Executive Slide
This final section builds a professional PowerPoint report with a split layout: the visualization on the left and metrics/remarks on the right.

In [None]:
# Example remarks used when the caller supplies none. They are illustrative
# placeholders, not derived from the model output, and are meant to be
# replaced (in code or by hand in the .pptx) once the results are reviewed.
_EXAMPLE_REMARKS = (
    "Overtime is the strongest predictor of attrition.",
    "Monthly Income levels significantly impact retention.",
    "Tenure at the company reveals high risk for newer staff.",
)


def create_executive_summary(prs, img_path, metrics_dict, remarks=None):
    """Append a split-layout summary slide to a presentation.

    The slide has a title across the top, the image at ``img_path`` on the
    left, and a text box on the right listing the metrics followed by the
    analysis remarks.

    Args:
        prs: python-pptx ``Presentation`` to add the slide to.
        img_path: Path of the image file placed on the left half.
        metrics_dict: Mapping of metric name to a numeric value; each value
            is rendered as a percentage with one decimal place.
        remarks: Optional iterable of remark strings. When ``None``, the
            example remarks in ``_EXAMPLE_REMARKS`` are used.

    Returns:
        The newly added slide.
    """
    if remarks is None:
        remarks = _EXAMPLE_REMARKS

    slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank Layout

    # Title
    title = slide.shapes.add_textbox(Inches(0.5), Inches(0.3), Inches(9), Inches(1))
    title.text_frame.text = "Executive Attrition Analysis & Model Performance"
    title_font = title.text_frame.paragraphs[0].font
    title_font.size = Pt(28)
    title_font.bold = True

    # Left: Top 10 Feature Figure
    slide.shapes.add_picture(img_path, Inches(0.5), Inches(1.5), width=Inches(5.2))

    # Right: Metrics and Remarks
    tx_box = slide.shapes.add_textbox(Inches(6.0), Inches(1.5), Inches(3.5), Inches(5))
    tf = tx_box.text_frame
    tf.word_wrap = True

    # Metrics
    # A new text frame already contains one empty paragraph; reuse it for the
    # heading so the box does not start with a blank line.
    heading = tf.paragraphs[0]
    heading.text = "Model Validation Results:"
    heading.font.bold = True
    heading.font.size = Pt(18)
    for metric_name, value in metrics_dict.items():
        bullet = tf.add_paragraph()
        bullet.text = f"{metric_name}: {value:.1%}"
        bullet.font.size = Pt(14)
        bullet.level = 0

    # Analysis Remarks
    heading = tf.add_paragraph()
    # The leading "\n" puts vertical space between the metrics and this heading
    heading.text = "\nKey Analysis Remarks:"
    heading.font.bold = True
    heading.font.size = Pt(18)
    for remark in remarks:
        bullet = tf.add_paragraph()
        bullet.text = remark
        bullet.font.size = Pt(13)
        bullet.level = 0

    return slide

# Build a one-slide deck from the saved feature chart and the metrics, then
# write it to the slides folder
prs = Presentation()
create_executive_summary(prs, feat_img_path, metrics)
report_path = os.path.join(SLD_DIR, "Executive_Attrition_Report.pptx")
prs.save(report_path)