## 1. Setup & Libraries
Import necessary libraries for data analysis, visualization, preprocessing, and modeling.


In [None]:

# 1. Setup & Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Plot styling: draw every seaborn/matplotlib figure on the white-grid theme
sns.set_style(style='whitegrid')
# DataFrame display: lift the column limit so wide frames print every column
pd.options.display.max_columns = None


## 2. Data Loading
Load the cleaned HR Employee Attrition dataset and inspect basic info, data types, and target distribution.


In [None]:

# 2. Data Loading

# Path is relative to the notebook's working directory
data_path = "../data/HR-Employee-Attrition_cleaned.csv"
df = pd.read_csv(data_path)

# Quick overview
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Class balance of the target column
print(df['Attrition'].value_counts())


## 3. Data Cleaning & Preprocessing
- Check for missing values and duplicates.  
- Encode the target variable `Attrition` (Yes → 1, No → 0).  
- Identify categorical and numerical features for preprocessing.

In [None]:
# 3. Data Cleaning & Preprocessing

# Check for missing values (count of NaNs per column)
print(df.isna().sum())

# Remove duplicates if any (exact duplicate rows, dropped in place)
df.drop_duplicates(inplace=True)

# Encode target variable (Yes -> 1, No -> 0).
# The dtype guard keeps this cell safe to re-run: pushing an already-encoded
# 0/1 column through the Yes/No mapping would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    encoded_target = df['Attrition'].map({'Yes': 1, 'No': 0})
    # Series.map yields NaN for any label missing from the dict; fail loudly
    # here instead of handing NaN targets to the train/test split later on.
    if encoded_target.isna().any():
        unexpected = df.loc[encoded_target.isna(), 'Attrition'].unique().tolist()
        raise ValueError(f"Unexpected Attrition labels: {unexpected}")
    df['Attrition'] = encoded_target

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('Attrition')  # remove target from features


## 4. Exploratory Data Analysis (EDA)
- Visualize the distribution of the target variable.  
- Compare attrition across categorical features (e.g., OverTime).  
- Compute and plot correlation matrix to identify relationships between features.

In [None]:
# 4. Exploratory Data Analysis (EDA)

# Attrition distribution
sns.countplot(x='Attrition', data=df)
plt.title('Attrition Distribution')
plt.show()

# Example: Attrition by OverTime
sns.countplot(x='OverTime', hue='Attrition', data=df)
plt.title('Attrition by OverTime')
plt.show()

# Correlation heatmap
# Correlations are only defined for numeric columns. df still holds string
# columns (e.g. OverTime), and DataFrame.corr() raises on those in
# pandas >= 2.0, so restrict the frame to numeric dtypes first.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()



## 5. Feature Engineering
- Example: a binary `HighRiskProfile` flag marking employees who work overtime (`OverTime == 'Yes'`), report `JobSatisfaction` ≤ 2, and have fewer than 3 `YearsAtCompany`.  
- Optional: Additional features can be created based on domain knowledge.


In [None]:

# 5. Feature Engineering

# Example: Creating high risk feature
# 1 when the employee works overtime, has JobSatisfaction <= 2 and fewer than
# 3 YearsAtCompany; 0 otherwise.
df['HighRiskProfile'] = (
    (df['OverTime'] == 'Yes')
    & (df['JobSatisfaction'] <= 2)
    & (df['YearsAtCompany'] < 3)
).astype(int)

# Register the new column as a model feature. numerical_cols was built before
# this column existed, and the ColumnTransformer used for modeling drops any
# column it is not explicitly given, so without this the feature would never
# reach the models. The membership check keeps the cell safe to re-run.
if 'HighRiskProfile' not in numerical_cols:
    numerical_cols.append('HighRiskProfile')



## 6. Modeling
- Split data into train and test sets (stratified by target).  
- Preprocess features: StandardScaler for numerical, OneHotEncoder for categorical.  
- Apply SMOTE to balance the training set.  
- Train models:
  - Logistic Regression
  - Random Forest  
- Evaluate using Accuracy, Precision, Recall, F1-score, and Confusion Matrix.


In [None]:

# 6. Modeling

# Separate the target from the predictors
y = df['Attrition']
X = df.drop(columns='Attrition')

# Hold out 20% of the rows for testing; stratify so both splits keep the
# same Attrition class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing: scale numeric columns, one-hot encode categorical columns
scale_step = ('num', StandardScaler(), numerical_cols)
encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[scale_step, encode_step])

# Fit the preprocessor on the training split only, then reuse it on the test split
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

# Balance the classes with SMOTE (training data only)
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_enc, y_train)

# Candidate classifiers
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Metrics reported for every model, in print order
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# Train on the balanced training data, evaluate on the untouched test split
for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test_enc)

    print(f"==== {name} ====")
    for metric_label, metric_fn in metric_fns.items():
        print(f"{metric_label}:", metric_fn(y_test, y_pred))

    # One confusion-matrix figure per model
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f'Confusion Matrix - {name}')
    plt.show()



**Notes for Reviewers / Collaborators:**
- The workflow is modular and reproducible.
- SMOTE is applied only on the training set to prevent data leakage.
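- The preprocessing `ColumnTransformer` is fitted on the training set only and one-hot encodes categorical variables with `handle_unknown='ignore'`, so categories unseen during training do not raise errors in future predictions.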
- Visualization sections can be expanded depending on stakeholder needs.
