# Import Required Libraries
This section imports all necessary libraries for data analysis and modeling.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve
import xgboost as xgb

# Load Dataset
Load the employee attrition dataset using pandas.

In [None]:
# Load Dataset
# Relative path to the attrition CSV; change it if the file lives elsewhere.
DATA_PATH = '../data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
# Read the whole file into a DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Preview the first five rows (the default for head()).
df.head(5)

# Explore Data Structure
Display the first few rows, data types, and basic info about the dataset.

In [None]:
# Explore Data Structure
# info() prints its summary (columns, non-null counts, dtypes, memory) itself.
df.info()
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the dtypes have to be printed explicitly to actually show up.
print(df.dtypes)
# The last expression of the cell is rendered by the notebook.
df.head()

# Visualize Missing Values
Use seaborn and matplotlib to visualize missing values in the dataset.

In [None]:
# Visualize Missing Values
# Boolean mask with True wherever a value is missing; each True cell is drawn
# in the contrasting colour of the heatmap.
missing_mask = df.isnull()
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(missing_mask, cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

# Statistical Summary
Generate descriptive statistics for numerical and categorical features.

In [None]:
# Statistical Summary
# include='all' makes describe() report on every column, not just the numeric
# ones; statistics that do not apply to a column's type come back as NaN.
summary_stats = df.describe(include='all')
summary_stats

# Feature Correlation Analysis
Compute and visualize feature correlations using seaborn heatmap.

In [None]:
# Feature Correlation Analysis
# Pairwise correlations; numeric_only=True restricts the computation to the
# numeric columns of the frame.
corr = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(16, 10))
# annot/fmt write each coefficient into its cell with two decimals.
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

# Class Distribution Visualization
Plot the distribution of the target variable (attrition) to check for imbalance.

In [None]:
# Class Distribution Visualization
# One bar per distinct value of the target column, showing how many rows
# carry that value.
ax = sns.countplot(data=df, x='Attrition')
ax.set_title('Attrition Class Distribution')
plt.show()

# Handle Imbalanced Data
Apply imbalanced-learn techniques such as SMOTE to balance the classes.

In [None]:
# Handle Imbalanced Data
# Binary target: 1 where Attrition == 'Yes', 0 for every other value.
y = (df['Attrition'] == 'Yes').astype(int)

# SMOTE synthesises new rows by interpolating between existing ones, so it
# needs a purely numeric feature matrix. One-hot encode the non-numeric
# columns first; numeric columns pass through get_dummies unchanged.
X = pd.get_dummies(df.drop(columns='Attrition'), dtype=int)

# Fixed seed so the synthetic samples are reproducible between runs.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
# NOTE: this resamples the full dataset. Metrics computed on data that
# contains SMOTE-generated rows are optimistic; for evaluation, oversampling
# should be limited to the training portion of a split.
print('Resampled dataset shape:', X_res.shape, y_res.shape)

# Split Data for Modeling
Split the dataset into training and test sets using scikit-learn.

In [None]:
# Split Data for Modeling
# Split the ORIGINAL (non-resampled) data first and oversample only the
# training portion afterwards. Splitting data that was already oversampled
# puts synthetic rows, interpolated from rows that may end up in training,
# into the test set and inflates every evaluation metric.
# get_dummies leaves an already-numeric frame unchanged, so this is safe
# whether or not X was encoded earlier.
X_numeric = pd.get_dummies(X, dtype=int)
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42, stratify=y
)
# Balance the classes in the training set only; the test set keeps the real
# class distribution and contains no synthetic samples.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
print('Train shape:', X_train.shape, y_train.shape)
print('Test shape:', X_test.shape, y_test.shape)

# Train Baseline Model
Train a baseline classifier (e.g., XGBoost) on the training data.

In [None]:
# Train Baseline Model
# `use_label_encoder` is intentionally not passed: it is a deprecated
# XGBClassifier argument that current XGBoost releases no longer use, and
# supplying it only produces a warning. The target is already encoded as 0/1.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also rendered by the notebook.
model.fit(X_train, y_train)

# Evaluate Model Performance
Evaluate the model using accuracy, precision, recall, F1-score, and ROC-AUC.

In [2]:
# Evaluate Model Performance
# Hard class predictions feed the threshold-based metrics; column 1 of
# predict_proba (the score for class 1) feeds ROC-AUC and the ROC curve.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Label/metric pairs evaluated on the predicted classes, printed in order.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

# AUC is deterministic for fixed inputs, so compute it once and reuse it for
# both the printed value and the legend entry.
auc_value = roc_auc_score(y_test, y_pred_proba)
print('ROC-AUC:', auc_value)
print(classification_report(y_test, y_pred))

# ROC curve; the thresholds returned by roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
curve_label = 'XGBoost (AUC = {:.2f})'.format(auc_value)
plt.plot(fpr, tpr, label=curve_label)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

NameError: name 'model' is not defined

# Explain Model Predictions with SHAP
Use SHAP to interpret and visualize feature importance and individual predictions.

In [1]:
# Explain Model Predictions with SHAP
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test)
shap.plots.waterfall(shap_values[0])

NameError: name 'shap' is not defined