# **Imports**

In [None]:
%%capture
!pip install -q lime shap

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import StratifiedKFold
import kagglehub
from sklearn.model_selection import train_test_split, learning_curve, ValidationCurveDisplay
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay, auc, PrecisionRecallDisplay
from sklearn.inspection import PartialDependenceDisplay,permutation_importance
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import textwrap
import lime
import lime.lime_tabular
import shap
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay, auc, roc_curve
import warnings
warnings.filterwarnings('ignore')

# **Utils**

In [None]:
def our_confusion_matrix(model, name='Model'):
    """Draw the train and test confusion matrices of ``model`` side by side.

    Reads the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``.

    Parameters
    ----------
    model : fitted estimator accepted by ``ConfusionMatrixDisplay.from_estimator``.
    name : str, label used as the prefix of both panel titles.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,8))
    ConfusionMatrixDisplay.from_estimator(model, X_train, y_train, cmap="Oranges", ax=ax1)
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap="Oranges", ax=ax2)
    # Title each panel explicitly: plt.title() only touches the current axes,
    # which left one panel untitled and gave no train/test distinction.
    ax1.set_title(f"{name} Confusion Matrix - Train")
    ax2.set_title(f"{name} Confusion Matrix - Test")

def our_roc(model, name="Model"):
    """Plot the ROC curve of ``model`` on the train split (left) and test split (right).

    Uses the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``.
    The area under each curve is shaded and every panel gets a light grid.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    panels = (
        (ax1, X_train, y_train, f"{name} Training ROC Curve"),
        (ax2, X_test, y_test, f"{name} Testing ROC Curve"),
    )
    for axis, features, labels, heading in panels:
        curve = RocCurveDisplay.from_estimator(
            model, features, labels,
            name=name, ax=axis,
            color="orange"
        )
        # Shade the region under the curve on the same axes.
        axis.fill_between(curve.fpr, curve.tpr, alpha=0.4, color="orange")
        axis.set_title(heading, fontsize=16)
        axis.grid(alpha=0.5)

def our_learning_curve(model, name='Model'):
    """Plot mean training and cross-validation accuracy against training-set size.

    Runs ``learning_curve`` with 5-fold CV on the notebook-level ``X_train`` /
    ``y_train`` at ten evenly spaced fractions from 10% to 100% of the data.
    """
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train, cv=5, scoring='accuracy',
        train_sizes=fractions, random_state=0
    )

    # Average the per-fold scores for every training-set size.
    mean_train = np.mean(train_scores, axis=1)
    mean_cv = np.mean(test_scores, axis=1)

    plt.figure(figsize=(13, 8))
    series = (
        (mean_train, "mediumseagreen", "Training score"),
        (mean_cv, "royalblue", "Cross-validation score"),
    )
    for scores, colour, legend_label in series:
        plt.plot(train_sizes, scores, 'o-', color=colour, label=legend_label)
    plt.title(f"{name} Learning Curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid(alpha=0.5)

def our_prc(model, name="Model"):
    """Plot the precision-recall curve of ``model`` for the train and test splits.

    Uses the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``;
    the area under each curve is shaded.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(21, 13))
    panels = (
        (ax1, X_train, y_train, f"{name} Precision-Recall Curve - Train"),
        (ax2, X_test, y_test, f"{name} Precision-Recall Curve - Test"),
    )
    for axis, features, labels, heading in panels:
        disp = PrecisionRecallDisplay.from_estimator(model, features, labels, ax=axis, color='orange')
        axis.set_title(heading)
        axis.fill_between(disp.recall, disp.precision, alpha=0.4, color='orange')

# **Load Processed Data**

In [None]:
# Download the processed dataset via kagglehub and read the combined train/test CSV.
path = kagglehub.dataset_download('mohamedhassan77/xai-project-processed-data')
csv_path = f"{path}/combined_adasyn_train_test.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,promotion_velocity,workload_score,compensation_ratio,Attrition,Set
0,31,2,1062,1,24,3,3,3,0,96,...,3,10.0,9,1,8,9.999900,1,1.019760,0,Train
1,50,2,328,1,1,3,3,3,1,86,...,2,3.0,2,0,2,300000.000000,2,1.281918,0,Train
2,46,2,717,1,13,4,1,3,1,34,...,3,10.0,7,0,9,1000000.000000,1,0.832635,0,Train
3,44,2,170,1,1,4,1,2,1,78,...,3,2.0,0,2,2,0.999995,1,0.753443,0,Train
4,26,1,1479,1,1,3,1,3,0,84,...,1,6.0,5,1,4,5.999940,3,1.007878,0,Train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042,56,2,1162,1,24,2,1,1,1,97,...,3,4.0,2,1,0,3.999960,1,0.898732,1,Test
2043,46,2,669,2,9,2,3,3,1,64,...,3,9.0,8,4,7,2.249994,1,1.558490,1,Test
2044,30,2,1240,0,9,3,0,3,1,48,...,1,11.0,9,4,7,2.749993,7,2.052053,0,Test
2045,31,2,741,1,2,4,1,2,1,69,...,4,5.0,2,0,3,500000.000000,0,1.207921,0,Test


In [None]:
# Recover the pre-made partitions from the 'Set' marker column, then drop the marker.
is_train = df['Set'] == 'Train'
is_test = df['Set'] == 'Test'
train_df = df.loc[is_train].drop(columns=['Set'])
test_df = df.loc[is_test].drop(columns=['Set'])

# 'Attrition' is the target; every remaining column is used as a feature.
X_train, y_train = train_df.drop(columns=['Attrition']), train_df['Attrition']
X_test, y_test = test_df.drop(columns=['Attrition']), test_df['Attrition']

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

X_train: (1589, 33) y_train: (1589,)
X_test: (458, 33) y_test: (458,)


# **Model-Specific EDA (Optional)**

In [None]:
# Report the class counts of the training target, then show them as a bar chart.
class_counts = y_train.value_counts()
print("Target variable distribution in the training set:")
print(class_counts)

# NOTE(review): the title mentions SMOTE, but no resampling step is applied in
# this notebook and the CSV name refers to ADASYN - confirm the intended wording.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y_train, ax=ax)
ax.set_title('Class Distribution in Training Set (Before SMOTE)')
ax.set_xlabel('Attrition (0: No, 1: Yes)')
ax.set_ylabel('Count')
plt.show()

# **Model Initialization**

In [None]:
# Hyperparameters gathered in one mapping so they can be read and tuned at a glance.
etc_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=123,
)
etc_classifier = ExtraTreesClassifier(**etc_params)

# **Model Training**

In [None]:
# Fit the Extra Trees model on the training split.
etc_classifier.fit(X=X_train, y=y_train)

# **Model Evaluation**

In [None]:
# Probability of the second class (column index 1) for every test row, plus the hard labels.
y_pred_proba = etc_classifier.predict_proba(X_test)[:, 1]
y_pred = etc_classifier.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Per-class precision / recall / F1 on the test split, with readable class labels.
class_labels = ['No Attrition', 'Attrition']
report = classification_report(y_test, y_pred, target_names=class_labels)
print("Classification Report:")
print(report)



> The model performs well for the "No Attrition" class but struggles with the "Attrition" class, especially on recall (0.39), which means that it misses 61% of actual attrition cases.



> The overall accuracy was 83%  



> The model is biased toward the "No Attrition" class.







In [None]:
# Train vs. test confusion matrices for the fitted model.
our_confusion_matrix(model=etc_classifier, name="Extra Trees Classifier")

In [None]:
# Train vs. test ROC curves for the fitted model.
our_roc(model=etc_classifier, name="Extra Trees Classifier")

In [None]:
# Learning curve (training vs. cross-validation accuracy) for the model.
our_learning_curve(model=etc_classifier, name="Extra Trees Classifier")

> The model achieves a training score of almost 1, indicating overfitting.


> The cross-validation score increases moderately and plateaus at about 0.8, indicating that the model does not improve further with more data.




In [None]:
# Train vs. test precision-recall curves for the fitted model.
our_prc(model=etc_classifier, name="Extra Trees Classifier")



> The training performance is nearly perfect, indicating that the model fits the training data too well, which is a sign of overfitting.



> The test performance shows that the model failed to generalize to unseen data for the "Attrition" class.





# **Explainability Techniques**

**examples only, use any**

## LIME Analysis

In [None]:
# Build a LIME explainer from the training features. LIME explains a prediction
# by sampling perturbed rows, so random_state is fixed to make the explanation
# (and the commentary written about it) reproducible across runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.to_numpy(),
    feature_names=X_train.columns.tolist(),
    class_names=['No Attrition', 'Attrition'],
    mode='classification',
    random_state=42
)

# The first test row is the instance being explained.
instance = X_test.iloc[0]

# Score it as a one-row DataFrame so the model receives the same feature names
# it was fitted with; [0][1] is the probability of class index 1.
pred_prob = etc_classifier.predict_proba(X_test.iloc[[0]])[0][1]

exp = explainer.explain_instance(instance.values, etc_classifier.predict_proba, num_features=10)

In [None]:
# Fetch the (feature rule, weight) pairs for class index 1 once and split them.
lime_pairs = exp.as_list(label=1)
values = [v for _, v in lime_pairs]
features = [f for f, _ in lime_pairs]
# Green for positive weights, red for zero or negative weights.
colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]

# Wrap long feature rules so the y tick labels stay readable.
wrapped_features = [textwrap.fill(f, width=30) for f in features]

fig, ax = plt.subplots(figsize=(12, 10))
y_pos = np.arange(len(wrapped_features))
bars = ax.barh(y_pos, values, color=colors, alpha=0.8)

ax.set_yticks(y_pos)
ax.set_yticklabels(wrapped_features, fontsize=11)
ax.invert_yaxis()
ax.set_xlabel('Feature Impact', fontsize=12)
ax.set_title(f'LIME Explanation\n(Predicted Probability: {pred_prob:.2f})', fontsize=14, weight='bold', pad=15)

# Print each weight just beyond the end of its bar.
for i, v in enumerate(values):
    ax.text(v + 0.005 if v > 0 else v - 0.005, i, f"{v:.2f}",
            va='center', ha='left' if v > 0 else 'right', fontsize=10)

plt.tight_layout()
# Save BEFORE showing: once plt.show() has rendered the figure in a notebook,
# a later plt.savefig() writes an empty canvas.
fig.savefig('LIME - Extra Trees Classifier.jpg')
plt.show()

**The 69% is the Attrition risk.**

> Green bars are the features that increase the chance of leaving, while red bars decrease it.

> StockOptionLevel (<=0) has the largest impact on pushing an employee to leave. JobLevel (<=1) and JobSatisfaction (<=1) are also strong flags that push the employee toward leaving the company.


> workload_score (<=1): a lower workload decreases the probability of leaving. JobInvolvement (<=3): a lower job involvement decreases the probability of leaving. The same interpretation applies to the other columns.



> Some interpretations might not be really meaningful because the model is overfit.












## SHAP Analysis

In [None]:
explainer = shap.TreeExplainer(etc_classifier)

# Explain at most the first 100 test rows to keep the computation short.
sample_size = min(100, len(X_test))
X_sample = X_test.iloc[:sample_size]

shap_values = explainer.shap_values(X_sample)

# The return format depends on the SHAP version: either a list with one
# (rows, features) array per class, or a single array that is 3-D
# (rows, features, classes) or already 2-D. A list cannot be indexed with
# [:, :, 1], so it is handled explicitly.
if isinstance(shap_values, list):
    shap_values_attrition = shap_values[1]
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        shap_values_attrition = shap_array[:, :, 1]
    elif shap_array.ndim == 2:
        shap_values_attrition = shap_array
    else:
        # Fail loudly instead of hitting a NameError further down.
        raise ValueError(f"Unexpected SHAP values shape: {shap_array.shape}")

plt.figure(figsize=(12, 8))
shap.summary_plot(
    shap_values_attrition,
    X_sample,
    feature_names=X_train.columns.tolist(),
    plot_type="dot",
    show=False
)
plt.title("SHAP Feature Importance for Attrition Prediction", pad=20)
plt.tight_layout()
# Save BEFORE showing: once plt.show() has rendered the figure in a notebook,
# a later plt.savefig() writes an empty canvas.
plt.savefig('SHAP - Extra Trees Classifier.jpg')
plt.show()


> Each dot represents a person, and the color shows whether their value for this feature is high (red) or low (blue).

> StockOptionLevel is the most influential factor: people with low stock options (blue dots) are more likely to leave, while those with high stock options are more likely to stay.



> On the other hand, MaritalStatus has a relatively small impact: the spread of its dots is narrow, meaning that MaritalStatus doesn't strongly push the prediction toward staying or leaving.





## Feature Importance (Model-Specific)

In [None]:
# Pair each feature with the model's built-in importance and rank from highest to lowest.
feature_names = X_train.columns
importances = etc_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the model's built-in feature importances.
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importances - Extra Trees Classifier')
plt.tight_layout()
# Save BEFORE showing: once plt.show() has rendered the figure in a notebook,
# a later plt.savefig() writes an empty canvas.
plt.savefig('Feature Importance - Extra Trees Classifier.jpg')
plt.show()



> Built-in feature importance ranked from the most important to the lowest.



## Partial Dependence Plots

In [None]:
# The nine features with the highest built-in importance, one per grid cell.
top_9_features = feature_importance_df.head(9)['Feature'].tolist()

n_cols = 3
n_rows = 3
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3.5*n_rows))
axes = axes.flatten()

for i, (ax, feature_name) in enumerate(zip(axes, top_9_features)):
    disp = PartialDependenceDisplay.from_estimator(
        etc_classifier,
        X_test[X_train.columns],
        features=[X_train.columns.get_loc(feature_name)],
        target=1,
        ax=ax,
        line_kw={'color':'#1f77b4', 'lw':2},
        ice_lines_kw={'color':'#1f77b4', 'alpha':0.05, 'lw':0.5},
        pd_line_kw={'color':'#ff7f0e', 'alpha':0.3}
    )

    # When a single Axes is passed, scikit-learn treats it as a bounding box,
    # switches its axis off and draws the curve on a new inner Axes. Labels,
    # grid and tick styling must therefore go on the inner Axes to be visible.
    plot_ax = np.ravel(disp.axes_)[0]
    plot_ax.set_title(f"{feature_name}", pad=8, fontsize=11, fontweight='bold')
    plot_ax.set_xlabel("Feature Value", fontsize=9)
    plot_ax.set_ylabel("Attrition Prob" if i%n_cols==0 else "", fontsize=9)
    plot_ax.grid(True, linestyle=':', alpha=0.3)

    plt.setp(plot_ax.get_xticklabels(), rotation=45, ha='right')

# Remove grid cells that received no feature (does not rely on the loop variable).
for unused_ax in axes[len(top_9_features):]:
    fig.delaxes(unused_ax)

# State the number of features actually plotted instead of a hard-coded "Top 10".
plt.suptitle(f"Partial Dependence Plots - Top {len(top_9_features)} Features", y=1.02, fontsize=14, fontweight='bold')
plt.tight_layout()
# Save BEFORE showing: once plt.show() has rendered the figure in a notebook,
# a later plt.savefig() writes an empty canvas. bbox_inches='tight' keeps the
# suptitle, which sits above the default canvas (y=1.02), inside the image.
fig.savefig('PDP - Extra Trees Classifier.jpg', bbox_inches='tight')
plt.show()



> As the stock option level increases from 0 to 3, the predicted probability of attrition decreases from 0.34 to 0.22. From this we can conclude that a higher StockOptionLevel reduces employee attrition.



> StockOptionLevel and JobSatisfaction change the probability the most, dropping it from 0.34 to 0.22 as their values increase. JobLevel and JobInvolvement also show notable changes.








## Permutation Feature Importance

In [None]:
# Permutation importance on the test split: each feature is shuffled 10 times
# (fixed seed) and the mean change in the model's score is recorded.
pfi_inputs = X_test[X_train.columns]
result = permutation_importance(
    etc_classifier, pfi_inputs, y_test,
    n_repeats=10, random_state=42
)

# Sort ascending so the largest bar ends up at the top of the horizontal chart.
mean_importance = result.importances_mean
sorted_idx = mean_importance.argsort()
ordered_names = X_train.columns[sorted_idx]
ordered_scores = mean_importance[sorted_idx]

plt.barh(ordered_names, ordered_scores)
plt.title("Permutation Feature Importance")
plt.tight_layout()
plt.savefig('PFI - Extra Trees Classifier.jpg')

The PFI method estimates how important a feature is by measuring how much the model's performance changes when the values of that feature are shuffled, which breaks its connection to the target variable. Features like StockOptionLevel and workload_score have the most impact, while features like Department and MonthlyRate have the least impact.