In [None]:
import io
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
import torch
import shap
    
warnings.filterwarnings("ignore", category=ConvergenceWarning)




%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_columns", 80)
pd.set_option("display.max_rows", 80)

In [None]:
# Load the attrition CSV into `df` and preview its first rows.
df = pd.read_csv("IBM Dataset 1.csv")
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics of `df` (pandas defaults); shown as the cell output.
df.describe()

# NOTE(review): the line below is a bare list of column names with no stated purpose — confirm its intent or remove it.
#'Age', 'JobLevel', 'StockOptionLevel', 'YearsWithCurrManager', 'YearsSinceLastPromotion', 'NumCompaniesWorked', 'YearsAtCompany', 'PerformanceRating', 'PercentSalaryhike' 

In [None]:
# Class-imbalance bar chart: number and share of employees per Attrition outcome.

df = pd.read_csv('IBM Dataset 1.csv')
df['Attrition'] = df['Attrition'].map({'No': 0, 'Yes': 1})

# Counts looked up by encoded label: 0 -> 'No', 1 -> 'Yes'.
attrition_counts = df['Attrition'].value_counts()
labels = ['No', 'Yes']
counts = [attrition_counts[code] for code in (0, 1)]

total = sum(counts)
percentages = [count / total * 100 for count in counts]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, counts, color=['lightblue', 'lightcoral'], alpha=0.7, label=labels)

# Write "<count> (<share>%)" in the middle of each bar.
for position, bar in enumerate(bars):
    caption = f'{counts[position]} ({percentages[position]:.1f}%)'
    x_center = bar.get_x() + bar.get_width() / 2
    y_center = bar.get_height() / 2
    ax.text(x_center, y_center, caption,
            ha='center', va='center', fontsize=14, fontweight='bold', color='black')

ax.set_title('Employee Attrition: Count and Percentage', fontsize=14)
ax.set_xlabel('Attrition Outcome', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.legend(bars, ['No Attrition', 'Attrition'], loc='upper right', fontsize=12)

plt.tight_layout()
#plt.savefig('attrition_bar_chart.png')
plt.show()


In [None]:
# Heatmap of the pairwise correlations between the numeric columns
# (Attrition is encoded as 0/1 so that it takes part in the matrix).

df = pd.read_csv('IBM Dataset 1.csv')
df['Attrition'] = df['Attrition'].map({'No': 0, 'Yes': 1})
df = df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'])

# Names of the columns that take part in the numeric correlation.
col = df.corr(numeric_only=True).Attrition.index
heatmap_values = df[col].corr()

plt.figure(figsize=(20, 20))
sns.heatmap(heatmap_values, annot=True, cmap='seismic', annot_kws={'size': 12}, center=0)
plt.title('Correlation Matrix of Numerical Features Correlated with Attrition', fontsize=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout(pad=2)
#plt.savefig('correlation_matrix.png')


In [None]:
# Top 10 numerical variables ranked by absolute correlation with Attrition

df = pd.read_csv('IBM Dataset 1.csv')
df['Attrition'] = df['Attrition'].map({'No': 0, 'Yes': 1})
df = df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'])

# Signed correlation of every numeric column with the 0/1 target (target itself removed).
correlation_matrix = df.corr(numeric_only=True)
attrition_corr = correlation_matrix['Attrition'].drop('Attrition')

print("Correlation values with Attrition:\n", attrition_corr)

# Rank by magnitude; the sign is discarded for the chart.
sorted_attrition_corr = attrition_corr.abs().sort_values(ascending=False).head(10)
top_features = sorted_attrition_corr.index
correlations = sorted_attrition_corr.values

plt.figure(figsize=(10, 6), facecolor='white')
ax = plt.gca()
ax.set_facecolor('white')
sns.set_style("whitegrid")
sns.barplot(x=correlations, y=top_features, palette='pastel', orient='h')
ax.set_xlabel('Absolute Correlation with Attrition', loc='center', color='black')
ax.set_ylabel('Features', loc='center', color='black')
ax.set_title('Top 10 Features based on Correlation with Employee Attrition', loc='left', color='black')
ax.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()

#plt.savefig('top_features_attrition.png', facecolor='white', bbox_inches='tight')
plt.show()


In [None]:
def cramers_v(x, y):
    """Return Cramér's V, a 0-to-1 measure of association between two categorical variables.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where chi2 is the Pearson
    chi-square statistic of the x-by-y contingency table and n is the number
    of observations.

    Args:
        x: Array-like or Series of category labels.
        y: Array-like or Series of category labels, same length as ``x``.

    Returns:
        float: Cramér's V, or ``nan`` when either variable has fewer than two
        observed levels (the statistic is undefined in that case).
    """
    contingency_table = pd.crosstab(x, y)

    # Levels of the less-varied variable, minus one (the denominator's scale factor).
    min_dim = min(contingency_table.shape) - 1
    if min_dim < 1:
        # Degenerate table: return nan explicitly rather than via a 0/0 division warning.
        return float("nan")

    # correction=False: SciPy applies Yates' continuity correction to 2x2 tables by
    # default, which shrinks chi2 and makes V for binary variables (e.g. a perfectly
    # associated pair scores 0.9 instead of 1) incomparable with V for larger tables.
    chi2 = chi2_contingency(contingency_table, correction=False)[0]

    n = contingency_table.to_numpy().sum()  # Total number of observations
    return float(np.sqrt(chi2 / (n * min_dim)))

# Categorical columns whose association with Attrition is measured.
categorical_variables = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

# One record per variable, computed against the current `df['Attrition']`.
cramers_v_results = [
    {'Variable': column, "Cramér's V": cramers_v(df['Attrition'], df[column])}
    for column in categorical_variables
]

# Strongest association first.
cramers_v_df = pd.DataFrame(cramers_v_results).sort_values(by="Cramér's V", ascending=False)
print(cramers_v_df)

#cramers_v_df.to_csv('cramers_v_results.csv', index=False)


In [None]:
# Function to create bins dynamically, group data, and plot bar charts with counts

def plot_numerical_attrition_with_duplicates_handling(data, column, bins=None, bin_labels=None):
    """
    Plot 'No'/'Yes' Attrition counts per value (or per bin) of ``column`` and return the summary table.

    Numeric columns with more than 10 distinct values are binned first; every other
    column is grouped on its raw values. ``data`` is never modified: the temporary
    ``<column>Bins`` grouping column lives on an internal copy, so the function can be
    called repeatedly (for example in a loop over ``data.columns``) without helper
    columns accumulating on the caller's frame.

    Args:
        data (pd.DataFrame): Dataset containing ``column`` and an 'Attrition' column
            holding the strings 'No' and 'Yes'.
        column (str): The column to analyze.
        bins (list, optional): Bin edges for high-cardinality numeric columns. If None,
            decile edges are computed with ``pd.qcut`` (duplicate edges dropped).
        bin_labels (list, optional): Labels for user-supplied ``bins``. Ignored when
            ``bins`` is None; "<low>-<high>" labels are then generated from the
            integer-truncated edges, falling back to pandas' interval labels if
            truncation makes two labels identical.

    Returns:
        pd.DataFrame: One row per group with the group column, 'No', 'Yes', 'Total',
        '% Attrition in Cluster' and '% of Total Attrition'.

    Side effects:
        Shows the bar chart and writes the returned table to
        'attrition_distribution_values_by_<column>.csv' in the working directory.
    """
    values = data[column]

    # Only numeric columns can be cut into bins; a high-cardinality text column
    # is grouped on its raw values instead of failing inside qcut.
    if len(values.unique()) > 10 and pd.api.types.is_numeric_dtype(values):
        if bins is None:
            bins = pd.qcut(values, q=10, duplicates='drop', retbins=True)[1]  # Deciles with duplicate handling
            bin_labels = [f"{int(low)}-{int(high)}" for low, high in zip(bins[:-1], bins[1:])]
            if len(set(bin_labels)) != len(bin_labels):
                # pd.cut rejects duplicate labels; let it produce interval labels instead.
                bin_labels = None

        group_column = f"{column}Bins"
        binned = pd.cut(values, bins=bins, labels=bin_labels, include_lowest=True)
        frame = data.assign(**{group_column: binned})  # new frame; caller's data untouched
    else:
        group_column = column
        frame = data

    # observed=False keeps empty bins in the table (and states the categorical
    # grouping behavior explicitly instead of relying on the pandas default).
    grouped_data = (
        frame.groupby([group_column, 'Attrition'], observed=False)
        .size()
        .unstack(fill_value=0)
    )

    grouped_data['Total'] = grouped_data.sum(axis=1)
    grouped_data['% Attrition in Cluster'] = ((grouped_data['Yes'] / grouped_data['Total']) * 100).round(1)
    grouped_data['% of Total Attrition'] = ((grouped_data['Yes'] / grouped_data['Yes'].sum()) * 100).round(1)

    grouped_data = grouped_data.reset_index()

    # Long format (one row per group and outcome) for seaborn's hue handling.
    plot_data = grouped_data.melt(
        id_vars=[group_column],
        value_vars=['No', 'Yes'],
        var_name='Attrition',
        value_name='Count'
    )

    custom_palette = ['lightblue', 'lightcoral']
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=plot_data, x=group_column, y='Count', hue='Attrition', dodge=True, palette=custom_palette)

    # Print the count above every non-empty bar.
    for p in ax.patches:
        height = p.get_height()
        if height > 0:
            ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                        ha='center', va='bottom', fontsize=9, color='black')

    plt.title(f'Attrition Distribution by {column}')
    plt.xlabel(column)
    plt.ylabel('Number of Employees')
    plt.xticks(rotation=45)

    # Hue order follows value_vars above: 'No' first, then 'Yes'.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles, ['No Attrition', 'Attrition'], title='Attrition', loc='upper right', frameon=True)

    #plt.savefig(f'attrition_distribution_by_{column}.png', format='png', dpi=300)

    plt.show()
    grouped_data.to_csv(f'attrition_distribution_values_by_{column}.csv')

    return grouped_data

    
def load_and_prepare_data(file_path):
    """
    Read a CSV file and return it without a fixed set of columns.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The loaded data minus 'EmployeeNumber', 'StandardHours',
        'Over18' and 'EmployeeCount'; columns missing from the file are skipped.
    """
    unwanted = ['EmployeeNumber', 'StandardHours', 'Over18', 'EmployeeCount']
    return pd.read_csv(file_path).drop(columns=unwanted, errors='ignore')

# Build `cleaned_data` from the CSV via load_and_prepare_data.
file_path =  'IBM Dataset 1.csv'
cleaned_data = load_and_prepare_data(file_path)


In [None]:
# Draw the attrition breakdown for every column except the target itself.
for col in [name for name in cleaned_data.columns if name != "Attrition"]:
    plot_numerical_attrition_with_duplicates_handling(cleaned_data, column=col)



In [None]:
# Gender breakdown on its own; as the last expression of the cell, the returned summary table is displayed too.
plot_numerical_attrition_with_duplicates_handling(cleaned_data, column="Gender")

In [None]:
# Feature importance (SHAP) for an XGBoost classifier.
# The load / feature-engineering / encoding / split steps of this cell are the
# same ones used by the MLP and TabNet cells below.

data = pd.read_csv("IBM Dataset 1.csv")

# errors='ignore' tolerates any of these columns being absent.
data = data.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'], errors='ignore')

# (new column name, function computing it from the frame).
# The "+ 1" in each denominator presumably guards against division by zero.
engineered_features = [
    ('IncomePerJobLevel', lambda df: df['MonthlyIncome'] / (df['JobLevel'] + 1)),
    ('TotalWorkingYearsToJobLevelRatio', lambda df: df['TotalWorkingYears'] / (df['JobLevel'] + 1)),
    ('YearsAtCompanyToAgeRatio', lambda df: df['YearsAtCompany'] / (df['Age'] + 1)),
    ('YearsAtCompanyToYearsInCurrentRoleRatio', lambda df: df['YearsAtCompany'] / (df['YearsInCurrentRole'] + 1))
]

for name, func in engineered_features:
    data[name] = func(data)

# One-hot encode the nominal columns. dtype=int keeps the dummies numeric:
# pandas >= 2.0 otherwise emits bool columns, and a frame mixing bool with
# int/float columns becomes an object-dtype array when converted with `.values`.
nominal_columns = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True, dtype=int)

# Binary / ordinal string columns -> integer codes.
data['Attrition'] = data['Attrition'].map({'No': 0, 'Yes': 1})
data['OverTime'] = data['OverTime'].map({'No': 0, 'Yes': 1})
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['BusinessTravel'] = data['BusinessTravel'].map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})

X_processed = data.drop(columns=['Attrition'], errors='ignore')
y_processed = data['Attrition']

# Stratified 80/20 split; only the training part is oversampled below.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

xgb_model = xgb.XGBClassifier(
    n_estimators=189,
    max_depth=4,
    learning_rate=0.4214,
    scale_pos_weight=1,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train_smote, y_train_smote)

# The resampled training set is handed to SHAP as its reference data;
# explanations are computed for the untouched test rows.
explainer_xgb = shap.Explainer(xgb_model, X_train_smote)
shap_values_xgb = explainer_xgb(X_test)



In [None]:
# Feature importance (SHAP) for an MLP classifier.
# The load / feature-engineering / encoding / split steps of this cell are the
# same ones used by the XGBoost cell above and the TabNet cell below.

data = pd.read_csv("IBM Dataset 1.csv")

# errors='ignore' tolerates any of these columns being absent.
data = data.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'], errors='ignore')

# (new column name, function computing it from the frame).
# The "+ 1" in each denominator presumably guards against division by zero.
engineered_features = [
    ('IncomePerJobLevel', lambda df: df['MonthlyIncome'] / (df['JobLevel'] + 1)),
    ('TotalWorkingYearsToJobLevelRatio', lambda df: df['TotalWorkingYears'] / (df['JobLevel'] + 1)),
    ('YearsAtCompanyToAgeRatio', lambda df: df['YearsAtCompany'] / (df['Age'] + 1)),
    ('YearsAtCompanyToYearsInCurrentRoleRatio', lambda df: df['YearsAtCompany'] / (df['YearsInCurrentRole'] + 1))
]

for name, func in engineered_features:
    data[name] = func(data)

# One-hot encode the nominal columns. dtype=int keeps the dummies numeric:
# pandas >= 2.0 otherwise emits bool columns, and a frame mixing bool with
# int/float columns becomes an object-dtype array when converted with `.values`.
nominal_columns = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True, dtype=int)

# Binary / ordinal string columns -> integer codes.
data['Attrition'] = data['Attrition'].map({'No': 0, 'Yes': 1})
data['OverTime'] = data['OverTime'].map({'No': 0, 'Yes': 1})
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['BusinessTravel'] = data['BusinessTravel'].map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})

X_processed = data.drop(columns=['Attrition'], errors='ignore')
y_processed = data['Attrition']

# Stratified 80/20 split; only the training part is oversampled below.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# NOTE(review): the features reach the network unscaled in this cell — confirm
# that matches the setup these hyperparameters were tuned with.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64, 32, 16),
    activation='tanh',
    solver='adam',
    alpha=1.7442758481025684e-05,
    learning_rate_init=0.0008722872705873338,
    random_state=42
)
mlp_model.fit(X_train_smote, y_train_smote)

# SHAP explains the hard 0/1 output of `predict`, with the resampled training
# set as reference data.
# NOTE(review): `predict_proba` would give graded attributions — confirm intent.
explainer_mlp = shap.Explainer(mlp_model.predict, X_train_smote)
shap_values_mlp = explainer_mlp(X_test)




In [None]:
#Feature Importance Analysis SHAP using TabNet

class F1WeightedMetric(Metric):
    """Weighted F1 score wrapped as a pytorch-tabnet ``Metric`` named "f1_score" (higher is better)."""

    def __init__(self):
        self._name = "f1_score"
        self._maximize = True

    def __call__(self, y_true, y_score):
        """Return the weighted F1 of ``y_true`` against thresholded scores.

        Column 1 of ``y_score`` is cut at 0.5 to obtain 0/1 predictions.
        # assumes y_score is a 2-D (n_samples, n_classes) score array — TODO confirm
        """
        second_column = y_score[:, 1]
        y_pred = np.where(second_column > 0.5, 1, 0)
        return f1_score(y_true, y_pred, average="weighted")
    
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# The load / feature-engineering / encoding / split steps below are the same
# ones used by the XGBoost and MLP cells above.
data = pd.read_csv("IBM Dataset 1.csv")

# errors='ignore' tolerates any of these columns being absent.
data = data.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'], errors='ignore')

# (new column name, function computing it from the frame).
# The "+ 1" in each denominator presumably guards against division by zero.
engineered_features = [
    ('IncomePerJobLevel', lambda df: df['MonthlyIncome'] / (df['JobLevel'] + 1)),
    ('TotalWorkingYearsToJobLevelRatio', lambda df: df['TotalWorkingYears'] / (df['JobLevel'] + 1)),
    ('YearsAtCompanyToAgeRatio', lambda df: df['YearsAtCompany'] / (df['Age'] + 1)),
    ('YearsAtCompanyToYearsInCurrentRoleRatio', lambda df: df['YearsAtCompany'] / (df['YearsInCurrentRole'] + 1))
]

for name, func in engineered_features:
    data[name] = func(data)

# One-hot encode the nominal columns. dtype=int is required here: pandas >= 2.0
# otherwise emits bool dummies, `.values` on the mixed bool/int/float frame then
# yields an object-dtype array, and that cannot be converted to a torch tensor.
nominal_columns = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True, dtype=int)

# Binary / ordinal string columns -> integer codes.
data['Attrition'] = data['Attrition'].map({'No': 0, 'Yes': 1})
data['OverTime'] = data['OverTime'].map({'No': 0, 'Yes': 1})
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['BusinessTravel'] = data['BusinessTravel'].map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})

X_processed = data.drop(columns=['Attrition'], errors='ignore')
y_processed = data['Attrition']

# Stratified 80/20 split; only the training part is oversampled below.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed)

borderline_smote = BorderlineSMOTE(random_state=42)
X_train_smote, y_train_smote = borderline_smote.fit_resample(X_train, y_train)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


tabnet_model = TabNetClassifier(
    n_d=62,
    n_a=41,
    n_steps=2,
    gamma=1.2628907527283806,
    lambda_sparse=0.003385618571289165,
    optimizer_fn=torch.optim.RMSprop,
    optimizer_params=dict(lr=2e-3),
    device_name=device.type,  # TabNet documents this argument as a string ("cpu" / "cuda")
    n_independent=4,
    n_shared=2,
    seed=42
)

# NOTE(review): no eval_set / eval_metric is passed, so `patience` has nothing to
# monitor and F1WeightedMetric is not used by this call — confirm intent.
tabnet_model.fit(
    X_train_smote.values, y_train_smote.values,
    max_epochs=100,
    patience=10,
    batch_size=64,
)

# TabNet is fed NumPy arrays; X_test_np was previously used without ever being defined.
X_test_np = X_test.values
y_pred = tabnet_model.predict(X_test_np)

