**Load Dataset**

In [None]:
import pandas as pd


In [None]:
# Location of the raw attrition CSV.
# NOTE(review): machine-specific absolute path — point this at your local copy.
DATA_PATH = r"G:\Guvi-Datascience\Project\Employee Attrition_Project 3\Employee-Attrition - Employee-Attrition.csv"

# Raw working DataFrame; every later cell reads from (and some reassign) this name.
empdata = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Display the loaded frame (the notebook renders the bare last expression).
empdata

In [None]:
# Missing-value count per column (isna is the canonical name; isnull is its alias).
empdata.isna().sum()

In [None]:
# Print the column dtypes, non-null counts and memory footprint of the frame.
empdata.info()


In [None]:
# Number of fully duplicated rows. The per-row boolean Series returned by
# duplicated() alone cannot be read at a glance; a single count answers the
# question "does the data contain duplicates?" directly.
empdata.duplicated().sum()

In [None]:
# Drop columns that are not used as predictors.
# NOTE(review): the original comment describes these as constant columns;
# 'EmployeeNumber' is presumably a per-row identifier rather than a constant —
# confirm against the nunique() output below.
remove_cols = ['EmployeeCount', 'StandardHours', 'Over18','EmployeeNumber']

# Non-inplace drop with errors="ignore" keeps the cell idempotent: re-running
# it after the columns are already gone is a no-op instead of a KeyError.
empdata = empdata.drop(columns=remove_cols, errors="ignore")


In [None]:
# Distinct-value count per column; the EDA section below uses a threshold of
# 10 distinct values to split columns into numerical vs. categorical.
empdata.nunique()

**EDA**


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:

# Separate numerical and categorical columns by cardinality (not by dtype):
#   * 10 or more distinct values -> treated as numerical
#   * fewer than 10 distinct values -> treated as categorical, which also
#     captures low-cardinality integer columns
distinct_counts = empdata.nunique()

num_cols = distinct_counts.index[distinct_counts >= 10].tolist()
cat_cols = distinct_counts.index[distinct_counts < 10].tolist()

print("\nNumerical Columns:\n", num_cols)
print("\nCategorical Columns:\n", cat_cols)


**Univariate Analysis**

In [None]:

# UNIVARIATE ANALYSIS
# Target variable: Attrition

plt.figure(figsize=(6, 4))
sns.countplot(data=empdata, x='Attrition')
# The target is still string-labelled at this stage (it is only encoded into
# 'Attrition_Yes' much later), so the legend in the title must use the labels
# that appear on the x-axis rather than 0/1.
# NOTE(review): assumes the two labels are 'No' and 'Yes' — confirm.
plt.title("Attrition Count (No = Stayed, Yes = Left)")
plt.xlabel("Attrition")
plt.ylabel("Count")
plt.show()

# Class balance as percentages; drives the later choice of resampling/metrics.
print("\nAttrition Value Counts (%):")
print(empdata['Attrition'].value_counts(normalize=True) * 100)


The attrition variable is highly imbalanced, with significantly fewer employees leaving than staying, which necessitates the use of resampling techniques like SMOTE and evaluation metrics such as Recall and ROC-AUC instead of accuracy.

In [None]:
# UNIVARIATE ANALYSIS - Numerical (Hist + Boxplot + Stats)
# For every high-cardinality column: print mean/median/skewness, then draw a
# histogram (with KDE) and a boxplot side by side.

for col in num_cols:
    values = empdata[col]

    # ---- Statistics ----
    mean_val = values.mean()
    median_val = values.median()

    # Mean-median gap relative to the magnitude of the mean. Dividing by the
    # signed mean would report a negative percentage for negative means and
    # inf/NaN (with a runtime warning) for a zero mean, so guard both cases.
    if mean_val == 0:
        percent_diff = float("nan")
    else:
        percent_diff = abs(mean_val - median_val) / abs(mean_val) * 100

    print("Column:", col)
    print("====================")
    print("Mean:", round(mean_val, 2),
          " | Median:", round(median_val, 2))
    print("Mean-Median Difference:", round(percent_diff, 2), "%")
    print("Skewness:", round(values.skew(), 2))
    print("\n")

    # ---- Plots side by side ----
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: histogram + KDE
    sns.histplot(values, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Histogram of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")

    # Right panel: boxplot
    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f"Boxplot of {col}")
    box_ax.set_xlabel(col)

    fig.tight_layout()
    plt.show()


Most tenure and experience-related variables are right-skewed with notable outliers, indicating that the workforce is dominated by early-to-mid tenure employees, while long-tenured employees form a small minority, making these features important for attrition analysis.

In [None]:
import math

# Every low-cardinality column except the target gets its own count plot.
cat_cols_no_attr = [name for name in cat_cols if name != 'Attrition']

# Grid geometry: fixed number of columns, as many rows as the features need.
cols = 3
n = len(cat_cols_no_attr)
rows = math.ceil(n / cols)

fig = plt.figure(figsize=(18, rows * 5))

for position, feature in enumerate(cat_cols_no_attr, start=1):
    ax = fig.add_subplot(rows, cols, position)
    sns.countplot(data=empdata, x=feature, ax=ax)
    ax.set_title(f"Count Plot - {feature}")
    ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
plt.show()


The workforce is concentrated in specific departments, job roles, and satisfaction levels, with limited variation in performance ratings and work–life balance, indicating structured organizational policies that may influence employee attrition.

**Bivariate Analysis** 

In [None]:
# Numerical vs Attrition
# One boxplot per numerical column, split by the Attrition label.

# Grid layout: 3 plots per row, rows derived from the number of columns.
cols = 3
n = len(num_cols)
rows = math.ceil(n / cols)

fig = plt.figure(figsize=(18, rows * 5))

for position, num in enumerate(num_cols, start=1):
    ax = fig.add_subplot(rows, cols, position)
    sns.boxplot(data=empdata, x="Attrition", y=num, ax=ax)
    ax.set_title(f"{num} vs Attrition")
    ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
plt.show()

Employees with lower income, lesser experience, shorter tenure, and longer commuting distances show higher attrition, while compensation rates alone exhibit limited discriminatory power.

In [None]:
# Categorical vs Attrition
# Count plot per categorical column, with bars split (hue) by Attrition.

# The target itself is excluded from the features being plotted.
cat_cols_no_attr = [name for name in cat_cols if name != "Attrition"]

# Grid layout: 3 plots per row, rows derived from the number of features.
cols = 3
n = len(cat_cols_no_attr)
rows = math.ceil(n / cols)

fig = plt.figure(figsize=(20, rows * 5))

for position, feature in enumerate(cat_cols_no_attr, start=1):
    ax = fig.add_subplot(rows, cols, position)
    sns.countplot(data=empdata, x=feature, hue="Attrition", ax=ax)
    ax.set_title(f"{feature} vs Attrition")
    ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
plt.show()

Higher attrition is observed among employees with frequent travel, overtime work, lower job involvement and satisfaction, lower job levels, and limited stock options, highlighting work conditions and career growth as key drivers of attrition.

**Correlation** 

In [None]:
# Pairwise correlation matrix of the high-cardinality (numerical) columns.
corr = empdata[num_cols].corr()

# Shared styling for the annotated heatmap.
heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    square=True,
)

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, ax=ax, **heatmap_style)
ax.set_title("Correlation Heatmap of Selected Features", fontsize=16)
plt.show()

Tenure and experience-related features exhibit strong positive intercorrelations indicating multicollinearity, while compensation rates and distance-related variables show weak linear relationships with other features.

**Chi-Square Contingency**

In [None]:

from scipy.stats import chi2_contingency

# Chi-square test of independence between each string-typed feature and the
# Attrition label.

# Use a dedicated name for the tested features so the cardinality-based
# `cat_cols` list built in the EDA section is not overwritten (re-running an
# earlier plotting cell would otherwise silently use a different column set).
chi_features = [
    c for c in empdata.select_dtypes(include='object').columns
    if c != 'Attrition'
]

chi_results = []

for col in chi_features:
    # Observed frequency table: feature level x attrition label
    contingency_table = pd.crosstab(empdata[col], empdata['Attrition'])

    # chi2_contingency also returns the degrees of freedom and the expected
    # frequencies; only the statistic and p-value are reported here.
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    chi_results.append({
        'Feature': col,
        'Chi2_Statistic': chi2,
        'p_value': p_value,  # kept unrounded until after sorting
        'Significant (p<0.05)': 'Yes' if p_value < 0.05 else 'No'
    })

# Sort on the raw p-values first, then round for display. Rounding before the
# sort collapses every highly significant feature to 0.0 and loses their order.
chi_square_df = (
    pd.DataFrame(
        chi_results,
        columns=['Feature', 'Chi2_Statistic', 'p_value', 'Significant (p<0.05)'],
    )
    .sort_values(by='p_value')
    .reset_index(drop=True)
    .round({'Chi2_Statistic': 3, 'p_value': 5})
)

chi_square_df


Chi-square analysis shows that job-related factors such as OverTime, JobRole, and BusinessTravel are significantly associated with attrition, while Gender has no statistically significant impact.

**ANOVA**

In [None]:
from scipy.stats import f_oneway
import pandas as pd
import numpy as np

# Resolve the grouping column: prefer an exact 'Attrition' column, otherwise
# fall back to the first column whose name contains 'Attrition'
# (e.g. an encoded 'Attrition_Yes').
if 'Attrition' in empdata.columns:
    target_col = 'Attrition'
else:
    target_col = next((c for c in empdata.columns if 'Attrition' in c), None)
    if target_col is None:
        raise KeyError("No column containing 'Attrition' found in empdata")

# All numeric-dtype columns are tested, except the target when it is numeric.
num_cols = [
    c for c in empdata.select_dtypes(include=[np.number]).columns
    if c != target_col
]

# Group labels are taken as-is from the data ('Yes'/'No' or 0/1).
group_labels = empdata[target_col].dropna().unique().tolist()

anova_results = []

for col in num_cols:
    # Feature values per target group (NaNs still included at this point)
    per_group = {
        g: empdata.loc[empdata[target_col] == g, col] for g in group_labels
    }

    # Only groups with more than one observed value take part in the test
    samples = []
    for series in per_group.values():
        observed = series.dropna().values
        if len(observed) > 1:
            samples.append(observed)

    # One-way ANOVA needs at least two usable groups; skip the feature otherwise
    if len(samples) < 2:
        continue

    f_stat, p_val = f_oneway(*samples)

    row = {
        'Feature': col,
        'F_statistic': float(f_stat),
        'p_value': float(p_val),
    }
    # Per-group mean of the feature, reported next to the test result
    for g in group_labels:
        row[f'Mean_{str(g)}'] = float(per_group[g].mean())

    anova_results.append(row)

# Most significant features (smallest p-value) first
anova_df = (
    pd.DataFrame(anova_results)
    .sort_values('p_value')
    .reset_index(drop=True)
)

# Show every column of the result table, then the top rows
pd.set_option('display.max_columns', None)
print(f"Target column used for grouping: {target_col}")
anova_df.head(50)


ANOVA results indicate that experience, tenure, income, job level, and satisfaction-related variables significantly influence attrition, while compensation rates, performance rating, and education show no meaningful impact.

**Selected Features**

In [None]:
# First-pass shortlist of predictors. The "categorical" list mixes string
# columns with low-cardinality integer columns; the numerical list is reused
# by the multicollinearity heatmap in the next cell.
# NOTE(review): presumably chosen from the chi-square / ANOVA tables above —
# the selection itself is manual, not computed.
s_categorical_features = [
    'BusinessTravel', 'Department', 'EducationField',
    'EnvironmentSatisfaction', 'JobInvolvement',
    'JobRole', 'JobSatisfaction', 'MaritalStatus',
    'OverTime', 'StockOptionLevel', 'WorkLifeBalance', 'JobLevel'
]

s_numerical_features = [
    'Age', 'DailyRate', 'DistanceFromHome', 'MonthlyIncome',
    'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsWithCurrManager', 'YearsSinceLastPromotion',
    'TrainingTimesLastYear'
]


In [None]:
# Multicollinearity analysis of the shortlisted numerical features:
# annotated heatmap of their pairwise correlations.

shortlist_corr = empdata[s_numerical_features].corr()

fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(shortlist_corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features", fontsize=14)
plt.show()


**Final Selected Features**

In [None]:
# final selected features (updated)
# These two lists define the model inputs: the encoding cell one-hot/label
# encodes f_categorical_features, and the outlier filter iterates over
# f_numerical_features.

f_numerical_features = [
    'Age',
    'MonthlyIncome',
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsWithCurrManager'
]

f_categorical_features = [
    'OverTime',
    'JobRole',
    'JobLevel',
    'EnvironmentSatisfaction',
    'WorkLifeBalance'
]


**Feature Engineering**

In [None]:
# Categorical encoding (label + one-hot), followed by the final column selection.
# NOTE(review): this cell reassigns `empdata`, so it cannot be re-run without
# first re-running the cells that load the data.

binary_encoded_cols = []

for col in f_categorical_features:
    n_levels = empdata[col].nunique()

    if n_levels == 2:
        # Two levels -> integer codes 0/1.
        # NOTE(review): factorize numbers levels by first appearance, so which
        # label becomes 0 depends on row order — confirm this matches whatever
        # consumes the saved model.
        codes, _levels = pd.factorize(empdata[col])
        empdata[col] = codes
        binary_encoded_cols.append(col)
    elif n_levels > 2 and n_levels < 10:
        # Three to nine levels -> one indicator column per level
        one_hot = pd.get_dummies(
            empdata[col], prefix=col, drop_first=False, dtype=int
        )
        empdata = pd.concat([empdata.drop(columns=[col]), one_hot], axis=1)
    # Columns with one level, or ten or more, are left untouched.

# Indicator columns are recognised by their "<feature>_" prefix
one_hot_prefixes = tuple(f + "_" for f in f_categorical_features)
encoded_cols = [c for c in empdata.columns if c.startswith(one_hot_prefixes)]

# Final dataset: numerical + binary categorical + one-hot + target
final_columns = (
    f_numerical_features + binary_encoded_cols + encoded_cols + ['Attrition']
)
empdata = empdata[final_columns]


**Outlier Treatment**

In [None]:
# Outlier treatment: keep only rows that lie within the 1.5 * IQR fences of
# every final numerical feature.

df_out = empdata.copy()

# Starts all-True; a row is dropped as soon as one feature is out of range.
keep_rows = pd.Series(True, index=df_out.index)

for col in f_numerical_features:
    q1 = df_out[col].quantile(0.25)
    q3 = df_out[col].quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    # between() is inclusive on both ends, matching >= lower and <= upper
    keep_rows &= df_out[col].between(lower_fence, upper_fence)

df_out = df_out.loc[keep_rows].reset_index(drop=True)



**SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Encode any remaining string columns; with drop_first=True the string target
# becomes the single indicator column 'Attrition_Yes'.
df_encoded = pd.get_dummies(df_out, drop_first=True)

# Target column AFTER encoding
target_col = "Attrition_Yes"
x = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]

# Oversample the minority class with SMOTE.
# Fix: the feature matrix is named `x`; the previous call passed the undefined
# name `X`, which raises NameError on a fresh kernel.
# NOTE(review): resampling happens BEFORE the train/test split in the next
# cell, so synthetic points derived from test rows can end up in training
# (and vice versa). Test metrics are therefore optimistic; consider splitting
# first and applying SMOTE to the training portion only.
sm = SMOTE(random_state=0)
x_sm, y_sm = sm.fit_resample(x, y)

print("Before SMOTE:\n", y.value_counts())
print("\nAfter SMOTE:\n", y_sm.value_counts())


**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the resampled data, stratified on the target so both
# parts keep the same class ratio; fixed seed for reproducibility.
split_parts = train_test_split(
    x_sm,
    y_sm,
    test_size=0.20,
    stratify=y_sm,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

**Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
std = StandardScaler().fit(x_train)

# Wrap the scaled arrays back into DataFrames so column names and row indices
# are preserved.
x_train_scaled = pd.DataFrame(
    std.transform(x_train),
    index=x_train.index,
    columns=x_train.columns,
)

x_test_scaled = pd.DataFrame(
    std.transform(x_test),
    index=x_test.index,
    columns=x_train.columns,
)


**Score Function**

In [None]:
# classification summary (focused metrics)

from sklearn.metrics import (
    f1_score, recall_score,
    confusion_matrix, classification_report, roc_auc_score
)
import pandas as pd

# Leaderboard filled by classification_summary(): one row per evaluated model.
# NOTE: re-running a model cell appends another row for the same model.
results_df = pd.DataFrame(columns=['model', 'recall_macro'])


def classification_summary(model, y_test, pred, pred_prob):
    """Print evaluation metrics for a fitted binary classifier and record it.

    Parameters
    ----------
    model : fitted estimator
        Only its class name is used, as the label in the printout and in
        the leaderboard.
    y_test : array-like
        True labels of the evaluation set.
    pred : array-like
        Predicted labels for the evaluation set.
    pred_prob : 2-D array
        Predicted class probabilities; column 1 is used as the score for
        ROC-AUC.

    Side effects
    ------------
    Appends ``[model class name, macro-averaged recall]`` to the module-level
    ``results_df``. The stored value now matches the ``recall_macro`` column
    name (macro F1 was stored there before, so the leaderboard sort ranked
    models by a different metric than its label claimed).
    """
    model_name = model.__class__.__name__

    print("\n--- model evaluation ---")
    print(f"model: {model_name}")

    # key metrics
    print("recall (yes):", round(recall_score(y_test, pred), 3))
    print("f1 score (macro):", round(f1_score(y_test, pred, average='macro'), 3))
    print("auc-roc:", round(roc_auc_score(y_test, pred_prob[:, 1]), 3))

    print("\nconfusion matrix:")
    print(confusion_matrix(y_test, pred))

    print("\nclassification report:")
    print(classification_report(y_test, pred))

    # Record the metric the leaderboard column is actually named after
    results_df.loc[len(results_df)] = [
        model_name,
        recall_score(y_test, pred, average='macro')
    ]


**Multiple Models & Comparison**

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Linear baseline, trained and evaluated on the standardized features.
model = LogisticRegression().fit(x_train_scaled, y_train)

# Hard labels and class probabilities for the hold-out split
pred, pred_prob = (
    model.predict(x_test_scaled),
    model.predict_proba(x_test_scaled),
)

# Evaluation
classification_summary(model, y_test, pred, pred_prob)

In [None]:
# Decision Tree 

from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the unscaled features; seed fixed for repeatability.
model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Hard labels and class probabilities for the hold-out split
pred, pred_prob = model.predict(x_test), model.predict_proba(x_test)

# Evaluation
classification_summary(model, y_test, pred, pred_prob)


In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# 300-tree random forest on the unscaled features, using all CPU cores.
forest_params = dict(n_estimators=300, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**forest_params).fit(x_train, y_train)

# Hard labels and class probabilities for the hold-out split
pred, pred_prob = model.predict(x_test), model.predict_proba(x_test)

# Evaluation
classification_summary(model, y_test, pred, pred_prob)


In [None]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with library defaults; seed fixed for repeatability.
model = GradientBoostingClassifier(random_state=42).fit(x_train, y_train)

# Hard labels and class probabilities for the hold-out split
pred, pred_prob = model.predict(x_test), model.predict_proba(x_test)

# Evaluation
classification_summary(model, y_test, pred, pred_prob)

In [None]:
# XGBoost

from xgboost import XGBClassifier

# XGBoost binary classifier; hyper-parameters collected in one place.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)

model = XGBClassifier(**xgb_params)
model.fit(x_train, y_train)

# Hard labels and class probabilities for the hold-out split
pred, pred_prob = model.predict(x_test), model.predict_proba(x_test)

# Evaluation
classification_summary(model, y_test, pred, pred_prob)

In [None]:
# Leaderboard: best-scoring model first, ranked by the value that
# classification_summary() stored in the 'recall_macro' column.
results_df.sort_values(by='recall_macro', ascending=False)

**Save to Pickle File**

In [None]:
# Model chosen for deployment, configured like the random forest evaluated above.
# NOTE(review): the choice is hard-coded rather than derived from results_df —
# confirm it still matches the top row of the leaderboard.
best_model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)

In [None]:
# Fit on the unscaled training split (the same inputs the forest was evaluated with).
best_model.fit(x_train, y_train)

In [None]:
# save the best trained model as a pickle file

import pickle

# Destination of the serialized model.
# NOTE(review): machine-specific absolute path that points inside a .venv
# directory — confirm this is the intended location.
file_path = r"G:\DS_Projects\.venv\Employee_Attrition\best_model.pkl"

# Binary write; the context manager closes the file even if dump() fails.
with open(file_path, mode="wb") as model_file:
    pickle.dump(best_model, model_file)