Pipeline Without PCA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Custom transformer for random imputation
class RandomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        self.values = {}
        for col in self.columns:
            self.values[col] = X[col].dropna().values
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            non_missing_values = self.values[col]
            X[col] = X[col].apply(lambda x: np.random.choice(non_missing_values) if pd.isnull(x) else x)
        return X

In [3]:

# Custom transformer for outlier detection and capping using IQR
class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        self.bounds = {}
        for col in self.columns:
            Q1 = X[col].quantile(0.25)  # 25th percentile
            Q3 = X[col].quantile(0.75)  # 75th percentile
            IQR = Q3 - Q1  # Interquartile Range
            lower_bound = Q1 - 1.5 * IQR  # Lower bound for outliers
            upper_bound = Q3 + 1.5 * IQR  # Upper bound for outliers
            self.bounds[col] = (lower_bound, upper_bound)
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            lower_bound, upper_bound = self.bounds[col]
            # Cap outliers to the lower and upper bounds
            X[col] = np.clip(X[col], lower_bound, upper_bound)
        return X

In [4]:
# Load data
df = pd.read_csv('HR-Employee-Attrition_with_missing_values.csv')
y = df['Attrition'].map({'Yes': 1, 'No': 0})
X = df.drop(columns=['Attrition', 'Over18', 'EmployeeCount', 'StandardHours'])

In [5]:
# Define columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
nominal_cols = ['Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus']
ordinal_cols = ['BusinessTravel']
ordinal_categories = [['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']]

In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X_train

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,,Travel_Rarely,350,Research & Development,21.0,2,Technical Degree,1551,3.0,Male,...,3,2,3,2,3.0,3,1.0,1,0.0,0
727,18.0,Non-Travel,287,Research & Development,,2,Life Sciences,1012,,Male,...,3,4,0,0,,3,0.0,0,0.0,0
254,29.0,Travel_Rarely,1247,Sales,20.0,2,Marketing,349,,Male,...,3,4,1,10,,3,,2,0.0,2
1175,,Travel_Rarely,492,Research & Development,12.0,3,Medical,1654,,Male,...,4,3,0,7,3.0,3,5.0,4,1.0,0
1341,31.0,Travel_Rarely,311,Research & Development,20.0,3,Life Sciences,1881,,Male,...,3,1,1,10,,3,10.0,8,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,35.0,Travel_Rarely,750,Research & Development,28.0,3,Life Sciences,1596,,Male,...,3,4,2,10,3.0,2,10.0,9,6.0,8
1294,41.0,Travel_Rarely,447,Research & Development,,3,Life Sciences,1814,,Male,...,3,1,0,11,,1,3.0,2,1.0,2
860,22.0,Travel_Frequently,1256,Research & Development,3.0,4,Life Sciences,1203,3.0,Male,...,3,2,1,1,5.0,3,,0,,0
1459,29.0,Travel_Rarely,1378,Research & Development,,2,Other,2053,4.0,Male,...,3,1,1,10,,3,4.0,3,,3


In [8]:
# Preprocessing pipelines
numerical_pipeline = Pipeline([
    ('random_imputer', RandomImputer(columns=numerical_cols)),
    ('outlier_capper', OutlierCapper(columns=numerical_cols)),  # Add Outlier Capping using IQR
    ('scaler', StandardScaler())
])

nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories))
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_cols),
    ('nominal', nominal_pipeline, nominal_cols),
    ('ordinal', ordinal_pipeline, ordinal_cols)
])

In [9]:
# Full pipeline with classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [10]:
# Check missing values before preprocessing
print("Missing Values Before Preprocessing:")
print(X.isnull().sum())

Missing Values Before Preprocessing:
Age                         571
BusinessTravel                0
DailyRate                     0
Department                    0
DistanceFromHome            571
Education                     0
EducationField                0
EmployeeNumber                0
EnvironmentSatisfaction     571
Gender                        0
HourlyRate                    0
JobInvolvement                0
JobLevel                    571
JobRole                       0
JobSatisfaction               0
MaritalStatus               571
MonthlyIncome                 0
MonthlyRate                 571
NumCompaniesWorked            0
OverTime                      0
PercentSalaryHike             0
PerformanceRating             0
RelationshipSatisfaction      0
StockOptionLevel              0
TotalWorkingYears             0
TrainingTimesLastYear       571
WorkLifeBalance               0
YearsAtCompany              571
YearsInCurrentRole            0
YearsSinceLastPromotion     571
Yea

In [11]:
# Function to count outliers
def count_outliers(df, columns):
    outliers = {}
    for col in columns:
        Q1 = df[col].quantile(0.25)  # 25th percentile
        Q3 = df[col].quantile(0.75)  # 75th percentile
        IQR = Q3 - Q1  # Interquartile Range
        lower_bound = Q1 - 1.5 * IQR  # Lower bound for outliers
        upper_bound = Q3 + 1.5 * IQR  # Upper bound for outliers
        outliers[col] = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
    return outliers

In [12]:
# Count and display outliers before preprocessing
print("Outliers Before Preprocessing:")
outliers_before = count_outliers(X, numerical_cols)
for col, count in outliers_before.items():
    print(f"{col}: {count} outliers")

Outliers Before Preprocessing:
Age: 0 outliers
DailyRate: 0 outliers
DistanceFromHome: 0 outliers
Education: 0 outliers
EmployeeNumber: 0 outliers
EnvironmentSatisfaction: 0 outliers
HourlyRate: 0 outliers
JobInvolvement: 0 outliers
JobLevel: 0 outliers
JobSatisfaction: 0 outliers
MonthlyIncome: 114 outliers
MonthlyRate: 0 outliers
NumCompaniesWorked: 52 outliers
PercentSalaryHike: 0 outliers
PerformanceRating: 226 outliers
RelationshipSatisfaction: 0 outliers
StockOptionLevel: 85 outliers
TotalWorkingYears: 63 outliers
TrainingTimesLastYear: 145 outliers
WorkLifeBalance: 0 outliers
YearsAtCompany: 31 outliers
YearsInCurrentRole: 21 outliers
YearsSinceLastPromotion: 70 outliers
YearsWithCurrManager: 14 outliers


In [13]:
# Fit the pipeline
pipeline.fit(X_train, y_train)

In [14]:
# Extract  feature names
nominal_feature_names = pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(nominal_cols)
final_feature_names = list(numerical_cols) + list(nominal_feature_names) + ordinal_cols

In [15]:
# Transform train and test sets
X_train_processed = pipeline.named_steps['preprocessor'].transform(X_train)
X_test_processed = pipeline.named_steps['preprocessor'].transform(X_test)

In [16]:
# Convert processed data to DataFrame
X_train_processed_df = pd.DataFrame(X_train_processed, columns=final_feature_names)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=final_feature_names)

In [17]:
# Check missing values after preprocessing
print("Missing Values After Preprocessing:")
print(X_train_processed_df.isnull().sum())

Missing Values After Preprocessing:
Age                                  0
DailyRate                            0
DistanceFromHome                     0
Education                            0
EmployeeNumber                       0
EnvironmentSatisfaction              0
HourlyRate                           0
JobInvolvement                       0
JobLevel                             0
JobSatisfaction                      0
MonthlyIncome                        0
MonthlyRate                          0
NumCompaniesWorked                   0
PercentSalaryHike                    0
PerformanceRating                    0
RelationshipSatisfaction             0
StockOptionLevel                     0
TotalWorkingYears                    0
TrainingTimesLastYear                0
WorkLifeBalance                      0
YearsAtCompany                       0
YearsInCurrentRole                   0
YearsSinceLastPromotion              0
YearsWithCurrManager                 0
Department_Human Resources  

In [18]:
# Count and display outliers after preprocessing
print("Outliers After Preprocessing:")
outliers_after = count_outliers(X_train_processed_df, numerical_cols)
for col, count in outliers_after.items():
    print(f"{col}: {count} outliers")

Outliers After Preprocessing:
Age: 0 outliers
DailyRate: 0 outliers
DistanceFromHome: 0 outliers
Education: 0 outliers
EmployeeNumber: 0 outliers
EnvironmentSatisfaction: 0 outliers
HourlyRate: 0 outliers
JobInvolvement: 0 outliers
JobLevel: 0 outliers
JobSatisfaction: 0 outliers
MonthlyIncome: 0 outliers
MonthlyRate: 0 outliers
NumCompaniesWorked: 0 outliers
PercentSalaryHike: 0 outliers
PerformanceRating: 0 outliers
RelationshipSatisfaction: 0 outliers
StockOptionLevel: 0 outliers
TotalWorkingYears: 0 outliers
TrainingTimesLastYear: 0 outliers
WorkLifeBalance: 0 outliers
YearsAtCompany: 0 outliers
YearsInCurrentRole: 0 outliers
YearsSinceLastPromotion: 163 outliers
YearsWithCurrManager: 0 outliers


In [19]:

# Train and evaluate the classifier
y_pred = pipeline.named_steps['classifier'].predict(X_test_processed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.87


In [20]:
X_train_processed_df

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Missing,MaritalStatus_Single,BusinessTravel
0,0.022502,-1.108139,1.336791,-0.863356,0.860666,0.263944,-0.472832,-1.012340,2.080133,-1.582336,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-2.112420,-1.263481,-1.064226,-0.863356,-0.026811,1.187746,0.309374,0.389912,0.029427,1.152834,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.876413,1.103647,1.216740,-0.863356,-1.118456,-0.659859,-1.059487,0.389912,0.029427,1.152834,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-2.112420,-0.758001,0.256333,0.099933,1.030258,1.187746,-0.032841,0.389912,0.029427,-0.670613,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,-0.651684,-1.204303,1.216740,0.099933,1.404019,0.263944,1.091580,0.389912,0.029427,0.241111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,-0.202227,-0.121835,2.177147,0.099933,0.934760,0.263944,-1.010599,1.792164,0.029427,0.241111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1172,0.471959,-0.868960,-0.223870,0.099933,1.293702,-1.583661,0.896028,1.792164,0.029427,-0.670613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1173,-1.662963,1.125839,-0.824125,1.063222,0.287676,0.263944,-0.912823,-1.012340,-0.995927,1.152834,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
1174,-0.876413,1.426662,2.177147,-0.863356,1.687221,1.187746,-1.010599,-1.012340,0.029427,-0.670613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
