In [1]:
# IMPORTING DATA
import pandas as pd
import numpy as np

In [6]:
# Load the data from the CSV file
data = pd.read_csv('employee.csv')

In [7]:
# Print the first few rows of the dataset
print("First 5 rows of the dataset:")
print(data.head())

First 5 rows of the dataset:
   Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0  Bachelors         2017  Bangalore            3   34    Male          No   
1  Bachelors         2013       Pune            1   28  Female          No   
2  Bachelors         2014  New Delhi            3   38  Female          No   
3    Masters         2016  Bangalore            3   27    Male          No   
4    Masters         2017       Pune            3   24    Male         Yes   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          0           0  
1                          3           1  
2                          2           0  
3                          5           1  
4                          2           1  


In [8]:
# Check for missing values
print("\nMissing values in each column:")
print(data.isnull().sum())


Missing values in each column:
Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64


In [9]:
# Splitting features and target variable
X = data.drop(columns=['LeaveOrNot'])
y = data['LeaveOrNot']

In [10]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=[object]).columns.tolist()
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()

print("\nCategorical Features:", categorical_features)
print("Numerical Features:", numerical_features)


Categorical Features: ['Education', 'City', 'Gender', 'EverBenched']
Numerical Features: ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']


In [11]:
# EXPORTING FEATURE INPUT METADATA
def summarize_cat(data, categorical_features):
    results = []
    for column in data[categorical_features]:
        members = data[column].unique().tolist()
        results.append([column, members])
    return pd.DataFrame(results, columns=['Column Name', 'Members'])

In [12]:
# Create a dictionary for feature metadata
my_feature_dict = {
    'CATEGORICAL': summarize_cat(X, categorical_features).to_dict(),
    'NUMERICAL': {'Column Name': numerical_features}
}

In [13]:
# Save the schema to a .pkl file
import pickle
with open('my_feature_dict.pkl', 'wb') as fp:
    pickle.dump(my_feature_dict, fp)
print("\nFeature schema saved successfully to file.")


Feature schema saved successfully to file.


In [14]:
# SPLITTING DATA FOR TRAIN / TEST
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTrain Data Distribution:")
print(y_train.value_counts(normalize=True))
print("\nTest Data Distribution:")
print(y_test.value_counts(normalize=True))


Train Data Distribution:
LeaveOrNot
0    0.656368
1    0.343632
Name: proportion, dtype: float64

Test Data Distribution:
LeaveOrNot
0    0.655209
1    0.344791
Name: proportion, dtype: float64


In [15]:
# CREATING THE PIPELINE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Preprocessing for categorical features
pipeline_cat = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical features
pipeline_num = Pipeline(steps=[
    ('scale_data', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean'))
])

In [17]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', pipeline_cat, categorical_features),
        ('num', pipeline_num, numerical_features)
    ]
)

In [18]:
# Final pipeline with classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [19]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [20]:
# Evaluate the model on training data
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_train_pred = pipeline.predict(X_train)
print("\nTraining Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train, y_train_pred))


Training Accuracy: 0.9279957012358947

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      2443
           1       0.95      0.84      0.89      1279

    accuracy                           0.93      3722
   macro avg       0.93      0.91      0.92      3722
weighted avg       0.93      0.93      0.93      3722


Confusion Matrix:
 [[2386   57]
 [ 211 1068]]


In [21]:
# Save the trained pipeline to a .pkl file
import dill
with open('pipeline.pkl', 'wb') as file:
    dill.dump(pipeline, file)
print("\nPipeline saved successfully to file.")


Pipeline saved successfully to file.
