In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Read the employee dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Employee.csv')
df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

In [4]:
# Column dtypes: object columns are treated as categorical features later on,
# the remaining ones as numerical.
df.dtypes

Education                    object
JoiningYear                   int64
City                         object
PaymentTier                   int64
Age                           int64
Gender                       object
EverBenched                  object
ExperienceInCurrentDomain     int64
LeaveOrNot                    int64
dtype: object

In [5]:
# Target is the 'LeaveOrNot' column; every other column is a feature.
y = df['LeaveOrNot']
X = df.drop(columns=[y.name])

In [6]:
# Partition the feature names by dtype: object columns are categorical,
# everything else is numerical.
numerical_cols = list(X.select_dtypes(exclude='object').columns)
categorical_cols = list(X.select_dtypes(include='object').columns)

# Show both lists (numerical first, then categorical).
print(numerical_cols)
print(categorical_cols)

['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']
['Education', 'City', 'Gender', 'EverBenched']


In [7]:
# Show how often each category occurs, one table per categorical feature.
for column_name in categorical_cols:
    category_counts = X[column_name].value_counts()
    display(category_counts)

Bachelors    3601
Masters       873
PHD           179
Name: Education, dtype: int64

Bangalore    2228
Pune         1268
New Delhi    1157
Name: City, dtype: int64

Male      2778
Female    1875
Name: Gender, dtype: int64

No     4175
Yes     478
Name: EverBenched, dtype: int64

In [8]:
# Preprocessing sub-pipelines, one per column type.
# The dataset shown here has no missing values, but the imputers are kept so
# the same pipeline still works if nulls appear in real data.

# Numerical columns: fill gaps with the column mean, then apply StandardScaler.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Unknown categories are ignored at transform time, and two-level
# columns are reduced to a single indicator column (drop='if_binary').
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore')),
])

In [9]:
# Combine both sub-pipelines into a single preprocessor: the 'num' pipeline is
# applied to numerical_cols and the 'cat' pipeline to categorical_cols.
#
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3),
# ColumnTransformer returns a scipy sparse matrix whenever the stacked output
# (which includes the sparse one-hot block) has a density below the threshold,
# and the later pd.DataFrame(..., columns=...) cells expect a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    sparse_threshold=0,
)

In [10]:
# Single-step pipeline wrapping the preprocessor.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [11]:
# Fit the preprocessing pipeline on the training features and show the
# transformed array. The next cell reads the fitted one-hot encoder from this
# pipeline, so this cell has to run first.
pipeline.fit_transform(X_train)

array([[ 1.04683008,  0.5405808 , -0.49509546, ...,  0.        ,
         1.        ,  0.        ],
       [-0.55922992,  0.5405808 , -0.91064294, ...,  1.        ,
         0.        ,  0.        ],
       [-0.02387659,  0.5405808 , -0.28732172, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.51147675, -1.24472687, -0.91064294, ...,  0.        ,
         1.        ,  0.        ],
       [-0.02387659, -1.24472687, -0.28732172, ...,  1.        ,
         0.        ,  1.        ],
       [-0.02387659, -1.24472687, -0.49509546, ...,  1.        ,
         0.        ,  0.        ]])

In [12]:
# Names of the one-hot output columns, read from the fitted encoder inside the
# 'cat' branch of the preprocessor.
cat_cols = list(
    pipeline
    .named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_cols)
)

In [13]:
# Fit on the training features and wrap the result in a DataFrame that keeps
# the original row labels; columns are the numerical names followed by the
# one-hot names.
X_train_scaled = pd.DataFrame(
    pipeline.fit_transform(X_train),
    index=X_train.index,
    columns=numerical_cols + cat_cols,
)

# Apply the already-fitted pipeline to the test features (no refit) and wrap
# the result the same way.
X_test_scaled = pd.DataFrame(
    pipeline.transform(X_test),
    index=X_test.index,
    columns=numerical_cols + cat_cols,
)

In [14]:
# Work on copies so the scaled feature frames stay untouched.
train_data, test_data = X_train_scaled.copy(), X_test_scaled.copy()

In [15]:
# First five raw (untransformed) training rows.
X_train.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
1300,Bachelors,2017,Bangalore,3,27,Male,No,5
622,Bachelors,2014,Pune,3,25,Female,No,3
918,Bachelors,2015,Bangalore,3,28,Female,No,1
4402,Bachelors,2014,Pune,3,34,Male,No,5
3306,Bachelors,2013,Bangalore,3,31,Male,No,2


In [16]:
# First five training labels, to compare against the rows shown above.
y_train.head(5)

1300    0
622     1
918     0
4402    0
3306    0
Name: LeaveOrNot, dtype: int64

In [17]:
# First five raw test rows followed by their labels.
display(X_test.head(), y_test.head())

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
2904,Masters,2016,New Delhi,3,29,Female,No,1
3284,Masters,2018,New Delhi,3,38,Male,No,2
3230,Bachelors,2015,Bangalore,3,35,Male,No,0
3023,Bachelors,2014,Bangalore,3,36,Male,No,1
179,Bachelors,2012,Bangalore,3,27,Male,No,5


2904    1
3284    1
3230    0
3023    0
179     1
Name: LeaveOrNot, dtype: int64

In [18]:
# Attach the labels as a 'Target' column; the assignment aligns on the row
# index, which both frames share with their label Series.
train_data['Target'], test_data['Target'] = y_train, y_test

In [19]:
# Preview the processed training frame, including the new 'Target' column.
train_data.head(5)

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes,Target
1300,1.04683,0.540581,-0.495095,1.352505,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
622,-0.55923,0.540581,-0.910643,0.057912,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
918,-0.023877,0.540581,-0.287322,-1.236681,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4402,-0.55923,0.540581,0.959321,1.352505,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
3306,-1.094583,0.540581,0.336,-0.589384,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [20]:
# Check that no column of the training frame (features or 'Target') has missing values.
train_data.isna().sum()

JoiningYear                  0
PaymentTier                  0
Age                          0
ExperienceInCurrentDomain    0
Education_Bachelors          0
Education_Masters            0
Education_PHD                0
City_Bangalore               0
City_New Delhi               0
City_Pune                    0
Gender_Male                  0
EverBenched_Yes              0
Target                       0
dtype: int64

In [21]:
# Preview the processed test frame, including the new 'Target' column.
test_data.head(5)

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Bachelors,Education_Masters,Education_PHD,City_Bangalore,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes,Target
2904,0.511477,0.540581,-0.079548,-1.236681,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3284,1.582183,0.540581,1.790416,-0.589384,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3230,-0.023877,0.540581,1.167094,-1.883977,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3023,-0.55923,0.540581,1.374868,-1.236681,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
179,-1.629937,0.540581,-0.495095,1.352505,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1


In [22]:
# Check that no column of the test frame (features or 'Target') has missing values.
test_data.isna().sum()

JoiningYear                  0
PaymentTier                  0
Age                          0
ExperienceInCurrentDomain    0
Education_Bachelors          0
Education_Masters            0
Education_PHD                0
City_Bangalore               0
City_New Delhi               0
City_Pune                    0
Gender_Male                  0
EverBenched_Yes              0
Target                       0
dtype: int64

In [23]:
# Optional export (disabled): uncomment the two lines below to write the
# processed train/test frames to CSV files without the row index.
# train_data.to_csv('Train_data.csv',index=False)
# test_data.to_csv('Test_data.csv',index=False)