In [1]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

In [2]:
# Read the raw train and test splits from the shared data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [3]:
# Preview the raw training frame; pandas abbreviates the display to the first
# and last rows, which is enough to see the column set and value formats.
train

Unnamed: 0,education,joiningyear,city,paymenttier,age,gender,everbenched,experienceincurrentdomain,leaveornot
0,Bachelors,2012,New Delhi,3,38,Female,No,1,0
1,Bachelors,2018,Bangalore,3,26,Male,No,4,1
2,Bachelors,2016,Pune,3,27,Male,No,5,0
3,Bachelors,2016,Bangalore,3,36,Male,No,4,0
4,Bachelors,2017,Bangalore,3,27,Male,Yes,5,0
...,...,...,...,...,...,...,...,...,...
3717,Masters,2017,New Delhi,3,27,Male,No,5,1
3718,Bachelors,2016,Bangalore,3,26,Male,Yes,4,0
3719,Masters,2015,Pune,3,26,Male,No,4,1
3720,Bachelors,2014,Bangalore,3,26,Male,No,4,0


In [4]:
# Remove the two columns that are deliberately kept out of the feature set.
# Gotcha: this rebinds `train`, so running the cell a second time raises a
# KeyError because the columns are already gone.
# NOTE(review): `test` is not given the same drop; the later `ct.transform(test)`
# appears to rely on the transformer selecting columns by name - confirm.
train = train.drop(columns=["gender", "age"])
train

Unnamed: 0,education,joiningyear,city,paymenttier,everbenched,experienceincurrentdomain,leaveornot
0,Bachelors,2012,New Delhi,3,No,1,0
1,Bachelors,2018,Bangalore,3,No,4,1
2,Bachelors,2016,Pune,3,No,5,0
3,Bachelors,2016,Bangalore,3,No,4,0
4,Bachelors,2017,Bangalore,3,Yes,5,0
...,...,...,...,...,...,...,...
3717,Masters,2017,New Delhi,3,No,5,1
3718,Bachelors,2016,Bangalore,3,Yes,4,0
3719,Masters,2015,Pune,3,No,4,1
3720,Bachelors,2014,Bangalore,3,No,4,0


In [5]:
# List the distinct education values; they are hard-coded, in rank order, as
# the first `categories` entry of the OrdinalEncoder built further down.
train["education"].unique()

array(['Bachelors', 'Masters', 'PHD'], dtype=object)

In [6]:
# List the distinct joining years (returned in order of first appearance, not
# sorted); the OrdinalEncoder further down hard-codes them in ascending order.
train["joiningyear"].unique()

array([2012, 2018, 2016, 2017, 2013, 2015, 2014])

In [7]:
# List the distinct experience values (order of first appearance); the
# OrdinalEncoder further down hard-codes the same values in ascending order.
train["experienceincurrentdomain"].unique()

array([1, 4, 5, 2, 0, 3, 6, 7])

In [8]:
# Explicit rank order for every ordinally encoded column. OrdinalEncoder pairs
# these lists with its input columns purely by position, so the insertion order
# here must match the column list handed to the ColumnTransformer; the keys are
# labels for the reader only and are not used for column selection.
ordinal_levels = {
    "education": ["Bachelors", "Masters", "PHD"],
    "joiningyear": [2012, 2013, 2014, 2015, 2016, 2017, 2018],
    "paymenttier": [1, 2, 3],
    "experienceincurrentdomain": [0, 1, 2, 3, 4, 5, 6, 7],
    "everbenched": ["No", "Yes"],
}
ordi = OrdinalEncoder(categories=list(ordinal_levels.values()))

# NOTE(review): `minmax` is not referenced by any cell visible here.
minmax = MinMaxScaler()

# Default one-hot encoder; it is applied to the nominal `city` column.
onehot = OneHotEncoder()

In [9]:
# Route each column to its encoder:
#   * "ord"    - ordinal encoding; this column list must stay in the same order
#                as the `categories` lists given to `ordi`, which are matched by
#                position.
#   * "onehot" - one indicator column per city.
# Every column not listed (here the `leaveornot` target) passes through unchanged.
# sparse_threshold=0 makes the transformer always return a dense array. With the
# default threshold the result switches to a sparse matrix once the one-hot
# block is sparse enough, and the pd.DataFrame(...) wrapping done after
# transform needs a dense array.
ct = ColumnTransformer(
    [
        ("ord", ordi, ["education", "joiningyear", "paymenttier", "experienceincurrentdomain", "everbenched"]),
        ("onehot", onehot, ["city"]),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

In [10]:
# Fit the encoders on the training frame only; the same fitted transformer is
# reused for the test frame so both splits share one encoding.
ct.fit(train)

In [11]:
# Encode the training frame with the fitted transformer.
train_use = ct.transform(train)

# The transformer labels its output columns "<step>__<feature>" (for example
# "onehot__city_Pune"). Keep only the feature part. Splitting on the first "__"
# only preserves feature names that themselves contain a double underscore, and
# taking the last piece leaves a name without any prefix untouched instead of
# raising IndexError.
new_features = [name.split("__", 1)[-1] for name in ct.get_feature_names_out()]

# Rebuild a labelled frame; `new_features` is reused later for the test split.
train_df = pd.DataFrame(train_use, columns=new_features)
train_df

Unnamed: 0,education,joiningyear,paymenttier,experienceincurrentdomain,everbenched,city_Bangalore,city_New Delhi,city_Pune,leaveornot
0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,6.0,2.0,4.0,0.0,1.0,0.0,0.0,1.0
2,0.0,4.0,2.0,5.0,0.0,0.0,0.0,1.0,0.0
3,0.0,4.0,2.0,4.0,0.0,1.0,0.0,0.0,0.0
4,0.0,5.0,2.0,5.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
3717,1.0,5.0,2.0,5.0,0.0,0.0,1.0,0.0,1.0
3718,0.0,4.0,2.0,4.0,1.0,1.0,0.0,0.0,0.0
3719,1.0,3.0,2.0,4.0,0.0,0.0,0.0,1.0,1.0
3720,0.0,2.0,2.0,4.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Encode the test split with the transformer fitted on train (no refit), then
# label it with the column names derived for the training frame so the two
# frames share one schema.
test_use = ct.transform(test)
test_df = pd.DataFrame(data=test_use, columns=new_features)

In [13]:
# Preview the encoded test frame to confirm it has the same columns as train_df.
test_df

Unnamed: 0,education,joiningyear,paymenttier,experienceincurrentdomain,everbenched,city_Bangalore,city_New Delhi,city_Pune,leaveornot
0,1.0,5.0,1.0,5.0,0.0,0.0,1.0,0.0,1.0
1,0.0,2.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0
3,0.0,3.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,2.0,5.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
926,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0
927,0.0,0.0,2.0,5.0,0.0,1.0,0.0,0.0,1.0
928,1.0,2.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0
929,0.0,2.0,2.0,5.0,0.0,0.0,0.0,1.0,1.0


In [14]:
# Persist both engineered frames next to the raw data. index=False keeps the
# row index out of the files, so they contain the feature columns only.
for frame, out_path in (
    (train_df, "../data/train_fe.csv"),
    (test_df, "../data/test_fe.csv"),
):
    frame.to_csv(out_path, index=False)