# Categorical Encoding

## Import Libraries

In [16]:
import pandas as pd

## Import Data

In [17]:
# Load the training set; the path is relative to the notebook's working directory.
train = pd.read_csv('train_classification.csv')

In [18]:
# Show each column's dtype and non-null count to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18359 entries, 0 to 18358
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             18359 non-null  int64  
 1   city                    18359 non-null  object 
 2   city_development_index  18359 non-null  float64
 3   gender                  14261 non-null  object 
 4   relevent_experience     18359 non-null  object 
 5   enrolled_university     18017 non-null  object 
 6   education_level         17902 non-null  object 
 7   major_discipline        15521 non-null  object 
 8   experience              18300 non-null  object 
 9   company_size            13580 non-null  object 
 10  company_type            13320 non-null  object 
 11  last_new_job            17992 non-null  object 
 12  training_hours          18359 non-null  int64  
 13  target                  18359 non-null  int64  
dtypes: float64(1), int64(3), object(10)
me

## Partition Data into X and y

In [19]:
# Predictors: every column except the label and enrollee_id.
X = train.drop(columns=['target', 'enrollee_id'])
# Label column.
y = train['target']

In [20]:
# Number of missing values in each predictor column.
X.isna().sum()

city                         0
city_development_index       0
gender                    4098
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
dtype: int64

## Get Numerical and Categorical Column Names

In [21]:
# Names of the int64/float64 predictor columns.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
numerical_features

['city_development_index', 'training_hours']

In [22]:
# Every column that is not int64/float64 is treated as categorical.
categorical_features = list(X.select_dtypes(exclude=['int64', 'float64']).columns)
categorical_features

['city',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

In [23]:
# High-cardinality categoricals: more than 10 distinct values.
# nunique() skips NaN by default, and this cell runs before imputation,
# so the later 'missing' label is not part of the count here.
hc_cat_features = [col for col in categorical_features if X[col].nunique() > 10]
hc_cat_features

['city', 'experience']

In [24]:
# Low-cardinality categoricals: at most 10 distinct values.
# nunique() skips NaN by default, and this cell runs before imputation,
# so the later 'missing' label is not part of the count here.
lc_cat_features = [col for col in categorical_features if X[col].nunique() <= 10]
lc_cat_features

['gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_size',
 'company_type',
 'last_new_job']

## Missing Value Treatment

In [25]:
from sklearn.impute import SimpleImputer

# Numeric columns: fill gaps with the column median.
# The imputer returns floats, so integer columns come back as float64.
num_impute = SimpleImputer(strategy='median')
X[numerical_features] = num_impute.fit_transform(X[numerical_features])

# Categorical columns: fill gaps with the literal label 'missing', so that
# missingness becomes its own category.
cat_impute = SimpleImputer(strategy='constant', fill_value='missing')
X[categorical_features] = cat_impute.fit_transform(X[categorical_features])

# High Cardinality Encoding Strategy

## 1/ Check Features

In [26]:
# Peek at the raw labels of the high-cardinality columns before encoding.
X.loc[:, hc_cat_features].head()

Unnamed: 0,city,experience
0,city_149,3
1,city_83,14
2,city_16,6
3,city_64,14
4,city_100,8


In [27]:
# Distinct label count per high-cardinality column (after imputation).
X.loc[:, hc_cat_features].nunique()

city          123
experience     23
dtype: int64

## 2/ Encode

In [28]:
from sklearn.preprocessing import OrdinalEncoder

In [29]:
# Encoder for the high-cardinality columns. Categories that were not seen
# during fit are mapped to -1 at transform time instead of raising.
hc_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

In [30]:
# Fit: learn the category-to-code mapping for each high-cardinality column
hc_encoder.fit(X[hc_cat_features])

OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [31]:
# Transform and replace in the X DataFrame.
# NOTE: X is overwritten in place, so re-running this cell would pass the
# already-encoded numbers (not the original labels) to the encoder.
X[hc_cat_features] = hc_encoder.transform(X[hc_cat_features])

## 3/ Check

In [32]:
# Peek at the encoded values. The codes are category indices, not magnitudes:
# in the recorded run, experience '3' became 13.0 while '14' became 5.0.
X.loc[:, hc_cat_features].head()

Unnamed: 0,city,experience
0,41.0,13.0
1,112.0,5.0
2,48.0,16.0
3,93.0,5.0
4,2.0,18.0


In [33]:
# Summary statistics of the encoded columns; a minimum of -1 would indicate
# that an unknown category was encountered during transform.
X.loc[:, hc_cat_features].describe()

Unnamed: 0,city,experience
count,18359.0,18359.0
mean,43.239229,12.95795
std,36.036755,6.69527
min,0.0,0.0
25%,5.0,7.0
50%,48.0,14.0
75%,64.0,19.0
max,122.0,22.0


# Low Cardinality Encoding

In [34]:
# Inspect X before one-hot encoding: the high-cardinality columns are already
# numeric, while the low-cardinality columns still hold string labels.
X.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,41.0,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,13.0,100-500,Pvt Ltd,1,106.0
1,112.0,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,5.0,<10,Funded Startup,1,69.0
2,48.0,0.91,missing,Has relevent experience,no_enrollment,Graduate,STEM,16.0,50-99,Public Sector,2,4.0
3,93.0,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,5.0,50-99,Pvt Ltd,1,26.0
4,2.0,0.887,missing,No relevent experience,no_enrollment,Masters,STEM,18.0,missing,missing,2,88.0


In [35]:
# Distinct label count per low-cardinality column; the 'missing' label
# introduced by imputation counts as a category here.
X.loc[:, lc_cat_features].nunique()

gender                 4
relevent_experience    2
enrolled_university    4
education_level        6
major_discipline       7
company_size           9
company_type           7
last_new_job           7
dtype: int64

In [36]:
# List the distinct gender labels, in order of first appearance.
pd.unique(X['gender'])

array(['Male', 'missing', 'Female', 'Other'], dtype=object)

In [37]:
# One-hot encode the remaining string-valued (low-cardinality) columns; the
# high-cardinality columns are already numeric and pass through unchanged.
# drop_first=True drops the first level of each feature, leaving k-1
# indicator columns for a feature with k levels.
# dtype is pinned to uint8 because the default indicator dtype changed to
# bool in pandas 2.0; pinning it keeps the result consistent with the
# recorded uint8 columns regardless of the installed pandas version.
X = pd.get_dummies(X, drop_first=True, dtype='uint8')
X.head()

Unnamed: 0,city,city_development_index,experience,training_hours,gender_Male,gender_Other,gender_missing,relevent_experience_No relevent experience,enrolled_university_Part time course,enrolled_university_missing,...,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,company_type_missing,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_missing,last_new_job_never
0,41.0,0.689,13.0,106.0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,112.0,0.923,5.0,69.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,48.0,0.91,16.0,4.0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,93.0,0.666,5.0,26.0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,2.0,0.887,18.0,88.0,0,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [38]:
# Confirm that every column is now numeric and has no missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18359 entries, 0 to 18358
Data columns (total 42 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   city                                        18359 non-null  float64
 1   city_development_index                      18359 non-null  float64
 2   experience                                  18359 non-null  float64
 3   training_hours                              18359 non-null  float64
 4   gender_Male                                 18359 non-null  uint8  
 5   gender_Other                                18359 non-null  uint8  
 6   gender_missing                              18359 non-null  uint8  
 7   relevent_experience_No relevent experience  18359 non-null  uint8  
 8   enrolled_university_Part time course        18359 non-null  uint8  
 9   enrolled_university_missing                 18359 non-null  uint8  
 10  enrolled_u