## Cleaning and treating categorical variables

In [1]:
import numpy as np
from pandas import DataFrame

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Small hand-made roster used by every cell below: a text column ('names'),
# two integer columns ('age', 'rank') and a 'gender' column in which three
# of the seven entries are missing (np.nan).
data = {
    'names': ['steve', 'john', 'richard', 'sarah', 'randy', 'micheal', 'julie'],
    'age': [20, 22, 20, 21, 24, 23, 22],
    'gender': ['Male', 'Male', np.nan, 'Female', np.nan, 'Male', np.nan],
    'rank': [2, 1, 4, 5, 3, 7, 6],
}
df = DataFrame(data)
df

Unnamed: 0,names,age,gender,rank
0,steve,20,Male,2
1,john,22,Male,1
2,richard,20,,4
3,sarah,21,Female,5
4,randy,24,,3
5,micheal,23,Male,7
6,julie,22,,6


In [5]:
# Remove the 'gender' column, which is NaN for 3 of the 7 rows.
# This cell rebinds `df` to its own result, so a plain drop would raise
# KeyError on a second run (the column is already gone). errors='ignore'
# makes the cell safe to re-run and leaves the first-run result unchanged.
df = df.drop(columns='gender', errors='ignore')
df

Unnamed: 0,names,age,rank
0,steve,20,2
1,john,22,1
2,richard,20,4
3,sarah,21,5
4,randy,24,3
5,micheal,23,7
6,julie,22,6


### Label Encoding

In [6]:
# Create the encoder and fit it on the 'names' column in one step; fit()
# returns the encoder itself, so `label_encoder` is the fitted object.
# It is kept as the last expression so the cell still displays the encoder.
label_encoder = LabelEncoder().fit(df['names'])
label_encoder

In [7]:
# Replace every name with its integer code. Codes follow the alphabetical
# order of the fitted names (john -> 0, julie -> 1, ..., steve -> 6), not the
# order in which the rows appear in `df`.
label_encoder_names = label_encoder.transform(df['names'])
label_encoder_names

array([6, 0, 4, 5, 3, 2, 1])

### One-Hot Encoding

In [8]:
# sparse_output=False makes transform() return a dense NumPy array instead of
# a SciPy sparse matrix, so the result can be passed straight to DataFrame.
# NOTE: this keyword requires scikit-learn >= 1.2 (older releases call it `sparse`).
onehot_encoder = OneHotEncoder(sparse_output=False)


In [9]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series;
# OneHotEncoder expects 2-D input with one column per feature.
onehot_encoder.fit(df[['names']])

In [10]:
# Encode the 'names' column: one output row per input row and one 0/1 column
# per distinct name learned during fit.
onehot_encoder_names = onehot_encoder.transform(df[['names']])

In [11]:
# `categories_` is a list holding one array of labels per encoded feature.
# Only 'names' was encoded, so element 0 is the flat list of column labels.
# Passing the whole list as `columns` would make pandas build a one-level
# MultiIndex, turning every label into a tuple such as ('john',).
onehot_encoded_df = DataFrame(
    onehot_encoder_names,
    columns=onehot_encoder.categories_[0],
)
# Append the original names as a Series (single brackets) so each one-hot row
# can be checked against the name it encodes.
onehot_encoded_df['names'] = df['names']
onehot_encoded_df

Unnamed: 0,john,julie,micheal,randy,richard,sarah,steve,names
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,steve
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,john
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,richard
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,sarah
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,randy
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,micheal
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,julie
