## Cleaning and treating categorical variables

In [1]:
import numpy as np
from pandas import DataFrame

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
# Toy roster used throughout the notebook. Three of the seven 'gender'
# entries are deliberately missing (NaN) so there is something to clean.
data = {
    'names': ['steve', 'john', 'richard', 'sarah', 'randy', 'micheal', 'julie'],
    'age': [20, 22, 20, 21, 24, 23, 22],
    'gender': ['Male', 'Male', np.nan, 'Female', np.nan, 'Male', np.nan],
    'rank': [2, 1, 4, 5, 3, 7, 6],
}

df = DataFrame(data=data)
df

Unnamed: 0,names,age,gender,rank
0,steve,20,Male,2
1,john,22,Male,1
2,richard,20,,4
3,sarah,21,Female,5
4,randy,24,,3
5,micheal,23,Male,7
6,julie,22,,6


In [6]:
# Drop 'gender' entirely: 3 of its 7 values are missing.
# This cell rebinds `df` to its own result, so a second execution would no
# longer find the column; errors='ignore' turns that re-run into a no-op
# instead of a KeyError.
df = df.drop(columns='gender', errors='ignore')
df

Unnamed: 0,names,age,rank
0,steve,20,2
1,john,22,1
2,richard,20,4
3,sarah,21,5
4,randy,24,3
5,micheal,23,7
6,julie,22,6


## Label Encoding

Label encoding is a technique used to convert categorical variables with a limited number of unique values into numerical data. It assigns a unique integer to each category, making the data suitable for machine learning algorithms that require numerical input.

In [7]:
# Fitting only learns the name -> integer mapping from the 'names' column;
# no values are converted until transform() is called.
label_encoder = LabelEncoder()
label_encoder.fit(y=df['names'])

In [8]:
# Apply the mapping learned during fit(): each entry of the original 'names'
# column is replaced by its corresponding integer label.
label_encoded_names = label_encoder.transform(y=df['names'])
label_encoded_names

array([6, 0, 4, 5, 3, 2, 1])

## One Hot Encoder
Unlike label encoding, which assigns a single number, one-hot encoding creates a new binary column for each unique category in the original feature.

In [9]:
# sparse_output=False requests a dense NumPy array from transform() rather
# than a SciPy sparse matrix. The encoder expects 2-D input, hence the
# double brackets that select a one-column DataFrame.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(X=df[['names']])

In [11]:
# One row per record, one binary column per distinct name seen during fit().
onehot_encoded_names = onehot_encoder.transform(X=df[['names']])
onehot_encoded_names

array([[0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]])

In [12]:
# Wrap the one-hot array in a DataFrame labelled by category.
# `categories_` is a list holding one array per encoded feature; passing the
# whole list as `columns` makes pandas build a one-level MultiIndex (labels
# become 1-tuples). Only one feature was encoded, so take its array directly
# to get ordinary flat column labels.
onehot_encoded_df = DataFrame(
    onehot_encoded_names,
    columns=onehot_encoder.categories_[0],
    index=df.index,  # share df's index so the assignment below aligns row-for-row
)
# Attach the original names as a Series (single brackets), not a one-column
# DataFrame, for side-by-side comparison with the indicator columns.
onehot_encoded_df['names'] = df['names']
onehot_encoded_df

Unnamed: 0,john,julie,micheal,randy,richard,sarah,steve,names
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,steve
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,john
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,richard
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,sarah
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,randy
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,micheal
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,julie
