# One Hot Encoder

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [4]:
data = {'sales': [100000,222000,1000000,522000,111111,222222,1111111,20000,75000,90000,1000000,10000],
      'city': ['Tampa','Tampa','Orlando','Jacksonville','Miami','Jacksonville','Miami','Miami','Orlando','Orlando','Orlando','Orlando'],
      'size': ['Small', 'Medium','Large','Large','Small','Medium','Large','Small','Medium','Medium','Medium','Small']}
df = pd.DataFrame(data=data)
df

Unnamed: 0,sales,city,size
0,100000,Tampa,Small
1,222000,Tampa,Medium
2,1000000,Orlando,Large
3,522000,Jacksonville,Large
4,111111,Miami,Small
5,222222,Jacksonville,Medium
6,1111111,Miami,Large
7,20000,Miami,Small
8,75000,Orlando,Medium
9,90000,Orlando,Medium


In [5]:
print(df['city'].unique())

['Tampa' 'Orlando' 'Jacksonville' 'Miami']


In [6]:
OHE = OneHotEncoder(handle_unknown='ignore', sparse_output= False).set_output(transform='pandas')
OHE_Transform = OHE.fit_transform(df[['city']])
OHE_Transform

Unnamed: 0,city_Jacksonville,city_Miami,city_Orlando,city_Tampa
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0
8,0.0,0.0,1.0,0.0
9,0.0,0.0,1.0,0.0


In [7]:
df_1 = pd.concat([df, OHE_Transform], axis=1).drop(columns=['city'])
print(df_1)
print(df)

      sales    size  city_Jacksonville  city_Miami  city_Orlando  city_Tampa
0    100000   Small                0.0         0.0           0.0         1.0
1    222000  Medium                0.0         0.0           0.0         1.0
2   1000000   Large                0.0         0.0           1.0         0.0
3    522000   Large                1.0         0.0           0.0         0.0
4    111111   Small                0.0         1.0           0.0         0.0
5    222222  Medium                1.0         0.0           0.0         0.0
6   1111111   Large                0.0         1.0           0.0         0.0
7     20000   Small                0.0         1.0           0.0         0.0
8     75000  Medium                0.0         0.0           1.0         0.0
9     90000  Medium                0.0         0.0           1.0         0.0
10  1000000  Medium                0.0         0.0           1.0         0.0
11    10000   Small                0.0         0.0           1.0         0.0

# Components Breakdown
## OneHotEncoder:
    OneHotEncoder is a class in scikit-learn used for converting categorical data into a one-hot (or dummy) encoded format. One-hot encoding transforms categorical variables into a series of binary columns.

## handle_unknown='ignore':

### handle_unknown: 
    This parameter specifies how to handle unknown categories encountered during transformation. 
    'ignore': When set to 'ignore', the encoder will ignore any unknown categories during transformation. This prevents errors when new, unseen categories appear in the test data.
    
## sparse_output=False:

### sparse_output: 
    This parameter determines whether the output should be in a sparse matrix format.
    False: Setting this to False means the output will be a dense array, which is a standard 2D numpy array. This is useful when you prefer to work with dense arrays or DataFrames.

## .set_output(transform='pandas'):

### set_output: 
    This method is used to configure the output format of transformers in scikit-learn.
    transform='pandas': When this is set, the output of the transformation will be a pandas DataFrame instead of a numpy array. This is beneficial because pandas DataFrames come with additional functionality and are more user-friendly for data manipulation and analysis.

# Ordinal Encoding

In [15]:
OE = OrdinalEncoder()
df['size'] = OE.fit_transform(df[['size']])
df

Unnamed: 0,sales,city,size
0,100000,Tampa,2.0
1,222000,Tampa,1.0
2,1000000,Orlando,0.0
3,522000,Jacksonville,0.0
4,111111,Miami,2.0
5,222222,Jacksonville,1.0
6,1111111,Miami,0.0
7,20000,Miami,2.0
8,75000,Orlando,1.0
9,90000,Orlando,1.0


<b> if we didn't provide the category it will be category in ABC order in the above code

In [18]:
size_category = df['size'].unique()
size_category

array(['Small', 'Medium', 'Large'], dtype=object)

In [19]:
OE = OrdinalEncoder(categories=[size_category])
df['size'] = OE.fit_transform(df[['size']])
df

Unnamed: 0,sales,city,size
0,100000,Tampa,0.0
1,222000,Tampa,1.0
2,1000000,Orlando,2.0
3,522000,Jacksonville,2.0
4,111111,Miami,0.0
5,222222,Jacksonville,1.0
6,1111111,Miami,2.0
7,20000,Miami,0.0
8,75000,Orlando,1.0
9,90000,Orlando,1.0


# Lable Encoder

In [None]:
LE = LabelEncoder()
df['size'] = LE.fit_transform(df[['size']])
df
# this will be categorized in ABC order