<a href="https://colab.research.google.com/github/MArtistForLife/Encoding/blob/main/Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Encoding

1. Nominal/OHE Encoding
2. Label and Ordinal Encoding
3. Target Guided Ordinal Encoding

### Nominal/OHE Encoding
One hot encoding, also known as nominal encoding, is a technique used to represent categorical data as numerical data, which is more suitable for machine learning algorithms. In this technique, each category is represented as a binary vector where each bit corresponds to a unique category. For example, if we have a categorical variable "color" with three possible values (red, green, blue), we can represent it using one hot encoding as follows:

1. Red: [1, 0, 0]
2. Green: [0, 1, 0]
3. Blue: [0, 0, 1]

This is a sparse matrix (lots of ones and zeroes; usually leads to overfitting, meaning model gets trained very well with training data). Don't use when you have lots of categories, though (one drawback of this tactic).

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
# create dataframe
## need brackets outside the quoted values
## df goes: ({ blah blah: ["values",...]})

colors = pd.DataFrame({"color": ["red", "green", "blue"]})
colors

Unnamed: 0,color
0,red
1,green
2,blue


In [4]:
# create OneHotEncoder
hotEncoder = OneHotEncoder() # it's a function
hotEncoder.fit_transform(colors[["color"]])
## think: double brackets = 2D
## this creates sparse matrix

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [5]:
hotEncoder.fit_transform(colors[["color"]]).toarray()
# we just get the numerical values of each color now
## array sorts alphabetically, though, so blue is 001 and is now first, whereas
## red is 100 and is now last

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [6]:
encodedColors = hotEncoder.fit_transform(colors[["color"]]).toarray()
# save under a variable

In [7]:
import pandas as pd
encodedColors_df = pd.DataFrame(encodedColors, columns = hotEncoder.get_feature_names_out())
encodedColors_df
## hotEncoder takes our column "color" and converts it into multiple binary columns
## like below to help ML algorithms predict better
## and encodedColors is taking the array we made earlier to put into this df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0


In [8]:
# for new data
hotEncoder.transform([["blue"]]).toarray()
## this tells us what the binary numbering for blue is



array([[1., 0., 0.]])

In [9]:
# to concatenate
pd.concat([colors, encodedColors_df], axis=1)
## axis=1 relates to columns
## this gets us a nice table combining the original df with the hot encoder one
## more visually appealing

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,green,0.0,1.0,0.0
2,blue,1.0,0.0,0.0


In [10]:
# try to do the same with this df
import seaborn as sns
tipsSNS = sns.load_dataset("tips")
tipsSNS

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [11]:
hotEncoder.fit_transform(tipsSNS[["day"]])
## i want to try for the day column in tips df

<244x4 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [12]:
dayMatrix = hotEncoder.fit_transform(tipsSNS[["day"]]).toarray()
dayMatrix

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [13]:
import pandas as pd
dayMatrix_df = pd.DataFrame(dayMatrix, columns = hotEncoder.get_feature_names_out())
dayMatrix_df

Unnamed: 0,day_Fri,day_Sat,day_Sun,day_Thur
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,0.0,0.0
240,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,0.0
242,0.0,1.0,0.0,0.0


In [14]:
hotEncoder.transform([["Sat"]]).toarray()



array([[0., 1., 0., 0.]])

In [15]:
pd.concat([tipsSNS, dayMatrix_df], axis=1)
# adds the numerical value column versions of categorical value columns to original df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_Fri,day_Sat,day_Sun,day_Thur
0,16.99,1.01,Female,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,0.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.0,1.0,0.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,0.0,0.0


### Label Encoding
Label encoding and ordinal encoding are two techniques used to encode categorical data as numerical data.

Label encoding involves assigning a unique numerical label to each category in the variable. **The labels are usually assigned in alphabetical order or based on the frequency of the categories.** For example, if we have a categorical variable "color" with three possible values (red, green, blue), we can represent it using label encoding as follows:

1. Red: 1
2. Green: 2
3. Blue: 3

In [16]:
colors
## our original df from earlier

Unnamed: 0,color
0,red
1,green
2,blue


In [18]:
## LabelEncoder is the new thing we'll be using
from sklearn.preprocessing import LabelEncoder
labelEnc = LabelEncoder()

In [26]:
labelEnc.fit_transform(colors[["color"]])
## gives us the numbers for red, green, blue; alphabetically is blue, green, red, which is the 1, 2, 3
## so, red green blue is 2 1 0


  y = column_or_1d(y, warn=True)


array([2, 1, 0])

In [23]:
labelEnc.transform([["red"]])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [24]:
labelEnc.transform([["green"]])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([1])

In [25]:
labelEnc.transform([["blue"]])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([0])

###### In short, what is happening here? Well, the model is assigning the higher value to "red" since red comes last of the three colors alphabetically, so the model considers that red is greater than blue and green.

### Ordinal Encoding - if you want a rank
It is used to encode categorical data that have an intrinsic order or ranking. In this technique, each category is assigned a numerical value based on its position in the order. For example, if we have a categorical variable "education level" with four possible values (high school, college, graduate, post-graduate), we can represent it using ordinal encoding as follows:

1. High school: 1
2. College: 2
3. Graduate: 3
4. Post-graduate: 4

In [32]:
education = pd.DataFrame({"education level": ["high school", "college", "graduate", "post-graduate"]})
education

Unnamed: 0,education level
0,high school
1,college
2,graduate
3,post-graduate


In [39]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
ordinalEnc = OrdinalEncoder(categories=[["high school", "college", "graduate", "post-graduate"]])
## have to specify the categories written in the order you want them, otherwise
## it will sort alphabetically

In [40]:
# to create instance of ordinal encoder
ordinalEnc.fit_transform(education[["education level"]])

array([[0.],
       [1.],
       [2.],
       [3.]])

In [41]:
ordinalEnc.transform([["high school"]])



array([[0.]])

In [42]:
ordinalEnc.transform([["college"]])



array([[1.]])

In [43]:
ordinalEnc.transform([["graduate"]])



array([[2.]])

In [44]:
ordinalEnc.transform([["post-graduate"]])



array([[3.]])

## Target Guided Ordinal Encoding
It is a technique used **to encode categorical variables based on their relationship with the target variable**. This encoding technique is useful when we have a categorical variable with a large number of unique categories, and we want to use this variable as a feature in our machine learning model.

In Target Guided Ordinal Encoding, we replace each category in the categorical variable with a numerical value based on the mean or median of the target variable for that category. This creates a monotonic (*varying in such a way that it either never decreases or never increases*) relationship between the categorical variable and the target variable, which can improve the predictive power of our model.

In [51]:
import pandas as pd

cities = pd.DataFrame({
    "city": ["New York", "London", "Paris", "Tokyo", "New York", "Paris"],
    "price": [200, 150, 300, 250, 180, 320]
})
cities

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [52]:
## we have 2 values for NY and 2 values for Paris, so if we wanted to replace
## the categorical values New York and Paris with numerical values, then we
## take the mean of NY's 2 values and use that as its numerical value, and
## same for Paris; just use groupby() function

In [53]:
together = cities.groupby("city")["price"].mean()
## structure is: DF.groupby("WHAT YOU WANT ONE VALUE")["WHAT YOU ARE BUNCHING TOGETHER"]
together

Unnamed: 0_level_0,price
city,Unnamed: 1_level_1
London,150.0
New York,190.0
Paris,310.0
Tokyo,250.0


In [57]:
# to make a dictionary for these new compiled city-price pairs
dictTime = cities.groupby("city")["price"].mean().to_dict()
dictTime
## yay!!

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [62]:
cities["citiesEncoded"] = cities["city"].map(dictTime)
cities
## now we see the means of each city's prices as its own column in the original df :)

Unnamed: 0,city,price,citiesEncoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [66]:
## if we only wanna see the cities and our new column:
cities[["city", "citiesEncoded"]]
## bc these are the only 2 columns we would wanna give the model for training purposes

Unnamed: 0,city,citiesEncoded
0,New York,190.0
1,London,150.0
2,Paris,310.0
3,Tokyo,250.0
4,New York,190.0
5,Paris,310.0


In [71]:
## extra assignment!!
import seaborn as sns
tipsSNS = sns.load_d"ataset("tips")
tipsSNS

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [73]:
## goal is to convert the categorical variable time based on the total bill
timeEat = tipsSNS.groupby("time")["total_bill"].mean()
timeEat

  timeEat = tipsSNS.groupby("time")["total_bill"].mean()


Unnamed: 0_level_0,total_bill
time,Unnamed: 1_level_1
Lunch,17.168676
Dinner,20.797159


In [80]:
eatToDict = tipsSNS.groupby("time")["total_bill"].mean().to_dict()
eatToDict

  eatToDict = tipsSNS.groupby("time")["total_bill"].mean().to_dict()


{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [81]:
tipsSNS["billByTime"] = tipsSNS["time"].map(eatToDict)
tipsSNS

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,billByTime
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,20.797159


In [82]:
tipsSNS[["time", "billByTime"]]

Unnamed: 0,time,billByTime
0,Dinner,20.797159
1,Dinner,20.797159
2,Dinner,20.797159
3,Dinner,20.797159
4,Dinner,20.797159
...,...,...
239,Dinner,20.797159
240,Dinner,20.797159
241,Dinner,20.797159
242,Dinner,20.797159


In [83]:
lunchRows = tipsSNS.loc[tipsSNS["time"] == "Lunch", ["time", "billByTime"]]
lunchRows

Unnamed: 0,time,billByTime
77,Lunch,17.168676
78,Lunch,17.168676
79,Lunch,17.168676
80,Lunch,17.168676
81,Lunch,17.168676
...,...,...
222,Lunch,17.168676
223,Lunch,17.168676
224,Lunch,17.168676
225,Lunch,17.168676
