In [1]:
# Categorical data is a data type that represents groups or categories
# denoting quality characteristics. - color, age, height, etc.

# Categorical data is divided into two main types:

# Nominal data - This is categorical data with no explicit order or ranking.
# Items can simply be assigned to a specific category. Examples: colors, gender, car brands.

# Ordinal data - This is categorical data with an explicit order or ranking between categories.
# Items can be ordered relative to each other, but the difference between categories may not be uniform.
# Examples: level of education (for example, “higher”, “secondary”, “primary”),
# ratings (e.g. "low", "medium", "high").




In [2]:
import numpy as np
import pandas as pd

In [3]:
x = pd.DataFrame(np.array(["M", "O-", "medium",
                            "M", "O-", "high",
                            "F", "o+", "high",
                            "F", "AB", "low",
                            "F", "AB", "medium"])
                            .reshape(5, 3))

x.columns = ["Gender", "Bloodgroup", "health_status"]

In [4]:
x

Unnamed: 0,Gender,Bloodgroup,health_status
0,M,O-,medium
1,M,O-,high
2,F,o+,high
3,F,AB,low
4,F,AB,medium


In [12]:
# example of encoding the health_status column using the ordinal encoding method
# in this case the variables were encoded with a certain pattern
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(dtype="int")

x.health_status = encoder.fit_transform(x.health_status.values.reshape(-1, 1))

In [13]:
x

Unnamed: 0,Gender,Bloodgroup,health_status
0,M,O-,2
1,M,O-,0
2,F,o+,0
3,F,AB,1
4,F,AB,2


In [7]:
# now we use another encoding method - LabelEncoder

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

x['health_status'] = encoder.fit_transform(x['health_status'])



In [8]:
x

Unnamed: 0,Gender,Bloodgroup,health_status
0,M,O-,2
1,M,O-,0
2,F,o+,0
3,F,AB,1
4,F,AB,2


In [9]:
# we can also try One-Hot-encoder

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(dtype="int")

# Convert and encode categorical columns
encoded_data = encoder.fit_transform(x[["Gender", "Bloodgroup", "health_status"]]).toarray()

# Create a new DataFrame taking into account all the created columns
n = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(["Gender", "Bloodgroup", "health_status"]))

# Adding a categorical column "health_status" from the original DataFrame
n["health_status"] = x["health_status"]

# Output the result
n



Unnamed: 0,Gender_F,Gender_M,Bloodgroup_AB,Bloodgroup_O-,Bloodgroup_o+,health_status_0,health_status_1,health_status_2,health_status
0,0,1,0,1,0,0,0,1,2
1,0,1,0,1,0,1,0,0,0
2,1,0,0,0,1,1,0,0,0
3,1,0,1,0,0,0,1,0,1
4,1,0,1,0,0,0,0,1,2
