# One-Hot Encoder

### Example 1:

In [3]:
import pandas as pd

data = pd.DataFrame({
    "Gender": ["male", "female", "male", "female", "male"],
    "Age": [25, 30, 35, 40, 45],
    "Salary": [50000, 60000, 70000, 80000, 90000],
    "Purchased": [0, 1, 0, 1, 1]
})

#print(data)
data

Unnamed: 0,Gender,Age,Salary,Purchased
0,male,25,50000,0
1,female,30,60000,1
2,male,35,70000,0
3,female,40,80000,1
4,male,45,90000,1


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# define the columns to be encoded
columns_to_encode = ["Gender"]

# create the transformer
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), columns_to_encode)],
    remainder="passthrough"
)

# apply the transformer to the dataset
data_encoded = ct.fit_transform(data)

print(data_encoded)

[[0.0e+00 1.0e+00 2.5e+01 5.0e+04 0.0e+00]
 [1.0e+00 0.0e+00 3.0e+01 6.0e+04 1.0e+00]
 [0.0e+00 1.0e+00 3.5e+01 7.0e+04 0.0e+00]
 [1.0e+00 0.0e+00 4.0e+01 8.0e+04 1.0e+00]
 [0.0e+00 1.0e+00 4.5e+01 9.0e+04 1.0e+00]]


array([[0.0e+00, 1.0e+00, 2.5e+01, 5.0e+04, 0.0e+00],
       [1.0e+00, 0.0e+00, 3.0e+01, 6.0e+04, 1.0e+00],
       [0.0e+00, 1.0e+00, 3.5e+01, 7.0e+04, 0.0e+00],
       [1.0e+00, 0.0e+00, 4.0e+01, 8.0e+04, 1.0e+00],
       [0.0e+00, 1.0e+00, 4.5e+01, 9.0e+04, 1.0e+00]])

In [6]:
from sklearn.linear_model import LogisticRegression

# separate the input variables from the target variable
X = data_encoded[:, :-1]
y = data_encoded[:, -1]

# train a logistic regression model on the encoded dataset
model = LogisticRegression().fit(X, y)

# make predictions on new data
new_data = [[1, 0, 50, 100000]]  # male, 50 years old, salary $100,000
prediction = model.predict(new_data)
print(prediction)

[1.]


### Example 2:

In [None]:
# 2 Class

from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Create an example dataset with a categorical feature
X = np.array([['sup'], ['sub']])

# Create the One Hot Encoder object
encoder = OneHotEncoder()

# Fit the encoder to the dataset and transform the data
X_encoded = encoder.fit_transform(X)

# Print the encoded data
print(X_encoded.toarray())

### Example 3:

In [14]:
# 4 Class
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Create an example dataset with a categorical feature
X = np.array([['bird'], ['cat'],['dog'], ['dog'],['bird'], ['insect']])

# Create the One Hot Encoder object
encoder = OneHotEncoder()

# Fit the encoder to the dataset and transform the data
X_encoded = encoder.fit_transform(X)

# Print the encoded data
print(X_encoded.toarray())

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]


### Example 4:

In [15]:
import pandas as pd

data = pd.DataFrame({
    "Disease": ["positive", "negative", "negative", "positive", "negative"],
    "Age": [25, 30, 35, 40, 45],
    "BloodPressure": [120, 130, 140, 150, 160]
})

print(data)

    Disease  Age  BloodPressure
0  positive   25            120
1  negative   30            130
2  negative   35            140
3  positive   40            150
4  negative   45            160


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# define the columns to be encoded
columns_to_encode = ["Disease"]

# create the transformer
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), columns_to_encode)],
    remainder="passthrough"
)

# apply the transformer to the dataset
data_encoded = ct.fit_transform(data)

print(data_encoded)

[[  0.   1.  25. 120.]
 [  1.   0.  30. 130.]
 [  1.   0.  35. 140.]
 [  0.   1.  40. 150.]
 [  1.   0.  45. 160.]]


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# separate the input variables from the target variable
X = data_encoded[:, 1:]
y = data_encoded[:, 0]

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train a logistic regression model on the training set
model = LogisticRegression().fit(X_train, y_train)

# make predictions on the testing set
y_pred = model.predict(X_test)

# evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.0


In [24]:
X = data_encoded[:, 1:]
y = data_encoded[:, 0]
#print(X)
print(y)
#y
data_encoded

[[  1.  25. 120.]
 [  0.  30. 130.]
 [  0.  35. 140.]
 [  1.  40. 150.]
 [  0.  45. 160.]]
[0. 1. 1. 0. 1.]


array([[  0.,   1.,  25., 120.],
       [  1.,   0.,  30., 130.],
       [  1.,   0.,  35., 140.],
       [  0.,   1.,  40., 150.],
       [  1.,   0.,  45., 160.]])