# Handeling Categorical variables

First, download the dataset and save it in your current working directory with the name “adult-all.csv”:

https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv

Import the dataset as pandas dataframe:

In [1]:
import pandas as pd

adult = pd.read_csv("adult-all.csv")

#Definethe column names
column_names = [
    "Age", "Workclass", "Final Weight", "Education", "Education Number of Years",
    "Marital Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per Week", "Native Country", "Income"
]

# Assign the column names to the DataFrame
adult.columns = column_names

  from pandas.core import (


In [2]:
adult.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# for instance, let's just ignore na values:
adult = adult.dropna()

In [4]:
## features by type:
f_cat = adult.select_dtypes(include=['object', 'bool']).columns
print(f_cat)

f_num = adult.select_dtypes(include=['int64', 'float64']).columns
print(f_num)



Index(['Workclass', 'Education', 'Marital Status', 'Occupation',
       'Relationship', 'Race', 'Sex', 'Native Country', 'Income'],
      dtype='object')
Index(['Age', 'Final Weight', 'Education Number of Years', 'Capital Gain',
       'Capital Loss', 'Hours per Week'],
      dtype='object')


Model 1: only numerical features:

In [5]:
print(adult[f_cat])

          Workclass     Education      Marital Status         Occupation  \
0           Private          11th       Never-married  Machine-op-inspct   
1           Private       HS-grad  Married-civ-spouse    Farming-fishing   
2         Local-gov    Assoc-acdm  Married-civ-spouse    Protective-serv   
3           Private  Some-college  Married-civ-spouse  Machine-op-inspct   
4                 ?  Some-college       Never-married                  ?   
...             ...           ...                 ...                ...   
48837       Private    Assoc-acdm  Married-civ-spouse       Tech-support   
48838       Private       HS-grad  Married-civ-spouse  Machine-op-inspct   
48839       Private       HS-grad             Widowed       Adm-clerical   
48840       Private       HS-grad       Never-married       Adm-clerical   
48841  Self-emp-inc       HS-grad  Married-civ-spouse    Exec-managerial   

      Relationship   Race     Sex Native Country Income  
0        Own-child  Black    

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np

y = adult['Income'].copy()

# Select only numerical features for X
X_num = adult[f_num].copy()


# Impute missing values if there are any, using mean for numerical features
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_num_imputed = imputer.fit_transform(X_num)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_num_imputed, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


from sklearn.linear_model import LogisticRegression

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions, pos_label='>50K')}")
print(f"Recall: {recall_score(y_test, predictions, pos_label='>50K')}")
baseline_accuracy = f1_score(y_test, predictions, pos_label='>50K')
print(f"F1 Score: {baseline_accuracy}")



Accuracy: 0.8185075237997748
Precision: 0.7027450980392157
Recall: 0.39126637554585153
F1 Score: 0.5026647966339411


The categorical variables contains an information that we shouldn't ignore.

Let's convert them to numerical. Sex can be encoded as {0,1} values.

For example: Male=1 & Female=0

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# We can use if else
adult['Sex_encoded'] = adult['Sex'].apply(lambda x: 1 if x == 'Male' else 0)

adult['Sex_encoded']

0        1
1        1
2        1
3        1
4        0
        ..
48837    0
48838    1
48839    0
48840    1
48841    0
Name: Sex_encoded, Length: 48842, dtype: int64

In [8]:
# or we can use the LabelEncoder from sklearn.preprocessing:

sex_encoder = LabelEncoder()
adult['Sex_encoded'] = sex_encoder.fit_transform(adult['Sex'])
adult['Sex_encoded']


0        1
1        1
2        1
3        1
4        0
        ..
48837    0
48838    1
48839    0
48840    1
48841    0
Name: Sex_encoded, Length: 48842, dtype: int32

In [9]:
adult.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income,Sex_encoded
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,1
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,1
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,0


In [10]:
# selecting feautures to include in the model: numerical feautures + encoded sex:
f_num_plus_sex = list(f_num) 
f_num_plus_sex.append('Sex_encoded')
f_num_plus_sex

['Age',
 'Final Weight',
 'Education Number of Years',
 'Capital Gain',
 'Capital Loss',
 'Hours per Week',
 'Sex_encoded']

In [11]:
# Now, explicitly select features and target for the model to avoid any index issues
X = adult[f_num_plus_sex].copy()
y = adult['Income'].copy()

X.head()

Unnamed: 0,Age,Final Weight,Education Number of Years,Capital Gain,Capital Loss,Hours per Week,Sex_encoded
0,25,226802,7,0,0,40,1
1,38,89814,9,0,0,50,1
2,28,336951,12,0,0,40,1
3,44,160323,10,7688,0,40,1
4,18,103497,10,0,0,30,0


build the second model using those features:

In [12]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predictions
predictions = model.predict(X_test_scaled)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions, pos_label='>50K')}")
print(f"Recall: {recall_score(y_test, predictions, pos_label='>50K')}")
print(f"F1 Score: {f1_score(y_test, predictions, pos_label='>50K')}")

Accuracy: 0.8265943289998976
Precision: 0.7110481586402266
Recall: 0.43842794759825326
F1 Score: 0.5424095083738519


comparing the models accuracy:

In [13]:
print(f"F1 Score of model without sex attribute: {baseline_accuracy}")
print(f"F1 Score of model with sex attribute: {f1_score(y_test, predictions, pos_label='>50K')}")

F1 Score of model without sex attribute: 0.5026647966339411
F1 Score of model with sex attribute: 0.5424095083738519


let's build a third model that includes "education" attribute too.
For this feature we will use one hot encoding:

In [14]:
#One-hot encode the 'Education' column
education_dummies = pd.get_dummies(adult['Education'], prefix='Education')

# Concatenate the one-hot encoded 'Education' dataframe with the original 'adult' dataframe
adult = pd.concat([adult, education_dummies], axis=1)

adult.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,...,Education_9th,Education_Assoc-acdm,Education_Assoc-voc,Education_Bachelors,Education_Doctorate,Education_HS-grad,Education_Masters,Education_Preschool,Education_Prof-school,Education_Some-college
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,...,False,False,False,False,False,False,False,False,False,False
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,...,False,False,False,False,False,True,False,False,False,False
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,...,False,True,False,False,False,False,False,False,False,False
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,...,False,False,False,False,False,False,False,False,False,True
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,...,False,False,False,False,False,False,False,False,False,True


In [15]:
# Ensure f_num is a list and then create a new list of features including 'Sex_encoded' and the education dummies
features = list(f_num) + ['Sex_encoded'] + list(education_dummies.columns)

# Select features and target for the model
X = adult[features].copy()
y = adult['Income'].copy()

# Drop rows with any missing values in X and y
X.dropna(inplace=True)
y = y.loc[X.index]  # Ensure y is aligned with X after dropping rows

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predictions
predictions = model.predict(X_test_scaled)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions, pos_label='>50K')}")
print(f"Recall: {recall_score(y_test, predictions, pos_label='>50K')}")
print(f"F1 Score: {f1_score(y_test, predictions, pos_label='>50K')}")

Accuracy: 0.8255706827720339
Precision: 0.7069209039548022
Recall: 0.437117903930131
F1 Score: 0.54020507285483


Using all features:

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


import pandas as pd

adult = pd.read_csv("adult-all.csv")

# Definethe column names
column_names = [
    "Age", "Workclass", "Final Weight", "Education", "Education Number of Years",
    "Marital Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per Week", "Native Country", "Income"
]

# Assign the column names to the DataFrame
adult.columns = column_names


categorical_features = adult.select_dtypes(include=['object']).columns.tolist()

print(categorical_features)

if 'Income' in categorical_features:
    categorical_features.remove('Income')

# Perform one-hot encoding for the categorical features:
adult = pd.get_dummies(adult, columns=categorical_features)

# Define the features X and the target y
X = adult.drop('Income', axis=1)
y = adult['Income']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the logistic regression model
model = LogisticRegression() 
model.fit(X_train_scaled, y_train)

# Make predictions and evaluate the model
predictions = model.predict(X_test_scaled)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions, pos_label='>50K')}")
print(f"Recall: {recall_score(y_test, predictions, pos_label='>50K')}")
print(f"F1 Score: {f1_score(y_test, predictions, pos_label='>50K')}")

['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native Country', 'Income']
Accuracy: 0.8572013512130208
Precision: 0.7368978295394388
Recall: 0.6078602620087337
F1 Score: 0.6661880832735104


In [17]:
adult.head()

Unnamed: 0,Age,Final Weight,Education Number of Years,Capital Gain,Capital Loss,Hours per Week,Income,Workclass_?,Workclass_Federal-gov,Workclass_Local-gov,...,Native Country_Portugal,Native Country_Puerto-Rico,Native Country_Scotland,Native Country_South,Native Country_Taiwan,Native Country_Thailand,Native Country_Trinadad&Tobago,Native Country_United-States,Native Country_Vietnam,Native Country_Yugoslavia
0,25,226802,7,0,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,0,0,50,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,0,0,40,>50K,False,False,True,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,7688,0,40,>50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,18,103497,10,0,0,30,<=50K,True,False,False,...,False,False,False,False,False,False,False,True,False,False


## Exercice:

Using the Adult Dataset, apply different encoding techniques (label, one-hot, binary, frequency) to the categorical variables and train a Logistic Regression model for each encoding strategy. Compare the performance of each model and discuss your findings.

Critical Thinking Exercise: How can you handle new categories in the test dataset that were not present during the training phase after applying categorical encoding?

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Adult Dataset
adult = pd.read_csv("adult-all.csv")

# Define the column names
column_names = [
    "Age", "Workclass", "Final Weight", "Education", "Education Number of Years",
    "Marital Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per Week", "Native Country", "Income"
]

# Assign the column names to the DataFrame
adult.columns = column_names

# Preprocessing
# Handling missing values
adult.dropna(inplace=True)

  from pandas.core import (


In [2]:
adult.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
adult.describe(include='all')

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income
count,48842.0,48842,48842.0,48842,48842.0,48842,48842,48842,48842,48842,48842.0,48842.0,48842.0,48842,48842
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832,37155
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,,


In [4]:
# Encoding categorical variables
# Label Encoding
label_encoded_data = adult.copy()
label_encoded_data['Workclass'] = label_encoded_data['Workclass'].astype('category').cat.codes
label_encoded_data['Education'] = label_encoded_data['Education'].astype('category').cat.codes
label_encoded_data['Marital Status'] = label_encoded_data['Marital Status'].astype('category').cat.codes
label_encoded_data['Occupation'] = label_encoded_data['Occupation'].astype('category').cat.codes
label_encoded_data['Relationship'] = label_encoded_data['Relationship'].astype('category').cat.codes
label_encoded_data['Race'] = label_encoded_data['Race'].astype('category').cat.codes
label_encoded_data['Sex'] = label_encoded_data['Sex'].astype('category').cat.codes
label_encoded_data['Native Country'] = label_encoded_data['Native Country'].astype('category').cat.codes
label_encoded_data['Income'] = label_encoded_data['Income'].astype('category').cat.codes

In [5]:
label_encoded_data.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0


In [6]:
# One-Hot Encoding
one_hot_encoded_data = pd.get_dummies(label_encoded_data, 
                                      columns=['Workclass', 'Education', 'Marital Status', 
                                               'Occupation', 'Relationship', 'Race', 'Sex'])

In [7]:
one_hot_encoded_data.head()

Unnamed: 0,Age,Final Weight,Education Number of Years,Capital Gain,Capital Loss,Hours per Week,Native Country,Income,Workclass_0,Workclass_1,...,Relationship_3,Relationship_4,Relationship_5,Race_0,Race_1,Race_2,Race_3,Race_4,Sex_0,Sex_1
0,25,226802,7,0,0,40,39,0,False,False,...,True,False,False,False,False,True,False,False,False,True
1,38,89814,9,0,0,50,39,0,False,False,...,False,False,False,False,False,False,False,True,False,True
2,28,336951,12,0,0,40,39,1,False,False,...,False,False,False,False,False,False,False,True,False,True
3,44,160323,10,7688,0,40,39,1,False,False,...,False,False,False,False,False,True,False,False,False,True
4,18,103497,10,0,0,30,39,0,True,False,...,True,False,False,False,False,False,False,True,True,False


In [8]:
# Binary Encoding
binary_encoded_data = label_encoded_data.copy()
binary_encoded_data = pd.get_dummies(binary_encoded_data, 
                                     columns=['Workclass', 'Education', 'Marital Status', 
                                              'Occupation', 'Relationship', 'Race', 'Sex'], drop_first=True)

In [9]:
binary_encoded_data.head()

Unnamed: 0,Age,Final Weight,Education Number of Years,Capital Gain,Capital Loss,Hours per Week,Native Country,Income,Workclass_1,Workclass_2,...,Relationship_1,Relationship_2,Relationship_3,Relationship_4,Relationship_5,Race_1,Race_2,Race_3,Race_4,Sex_1
0,25,226802,7,0,0,40,39,0,False,False,...,False,False,True,False,False,False,True,False,False,True
1,38,89814,9,0,0,50,39,0,False,False,...,False,False,False,False,False,False,False,False,True,True
2,28,336951,12,0,0,40,39,1,False,True,...,False,False,False,False,False,False,False,False,True,True
3,44,160323,10,7688,0,40,39,1,False,False,...,False,False,False,False,False,False,True,False,False,True
4,18,103497,10,0,0,30,39,0,False,False,...,False,False,True,False,False,False,False,False,True,False


In [10]:
# Frequency Encoding
frequency_encoded_data = label_encoded_data.copy()
for column in ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex']:
    frequency_map = frequency_encoded_data[column].value_counts(normalize=True)
    frequency_encoded_data[column] = frequency_encoded_data[column].map(frequency_map)

In [11]:
frequency_encoded_data.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Education Number of Years,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income
0,25,0.694198,226802,0.037099,7,0.329982,0.061873,0.155215,0.095922,0.668482,0,0,40,39,0
1,38,0.694198,89814,0.323164,9,0.458192,0.030507,0.403669,0.855043,0.668482,0,0,50,39,0
2,28,0.064207,336951,0.032779,12,0.458192,0.020126,0.403669,0.855043,0.668482,0,0,40,39,1
3,44,0.694198,160323,0.222718,10,0.458192,0.061873,0.403669,0.095922,0.668482,7688,0,40,39,1
4,18,0.057307,103497,0.222718,10,0.329982,0.057512,0.155215,0.855043,0.331518,0,0,30,39,0


In [12]:
# Handle 'Native Country' for Frequency Encoding
frequency_map_country = frequency_encoded_data['Native Country'].value_counts(normalize=True)
frequency_encoded_data['Native Country'] = frequency_encoded_data['Native Country'].map(frequency_map_country)

In [13]:
# Splitting data into train and test sets
X_label = label_encoded_data.drop('Income', axis=1)
y_label = label_encoded_data['Income']
X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(X_label, y_label, test_size=0.2, random_state=42)

In [14]:
X_one_hot = one_hot_encoded_data.drop('Income', axis=1)
y_one_hot = one_hot_encoded_data['Income']
X_train_one_hot, X_test_one_hot, y_train_one_hot, y_test_one_hot = train_test_split(X_one_hot, y_one_hot, test_size=0.2, random_state=42)

In [15]:
X_binary = binary_encoded_data.drop('Income', axis=1)
y_binary = binary_encoded_data['Income']
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)

In [16]:
X_frequency = frequency_encoded_data.drop('Income', axis=1)
y_frequency = frequency_encoded_data['Income']
X_train_frequency, X_test_frequency, y_train_frequency, y_test_frequency = train_test_split(X_frequency, y_frequency, test_size=0.2, random_state=42)

In [18]:
# Scale the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_label_scaled = scaler.fit_transform(X_train_label)
X_test_label_scaled = scaler.transform(X_test_label)

In [19]:
# Scale the features
scaler = StandardScaler()
X_train_one_hot_scaled = scaler.fit_transform(X_train_one_hot)
X_test_one_hot_scaled = scaler.transform(X_test_one_hot)

In [20]:
# Scale the features
scaler = StandardScaler()
X_train_binary_scaled = scaler.fit_transform(X_train_binary)
X_test_binary_scaled = scaler.transform(X_test_binary)

In [21]:
scaler = StandardScaler()
X_train_frequency_scaled = scaler.fit_transform(X_train_frequency)
X_test_frequency_scaled = scaler.transform(X_test_frequency)

In [22]:
# Training Logistic Regression models
logreg_label = LogisticRegression(max_iter=5000)
logreg_label.fit(X_train_label_scaled, y_train_label)

In [23]:
logreg_one_hot = LogisticRegression(max_iter=5000)
logreg_one_hot.fit(X_train_one_hot_scaled, y_train_one_hot)

In [24]:
logreg_binary = LogisticRegression(max_iter=5000)
logreg_binary.fit(X_train_binary_scaled, y_train_binary)

In [21]:
logreg_frequency = LogisticRegression(max_iter=5000)
logreg_frequency.fit(X_train_frequency_scaled, y_train_frequency)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
# Evaluating models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [23]:
accuracy_label = evaluate_model(logreg_label, X_test_label_scaled, y_test_label)
accuracy_one_hot = evaluate_model(logreg_one_hot, X_test_one_hot_scaled, y_test_one_hot)
accuracy_binary = evaluate_model(logreg_binary, X_test_binary_scaled, y_test_binary)
accuracy_frequency = evaluate_model(logreg_frequency, X_test_frequency_scaled, y_test_frequency)

# Comparing performance
print("Accuracy with Label Encoding:", accuracy_label)
print("Accuracy with One-Hot Encoding:", accuracy_one_hot)
print("Accuracy with Binary Encoding:", accuracy_binary)
print("Accuracy with Frequency Encoding:", accuracy_frequency)

Accuracy with Label Encoding: 0.8110349063363701
Accuracy with One-Hot Encoding: 0.8526973078104207
Accuracy with Binary Encoding: 0.8539256832838571
Accuracy with Frequency Encoding: 0.8423584809089979
