In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split


In [2]:
# Sample dataset, written one row per person:
# (education level, city, salary, purchased flag -- the target variable).
sample_rows = [
    ('High_School', 'New York', 50000, 1),
    ('Bachelor', 'Los Angeles', 60000, 0),
    ('Master', 'Chicago', 80000, 1),
    ('PhD', 'Houston', 90000, 1),
    ('Bachelor', 'New York', 55000, 0),
    ('Master', 'Chicago', 85000, 1),
    ('PhD', 'Houston', 92000, 1),
    ('High_School', 'Los Angeles', 48000, 0),
]
column_names = ('Education', 'City', 'Salary', 'Purchased')

# Transpose the rows into the column-oriented dict-of-lists used to build the DataFrame.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*sample_rows))
}


In [3]:
# Build the working DataFrame; column order follows the key order of `data`.
df = pd.DataFrame(data, columns=list(data.keys()))

In [4]:
# Show the DataFrame as this cell's output (bare last expression -> rich display).
df

Unnamed: 0,Education,City,Salary,Purchased
0,High_School,New York,50000,1
1,Bachelor,Los Angeles,60000,0
2,Master,Chicago,80000,1
3,PhD,Houston,90000,1
4,Bachelor,New York,55000,0
5,Master,Chicago,85000,1
6,PhD,Houston,92000,1
7,High_School,Los Angeles,48000,0


In [5]:
# One-Hot Encoding (for Nominal Data)
# Each distinct Education value becomes its own 0/1 indicator column.
ohe = OneHotEncoder(sparse_output=False)
edu_ohe = ohe.fit_transform(df[['Education']])
edu_ohe_df = pd.DataFrame(
    edu_ohe,
    columns=ohe.get_feature_names_out(['Education']),
    index=df.index,  # align on df's own index rather than assuming a default RangeIndex
)
# Drop indicator columns left over from a previous execution of this cell before
# joining; otherwise re-running the cell fails with "columns overlap but no suffix
# specified" because df already contains them.
df = df.drop(columns=edu_ohe_df.columns, errors='ignore').join(edu_ohe_df)
print("\nOne-Hot Encoded Education:")
df.head()



One-Hot Encoded Education:


Unnamed: 0,Education,City,Salary,Purchased,Education_Bachelor,Education_High_School,Education_Master,Education_PhD
0,High_School,New York,50000,1,0.0,1.0,0.0,0.0
1,Bachelor,Los Angeles,60000,0,1.0,0.0,0.0,0.0
2,Master,Chicago,80000,1,0.0,0.0,1.0,0.0
3,PhD,Houston,90000,1,0.0,0.0,0.0,1.0
4,Bachelor,New York,55000,0,1.0,0.0,0.0,0.0


In [6]:
# Label Encoding
# Each category is replaced by an integer code. As the printed result shows, the
# codes follow alphabetical order (Bachelor=0, High_School=1, Master=2, PhD=3),
# not the real-world ordering of education levels.
# NOTE: scikit-learn documents LabelEncoder as a transformer for target labels (y);
# it is applied to a feature column here only to illustrate the technique.
le = LabelEncoder()
le.fit(df['Education'])
df['Education_Label'] = le.transform(df['Education'])

print("\nLabel Encoded Education:")
education_label_view = df[['Education', 'Education_Label']]
print(education_label_view)


Label Encoded Education:
     Education  Education_Label
0  High_School                1
1     Bachelor                0
2       Master                2
3          PhD                3
4     Bachelor                0
5       Master                2
6          PhD                3
7  High_School                1


In [7]:
# Ordinal Encoding (for Ordered Categories)
# The category list is given explicitly so the codes respect the intended order:
# High_School -> 0, Bachelor -> 1, Master -> 2, PhD -> 3.
ordinal_enc = OrdinalEncoder(
    categories=[['High_School', 'Bachelor', 'Master', 'PhD']]
)
ordinal_enc.fit(df[['Education']])

# transform() returns an (n_rows, 1) float array; flatten it to a 1-D column.
education_codes = ordinal_enc.transform(df[['Education']])
df['Education_Ordinal'] = education_codes.ravel()

print("\nOrdinal Encoded Education:")
print(df[['Education', 'Education_Ordinal']])


Ordinal Encoded Education:
     Education  Education_Ordinal
0  High_School                0.0
1     Bachelor                1.0
2       Master                2.0
3          PhD                3.0
4     Bachelor                1.0
5       Master                2.0
6          PhD                3.0
7  High_School                0.0


In [8]:
# Frequency Encoding
# Replace each city with the number of rows in which it occurs.
# (In this toy dataset every city appears exactly twice, so the encoded column
# ends up constant -- see the printed result.)
city_counts = df['City'].value_counts()
freq_enc = city_counts.to_dict()

city_frequency = df['City'].map(freq_enc)
df['City_Frequency'] = city_frequency

print("\nFrequency Encoded City:")
print(df[['City', 'City_Frequency']])



Frequency Encoded City:
          City  City_Frequency
0     New York               2
1  Los Angeles               2
2      Chicago               2
3      Houston               2
4     New York               2
5      Chicago               2
6      Houston               2
7  Los Angeles               2


In [9]:
# Target (Mean) Encoding
# Split first so the encoding statistics are learned from the training rows only
# and nothing from the test rows leaks into the encoding.
X_train, X_test, y_train, y_test = train_test_split(
    df[['City']], df['Purchased'], test_size=0.3, random_state=42
)

# Take explicit copies before adding columns: assigning into the frames returned by
# the split can raise SettingWithCopyWarning depending on the pandas version.
X_train = X_train.copy()
X_test = X_test.copy()

# Compute Mean Target Encoding on Training Set
city_target_means = X_train.join(y_train).groupby('City')['Purchased'].mean().to_dict()

# Fallback for cities that never occur in the training split: without it, .map()
# would leave NaN in the encoded test column.
global_target_mean = y_train.mean()

# Apply Target Encoding
X_train['City_Target'] = X_train['City'].map(city_target_means)
X_test['City_Target'] = (
    X_test['City']
    .map(city_target_means)  # Use training means
    .fillna(global_target_mean)  # unseen city -> overall training mean
)

# Print Results
print("\nTarget Encoded City (Train Set):")
print(X_train)
print("\nTarget Encoded City (Test Set):")
print(X_test)


Target Encoded City (Train Set):
          City  City_Target
7  Los Angeles          0.0
2      Chicago          1.0
4     New York          0.0
3      Houston          1.0
6      Houston          1.0

Target Encoded City (Test Set):
          City  City_Target
1  Los Angeles          0.0
5      Chicago          1.0
0     New York          0.0
