<a href="https://colab.research.google.com/github/KrushnaTaur/ML-Practice/blob/main/02_Data_Preprocessing/03_Data_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [2]:
# Build a small in-memory sample dataset: one row per person, with three
# categorical columns (Gender, City, Experience_Level) used by the encoding
# examples in the cells below.

data = dict(
    Name=['Amit', 'Riya', 'Karan', 'Neha', 'Vikas'],
    Gender=['Male', 'Female', 'Male', 'Female', 'Male'],
    City=['Pune', 'Delhi', 'Mumbai', 'Pune', 'Delhi'],
    Experience_Level=['Beginner', 'Intermediate', 'Expert', 'Beginner', 'Expert'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Gender,City,Experience_Level
0,Amit,Male,Pune,Beginner
1,Riya,Female,Delhi,Intermediate
2,Karan,Male,Mumbai,Expert
3,Neha,Female,Pune,Beginner
4,Vikas,Male,Delhi,Expert


In [4]:
# Label encoding: turn the two-valued Gender column into integer codes.
# The fitted encoder stays in `le` so the label -> code mapping can be
# inspected afterwards; the original Gender column is left untouched and the
# codes go into a new Gender_Encoded column.
le = LabelEncoder()
gender_codes = le.fit_transform(df['Gender'])
df['Gender_Encoded'] = gender_codes
df

Unnamed: 0,Name,Gender,City,Experience_Level,Gender_Encoded
0,Amit,Male,Pune,Beginner,1
1,Riya,Female,Delhi,Intermediate,0
2,Karan,Male,Mumbai,Expert,1
3,Neha,Female,Pune,Beginner,0
4,Vikas,Male,Delhi,Expert,1


In [6]:
# Show which integer code the fitted encoder assigned to each original label.
label_to_code = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_to_code)

{'Female': np.int64(0), 'Male': np.int64(1)}


In [9]:
# One-hot encoding with pandas (for multi-category columns like City).
# The City column is replaced by one indicator column per distinct value:
# City_Delhi, City_Mumbai, City_Pune. drop_first=False keeps all of them.

df_encoded = pd.get_dummies(data=df, columns=['City'], drop_first=False)
df_encoded

Unnamed: 0,Name,Gender,Experience_Level,Gender_Encoded,City_Delhi,City_Mumbai,City_Pune
0,Amit,Male,Beginner,1,False,False,True
1,Riya,Female,Intermediate,0,True,False,False
2,Karan,Male,Expert,1,False,True,False
3,Neha,Female,Beginner,0,False,False,True
4,Vikas,Male,Expert,1,True,False,False


In [10]:
# Using OneHotEncoder (from sklearn)
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes fit_transform return a dense array, which can be
# wrapped directly in a DataFrame below.
ohe = OneHotEncoder(sparse_output=False)
city_encoded = ohe.fit_transform(df[['City']])

# Convert back to a DataFrame.
# Reuse df's index: pd.concat(axis=1) aligns rows on index labels, so a fresh
# default RangeIndex would only line up when df also has a default index.
# With a filtered or re-indexed df it would silently produce misaligned rows
# and NaNs.
city_df = pd.DataFrame(
    city_encoded,
    columns=ohe.get_feature_names_out(['City']),
    index=df.index,
)
df_ohe = pd.concat([df, city_df], axis=1)
df_ohe

Unnamed: 0,Name,Gender,City,Experience_Level,Gender_Encoded,City_Delhi,City_Mumbai,City_Pune
0,Amit,Male,Pune,Beginner,1,0.0,0.0,1.0
1,Riya,Female,Delhi,Intermediate,0,1.0,0.0,0.0
2,Karan,Male,Mumbai,Expert,1,0.0,1.0,0.0
3,Neha,Female,Pune,Beginner,0,0.0,0.0,1.0
4,Vikas,Male,Delhi,Expert,1,1.0,0.0,0.0


In [11]:
# Ordinal encoding by hand (for ordered categories like Experience Level).
# Each level is mapped to its 1-based rank in the listed order:
# Beginner -> 1, Intermediate -> 2, Expert -> 3.
exp_map = {
    level: rank
    for rank, level in enumerate(['Beginner', 'Intermediate', 'Expert'], start=1)
}
df['Experience_Level_Encoded'] = df['Experience_Level'].map(exp_map)
df

Unnamed: 0,Name,Gender,City,Experience_Level,Gender_Encoded,Experience_Level_Encoded
0,Amit,Male,Pune,Beginner,1,1
1,Riya,Female,Delhi,Intermediate,0,2
2,Karan,Male,Mumbai,Expert,1,3
3,Neha,Female,Pune,Beginner,0,1
4,Vikas,Male,Delhi,Expert,1,3


In [12]:
# The same ordinal encoding via sklearn's OrdinalEncoder.
# The explicit `categories` list fixes the order Beginner < Intermediate < Expert.
# NOTE: this overwrites the Experience_Level_Encoded column produced by the
# manual mapping; the codes here are 0-based floats (0.0, 1.0, 2.0) rather
# than the 1-based integers used there.

oe = OrdinalEncoder(categories=[['Beginner', 'Intermediate', 'Expert']])
experience_codes = oe.fit_transform(df[['Experience_Level']])
df['Experience_Level_Encoded'] = experience_codes
df

Unnamed: 0,Name,Gender,City,Experience_Level,Gender_Encoded,Experience_Level_Encoded
0,Amit,Male,Pune,Beginner,1,0.0
1,Riya,Female,Delhi,Intermediate,0,1.0
2,Karan,Male,Mumbai,Expert,1,2.0
3,Neha,Female,Pune,Beginner,0,0.0
4,Vikas,Male,Delhi,Expert,1,2.0


In [13]:
# Encoding multiple columns at once.
# Every column named in cat_cols is one-hot encoded in a single call.
# drop_first=True drops the first indicator of each column (here
# Gender_Female and City_Delhi), leaving Gender_Male, City_Mumbai, City_Pune.
cat_cols = ['Gender', 'City']
df_multi_encoded = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)
df_multi_encoded

Unnamed: 0,Name,Experience_Level,Gender_Encoded,Experience_Level_Encoded,Gender_Male,City_Mumbai,City_Pune
0,Amit,Beginner,1,0.0,True,False,True
1,Riya,Intermediate,0,1.0,False,False,False
2,Karan,Expert,1,2.0,True,True,False
3,Neha,Beginner,0,0.0,False,False,True
4,Vikas,Expert,1,2.0,True,False,False


In [14]:
# Mini Task
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [15]:
# Mini-task input: a fresh frame with Name plus two categorical columns.
# NOTE: this rebinds `data` and `df`, replacing the frame used by the
# earlier examples.
data = dict(
    Name=['Amit', 'Riya', 'Karan', 'Neha', 'Vikas'],
    Gender=['Male', 'Female', 'Male', 'Female', 'Male'],
    City=['Pune', 'Delhi', 'Mumbai', 'Pune', 'Delhi'],
)
df = pd.DataFrame(data=data)
print("Before Encoding:\n", df)

Before Encoding:
     Name  Gender    City
0   Amit    Male    Pune
1   Riya  Female   Delhi
2  Karan    Male  Mumbai
3   Neha  Female    Pune
4  Vikas    Male   Delhi


In [16]:
# Binary column: replace the Gender strings in place with their integer
# label codes (the original string values are no longer in df afterwards).
le = LabelEncoder()
gender_codes = le.fit_transform(df['Gender'])
df['Gender'] = gender_codes

In [17]:
# Multi-category column: expand City into one indicator column per city.
# NOTE: `df` is rebound to the encoded frame, so the City column no longer
# exists afterwards; re-run the sample-data cell before running this again.
df = pd.get_dummies(data=df, columns=['City'], drop_first=False)

In [18]:
# Show the frame after both encodings have been applied.
header = "After Encoding:\n"
print(header, df)

After Encoding:
     Name  Gender  City_Delhi  City_Mumbai  City_Pune
0   Amit       1       False        False       True
1   Riya       0        True        False      False
2  Karan       1       False         True      False
3   Neha       0       False        False       True
4  Vikas       1        True        False      False
