# One-Hot Encoding: Category and Gender

This notebook builds a small, static sample dataset (10 rows × 10 columns, no randomness) and one-hot encodes two of its categorical features:
- Category: [Good, Better, Best]
- Gender: [Male, Female]

We'll preview the original DataFrame, apply encoding, and inspect the transformed columns.

In [1]:
# Build a static DataFrame with 10 columns (no randomness)
import pandas as pd

# Schema of the sample frame: ten columns, declared once and reused below.
columns = ["id", "name", "age", "Category", "Gender", "city", "signup_year", "is_active", "score", "tier"]

# Ten hand-written records, one list per row, in the same order as `columns`.
data = [
    [1, "Alice", 28, "Good", "Female", "New York", 2021, True, 88.5, "Silver"],
    [2, "Bob", 34, "Better", "Male", "Chicago", 2020, False, 75.0, "Gold"],
    [3, "Cara", 22, "Best", "Female", "Austin", 2022, True, 92.3, "Platinum"],
    [4, "Dan", 29, "Good", "Male", "Seattle", 2019, True, 66.7, "Silver"],
    [5, "Eve", 31, "Better", "Female", "Miami", 2021, False, 81.2, "Gold"],
    [6, "Frank", 45, "Best", "Male", "Denver", 2018, True, 90.0, "Platinum"],
    [7, "Gina", 27, "Good", "Female", "Boston", 2020, True, 70.4, "Silver"],
    [8, "Hank", 39, "Better", "Male", "Portland", 2019, False, 78.9, "Gold"],
    [9, "Ivy", 26, "Best", "Female", "San Jose", 2022, True, 95.1, "Platinum"],
    [10, "Jake", 33, "Good", "Male", "Dallas", 2021, True, 73.5, "Silver"],
]

# Explicit categorical dtypes pin the level order, so the dummy columns created
# later always come out in this order regardless of the order of the rows.
# Category is ordered (Good < Better < Best); Gender is unordered.
cat_type = pd.CategoricalDtype(categories=["Good", "Better", "Best"], ordered=True)
gender_type = pd.CategoricalDtype(categories=["Male", "Female"], ordered=False)

# Build the frame and cast both categorical columns in a single step.
df = pd.DataFrame(data, columns=columns).astype(
    {"Category": cat_type, "Gender": gender_type}
)

df.head()

Unnamed: 0,id,name,age,Category,Gender,city,signup_year,is_active,score,tier
0,1,Alice,28,Good,Female,New York,2021,True,88.5,Silver
1,2,Bob,34,Better,Male,Chicago,2020,False,75.0,Gold
2,3,Cara,22,Best,Female,Austin,2022,True,92.3,Platinum
3,4,Dan,29,Good,Male,Seattle,2019,True,66.7,Silver
4,5,Eve,31,Better,Female,Miami,2021,False,81.2,Gold


In [2]:
# One-hot encode the two categorical features.
# drop_first stays False so every declared level gets its own indicator column.
encode_cols = ["Category", "Gender"]
encoded = pd.get_dummies(
    df,
    columns=encode_cols,
    prefix=encode_cols,
    prefix_sep="_",
    drop_first=False,
    dtype=int,
)

# Preferred display order: untouched columns first, then the indicator columns
# grouped by feature (optional; purely cosmetic).
ordered_cols = [
    "id", "name", "age", "city", "signup_year", "is_active", "score", "tier",
    "Category_Good", "Category_Better", "Category_Best",
    "Gender_Male", "Gender_Female",
]

# Keep only the preferred names that actually exist, then append any remaining
# columns in their current order so the reorder never drops a column.
final_cols = [col for col in ordered_cols if col in encoded.columns]
final_cols += [col for col in encoded.columns if col not in ordered_cols]

encoded = encoded.loc[:, final_cols]
encoded.head()

Unnamed: 0,id,name,age,city,signup_year,is_active,score,tier,Category_Good,Category_Better,Category_Best,Gender_Male,Gender_Female
0,1,Alice,28,New York,2021,True,88.5,Silver,1,0,0,0,1
1,2,Bob,34,Chicago,2020,False,75.0,Gold,0,1,0,1,0
2,3,Cara,22,Austin,2022,True,92.3,Platinum,0,0,1,0,1
3,4,Dan,29,Seattle,2019,True,66.7,Silver,1,0,0,1,0
4,5,Eve,31,Miami,2021,False,81.2,Gold,0,1,0,0,1


In [3]:
# Validate expected one-hot columns and category levels.
# The diagnostics are printed first so the cell output shows what was found,
# then asserted so that a full re-run stops here if the encoding is wrong
# instead of carrying on with a malformed frame.
expected_cols = {
    "Category_Good", "Category_Better", "Category_Best",
    "Gender_Male", "Gender_Female",
}
missing_cols = expected_cols - set(encoded.columns)
# Anything that is neither an expected indicator nor a column of the original frame.
extra_cols = set(encoded.columns) - expected_cols - set(df.columns)
print("Missing expected columns:", missing_cols)
print("Extra encoded columns:", extra_cols)

print("Category levels:", list(df["Category"].cat.categories))
print("Gender levels:", list(df["Gender"].cat.categories))

assert not missing_cols, f"missing one-hot columns: {sorted(missing_cols)}"
assert not extra_cols, f"unexpected encoded columns: {sorted(extra_cols)}"

# Each row must have exactly one active level per encoded feature. A value
# outside the declared categories is turned into NaN by the categorical cast,
# and get_dummies then emits an all-zero row for it; this check catches that.
for feature_prefix in ("Category_", "Gender_"):
    level_cols = sorted(c for c in expected_cols if c.startswith(feature_prefix))
    assert encoded[level_cols].sum(axis=1).eq(1).all(), (
        f"rows without exactly one active {feature_prefix}* level"
    )

Missing expected columns: set()
Extra encoded columns: set()
Category levels: ['Good', 'Better', 'Best']
Gender levels: ['Male', 'Female']
