In [70]:
#implement label encoding 

import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

In [71]:
df = pd.read_csv('datasets/data-example7.csv')

In [72]:
df.describe(include='all')

Unnamed: 0,x1,x2,x3,x4,x5,x6
count,9,9.0,9,9,9.0,9.0
unique,2,,3,3,,
top,male,,high,full-time,,
freq,5,,4,4,,
mean,,6.222222,,,8.0,3.444444
std,,6.457124,,,13.518506,2.242271
min,,1.0,,,3.0,0.0
25%,,2.0,,,3.0,2.0
50%,,5.0,,,3.0,4.0
75%,,5.0,,,4.0,5.0


In [73]:
df.select_dtypes(include=['object', 'category'])

Unnamed: 0,x1,x3,x4
0,male,low,full-time
1,male,med,part-time
2,female,high,full-time
3,male,high,unemployed
4,female,low,part-time
5,male,low,unemployed
6,male,med,full-time
7,female,high,part-time
8,female,high,full-time


In [74]:
# Identify categorical columns
"""
Step 2: Identify Categorical Features
Objective:
Determine which columns are of categorical type (i.e., object or category) and thus may require encoding.

Actions:

Select Categorical Columns: Use df.select_dtypes(include=['object', 'category']).
Review Unique Values: For each categorical column, check the number of unique values to assess the cardinality.
"""

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("\nCategorical Columns:", categorical_cols)

# Check the unique values for each categorical column
for col in categorical_cols:
    unique_count = df[col].nunique()
    print(f"Column '{col}' has {unique_count} unique values.")



Categorical Columns: ['x1', 'x3', 'x4']
Column 'x1' has 2 unique values.
Column 'x3' has 3 unique values.
Column 'x4' has 3 unique values.


In [75]:
"""
Step 3: Check Cardinality & Domain Knowledge
Objective:
Assess whether a categorical column has low, moderate, or high cardinality, and determine if it’s nominal or ordinal.

Actions:

Low Cardinality: If a column has few unique values (e.g., “Gender”, “Status”), one-hot or label encoding might be appropriate.
High Cardinality: If a column has many unique values (e.g., “Zip Codes”, “Product IDs”), consider methods like frequency encoding, hashing, or binary encoding.
Ordinal Data: If there’s an inherent order (e.g., “Low”, “Medium”, “High”), define an explicit mapping for ordinal encoding."""

# Example of reviewing each categorical column for encoding suitability
for col in categorical_cols:
    unique_count = df[col].nunique()
    # Define a threshold; here, we use 10 as an arbitrary threshold for low cardinality
    if unique_count <= 10:
        print(f"Column '{col}' is a candidate for One-Hot or Label Encoding (Low Cardinality: {unique_count} unique values).")
    else:
        print(f"Column '{col}' might benefit from Frequency, Target, or Hashing Encoding (High Cardinality: {unique_count} unique values).")
        

Column 'x1' is a candidate for One-Hot or Label Encoding (Low Cardinality: 2 unique values).
Column 'x3' is a candidate for One-Hot or Label Encoding (Low Cardinality: 3 unique values).
Column 'x4' is a candidate for One-Hot or Label Encoding (Low Cardinality: 3 unique values).


In [76]:
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,male,1,low,full-time,3,2
1,male,12,med,part-time,4,5
2,female,5,high,full-time,3,6
3,male,1,high,unemployed,44,0
4,female,2,low,part-time,3,1
5,male,4,low,unemployed,5,4
6,male,5,med,full-time,3,2
7,female,21,high,part-time,4,5
8,female,5,high,full-time,3,6


In [77]:
"""Step 4: Check for Missing Values & Inconsistencies
Objective:
Identify missing values and ensure consistency in data formats before applying any encoding.

Actions:

Imputation: Decide whether to fill missing values or drop rows/columns.
Data Cleaning: Standardize values if there are inconsistent formats (e.g., "Yes", "yes", "Y")."""

 # Checking missing values in categorical columns
for col in categorical_cols:
    missing = df[col].isnull().sum()
    if missing > 0:
        print(f"Column '{col}' has {missing} missing values. Consider imputation or creating a new category (e.g., 'Unknown').")


In [78]:
"""Step 5: Decide on the Encoding Strategy
Based on your exploration, choose the encoding method for each categorical feature:

Label Encoding:
Use when the categorical feature is ordinal (or when using tree-based models that aren’t sensitive to ordering issues).

One-Hot Encoding:
Ideal for nominal features with low cardinality to avoid any artificial ordering.

Ordinal Encoding:
Apply when your categorical variable has a natural order. You can define a mapping manually.

Binary Encoding, Frequency Encoding, Target Encoding, Hashing:
Consider these when dealing with high-cardinality features. For instance, frequency encoding replaces each category with its frequency in the dataset, while hashing encoding can reduce dimensionality for very high-cardinality variables."""

'Step 5: Decide on the Encoding Strategy\nBased on your exploration, choose the encoding method for each categorical feature:\n\nLabel Encoding:\nUse when the categorical feature is ordinal (or when using tree-based models that aren’t sensitive to ordering issues).\n\nOne-Hot Encoding:\nIdeal for nominal features with low cardinality to avoid any artificial ordering.\n\nOrdinal Encoding:\nApply when your categorical variable has a natural order. You can define a mapping manually.\n\nBinary Encoding, Frequency Encoding, Target Encoding, Hashing:\nConsider these when dealing with high-cardinality features. For instance, frequency encoding replaces each category with its frequency in the dataset, while hashing encoding can reduce dimensionality for very high-cardinality variables.'

In [79]:
le = LabelEncoder()
df['x3'] = le.fit_transform(df['x3'])

In [80]:
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,male,1,1,full-time,3,2
1,male,12,2,part-time,4,5
2,female,5,0,full-time,3,6
3,male,1,0,unemployed,44,0
4,female,2,1,part-time,3,1
5,male,4,1,unemployed,5,4
6,male,5,2,full-time,3,2
7,female,21,0,part-time,4,5
8,female,5,0,full-time,3,6


In [81]:
df

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,male,1,1,full-time,3,2
1,male,12,2,part-time,4,5
2,female,5,0,full-time,3,6
3,male,1,0,unemployed,44,0
4,female,2,1,part-time,3,1
5,male,4,1,unemployed,5,4
6,male,5,2,full-time,3,2
7,female,21,0,part-time,4,5
8,female,5,0,full-time,3,6


In [82]:
onehotencode = pd.get_dummies(df['x1'] , prefix='category')
df = pd.concat([df , onehotencode] , axis = 1)



In [83]:
df

Unnamed: 0,x1,x2,x3,x4,x5,x6,category_female,category_male
0,male,1,1,full-time,3,2,False,True
1,male,12,2,part-time,4,5,False,True
2,female,5,0,full-time,3,6,True,False
3,male,1,0,unemployed,44,0,False,True
4,female,2,1,part-time,3,1,True,False
5,male,4,1,unemployed,5,4,False,True
6,male,5,2,full-time,3,2,False,True
7,female,21,0,part-time,4,5,True,False
8,female,5,0,full-time,3,6,True,False


In [84]:
df['category_male'] = df['category_male'].astype(int)
df['category_female'] = df['category_female'].astype(int)

In [85]:
df

Unnamed: 0,x1,x2,x3,x4,x5,x6,category_female,category_male
0,male,1,1,full-time,3,2,0,1
1,male,12,2,part-time,4,5,0,1
2,female,5,0,full-time,3,6,1,0
3,male,1,0,unemployed,44,0,0,1
4,female,2,1,part-time,3,1,1,0
5,male,4,1,unemployed,5,4,0,1
6,male,5,2,full-time,3,2,0,1
7,female,21,0,part-time,4,5,1,0
8,female,5,0,full-time,3,6,1,0


In [86]:
df = df.drop(columns='x1')

In [87]:
df

Unnamed: 0,x2,x3,x4,x5,x6,category_female,category_male
0,1,1,full-time,3,2,0,1
1,12,2,part-time,4,5,0,1
2,5,0,full-time,3,6,1,0
3,1,0,unemployed,44,0,0,1
4,2,1,part-time,3,1,1,0
5,4,1,unemployed,5,4,0,1
6,5,2,full-time,3,2,0,1
7,21,0,part-time,4,5,1,0
8,5,0,full-time,3,6,1,0


In [88]:
x4_onehotencode = pd.get_dummies(df['x4'] , prefix='x4')

df =pd.concat([df , x4_onehotencode] , axis = 1)

df.drop(columns=['x4'] , inplace=True)

In [89]:
df

Unnamed: 0,x2,x3,x5,x6,category_female,category_male,x4_full-time,x4_part-time,x4_unemployed
0,1,1,3,2,0,1,True,False,False
1,12,2,4,5,0,1,False,True,False
2,5,0,3,6,1,0,True,False,False
3,1,0,44,0,0,1,False,False,True
4,2,1,3,1,1,0,False,True,False
5,4,1,5,4,0,1,False,False,True
6,5,2,3,2,0,1,True,False,False
7,21,0,4,5,1,0,False,True,False
8,5,0,3,6,1,0,True,False,False
