In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Data Representation and Feature Engineering

- **Challenge**: Many machine learning algorithms require numerical input, and categorical variables are non-numeric.
- **Solution**: Represent categorical variables numerically through encoding techniques.

### Methods and Examples:

**Ordinal Encoding:**

- Method: Assign integer values to categories based on their order or ranking.
- Example: Education levels (low, medium, high) mapped to (1, 2, 3).

**One-Hot Encoding:**

- Method: Create one binary column per category, where 1 marks the category's presence and 0 its absence.
- Example: Gender categories (male, female) become two binary columns, each holding 0 or 1.

**Label Encoding:**

- Method: Assign a unique integer to each category; unlike ordinal encoding, the integers are identifiers and carry no intended order.
- Example: Days of the week (Monday, Tuesday, ...) mapped to (1, 2, ...).

**Frequency Encoding:**

- Method: Replace each category with how often it occurs in the dataset (its count or relative frequency).
- Example: Encode city names by how frequently each city appears in the dataset.

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Build a small demo DataFrame with a single categorical column.
data = {'Color': ['Red', 'Blue', 'Green', 'Red', 'Green']}
df = pd.DataFrame(data)

# Initialize the OneHotEncoder; the categories are learned when it is fitted.
encoder = OneHotEncoder()

# Fit on the 'Color' column and transform it in one step. The double brackets
# select a 2-D DataFrame, and .toarray() converts the encoder's result into a
# dense NumPy array that can be wrapped in a DataFrame.
encoded_data = encoder.fit_transform(df[['Color']]).toarray()

# Wrap the encoded array in a DataFrame. Column names come from the encoder
# (one per category, prefixed with 'Color_'). Reusing df.index keeps every
# encoded row aligned with its source row, so the result can be joined back
# onto df even when df does not have a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['Color']),
    index=df.index,
)

# Display the result
print(encoded_df)


   Color_Blue  Color_Green  Color_Red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         0.0          0.0        1.0
4         0.0          1.0        0.0
