# Introduction to Feature Engineering with a Simulated Dataset

In [1]:

import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate random categorical variables
categories = ['Category1', 'Category2', 'Category3']
categorical_data = np.random.choice(categories, size=100, replace=True)

# Generate random numerical variables
numerical_data = np.random.normal(loc=0, scale=1, size=100)

# Generate random missing values
missing_mask = np.random.choice([True, False], size=100, p=[0.2, 0.8])
numerical_data[missing_mask] = np.nan

# Create a DataFrame
data = pd.DataFrame({
    'Category': categorical_data,
    'Numeric': numerical_data
})

# Print the DataFrame
print(data.head())


    Category   Numeric
0  Category3  0.582123
1  Category1       NaN
2  Category3  0.894332
3  Category3  0.754998
4  Category1 -0.207166


## 1. Encoding Categorical Variables

In [2]:

# Convert categorical variables into numerical representations
df_encoded = pd.get_dummies(data, columns=['Category'])

# Print the encoded DataFrame
print(df_encoded.head())


    Numeric  Category_Category1  Category_Category2  Category_Category3
0  0.582123               False               False                True
1       NaN                True               False               False
2  0.894332               False               False                True
3  0.754998               False               False                True
4 -0.207166                True               False               False


## 2. Handling Missing Values

In [3]:

# Replace missing values with the mean
df_filled = df_encoded.fillna(df_encoded.mean())

# Print the DataFrame after handling missing values
print(df_filled.head())


    Numeric  Category_Category1  Category_Category2  Category_Category3
0  0.582123               False               False                True
1 -0.014239                True               False               False
2  0.894332               False               False                True
3  0.754998               False               False                True
4 -0.207166                True               False               False


## 3. Creating New Features

In [4]:

# Create new derived features
df_filled['Feature1_squared'] = df_filled['Numeric']**2
df_filled['Feature2_cubed'] = df_filled['Numeric']**3

# Calculate log of numeric values, handling negative or zero values
df_filled['Feature3_log'] = np.log(df_filled['Numeric'].replace({0: np.nan, -np.inf: np.nan}))

# Print the updated DataFrame
print(df_filled.head())


    Numeric  Category_Category1  Category_Category2  Category_Category3  \
0  0.582123               False               False                True   
1 -0.014239                True               False               False   
2  0.894332               False               False                True   
3  0.754998               False               False                True   
4 -0.207166                True               False               False   

   Feature1_squared  Feature2_cubed  Feature3_log  
0          0.338867        0.197262     -0.541074  
1          0.000203       -0.000003           NaN  
2          0.799830        0.715314     -0.111678  
3          0.570022        0.430365     -0.281040  
4          0.042918       -0.008891           NaN  


  result = getattr(ufunc, method)(*inputs, **kwargs)
