# Feature Engineering
Transforming categorical data into numerical format and standardizing the numeric features.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Confirmation message: reaching this line means the imports above succeeded
print("Libraries loaded successfully.")



## Load Cleaned Data
Loading the cleaned dataset and previewing its first rows before feature engineering.


In [None]:
# Read the cleaned training CSV from the data directory one level up
data = pd.read_csv(filepath_or_buffer="../data/final_cleaned_train.csv")

# Announce the preview; the trailing expression is what the notebook renders
print("Loaded cleaned dataset:")
data.head(n=5)


## Encoding Categorical Variables
Categorical variables need to be converted into numerical format before model training.
- One-hot encoding is applied to categorical features.
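- `drop_first=True` drops the first level of each encoded feature, which avoids perfect multicollinearity among the indicator columns (the "dummy variable trap").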

In [None]:
# Identify categorical columns.
# Besides plain object columns, also pick up pandas' dedicated string and
# category dtypes so that no text-like column slips through unencoded
# (e.g. when the CSV loader infers a string dtype instead of object).
categorical_cols = data.select_dtypes(
    include=['object', 'string', 'category']
).columns

# Apply one-hot encoding; drop_first=True drops the first level of each
# encoded column so the remaining indicators are not perfectly collinear.
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Display dataset info after encoding
print(f"Feature Engineering: Categorical variables encoded. Data now has {data.shape[1]} features.")


## Feature Scaling
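Standardizing numeric variables puts them on a comparable scale, so that no feature dominates a model simply because of its units.
- StandardScaler is applied to rescale each numeric column to zero mean and unit variance.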



In [None]:
# Pick out the numeric columns; only these are passed to the scaler
numeric_cols = data.select_dtypes(include='number').columns
# NOTE(review): every numeric column is selected here, so an ID or target
# column (if present and numeric) is standardized too -- confirm that is intended.

# Fit a fresh StandardScaler on those columns and write the scaled values back
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

print("Feature Engineering: Applied standard scaling.")


## Save Processed Dataset
The final processed dataset is saved for use in model training.


In [None]:
# Persist the engineered features as CSV; index=False keeps the row index
# out of the written file
data.to_csv(path_or_buf="../data/processed_train.csv", index=False)

print("Feature Engineering complete. Processed dataset saved successfully!")
