# Feature Engineering

## Lesson 1: Understanding Feature Engineering

## Lesson 2: Data Preprocessing (Handling Missing Data)

In [None]:
import pandas as pd

# Read the Titanic passenger table directly from its raw GitHub URL
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
dft = pd.read_csv(TITANIC_CSV_URL)

# Count the missing (null) entries in every column
missing_per_column = dft.isnull().sum()
print(missing_per_column)


In [None]:
# fillna(): replace missing (NaN) values with a chosen fill value

In [None]:
# Impute missing ages with the median age.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on dft['Age']: an in-place fillna on a column
# selection is chained assignment, which raises a FutureWarning in pandas 2.x
# and no longer updates dft under Copy-on-Write (the default from pandas 3.0).
median_age = dft['Age'].median()
dft['Age'] = dft['Age'].fillna(median_age)
print(dft['Age'].isnull().sum())  # Should print 0


## Lesson 3: Encoding Categorical Variables

In [None]:
# Encode the two Sex labels as integers. map() yields NaN for any value that is
# not a key of the dict, so re-running this cell on the already-encoded column
# would turn every entry into NaN.
sex_codes = {'male': 0, 'female': 1}
dft['Sex'] = dft['Sex'].map(sex_codes)
print(dft[['Sex']].head())


 Using map() with a Function

In [None]:
import pandas as pd

def double(value):
    """Return twice the given value."""
    return value * 2


# Series.map() calls the function once for every element of column "A"
df = pd.DataFrame({"A": [1, 2, 3, 4]})
df["A"] = df["A"].map(double)
print(df)


 Using map() with a Dictionary

In [None]:
# With a dict, map() looks each value up as a key and substitutes the
# matching dict value
gender_labels = {"M": "Male", "F": "Female"}
df = pd.DataFrame({"Gender": ["M", "F", "M", "F"]})
df["Gender"] = df["Gender"].map(gender_labels)
print(df)


Using map() with a Scalar

In [None]:
# Display the current contents of df (a bare last expression is rendered as the cell output)
df

In [None]:
# map() takes a function, a mapping or a Series, so a constant is supplied
# through a function that ignores its argument and always returns 10
df = pd.DataFrame({"A": [1, 2, 3, 4]})
df['A'] = df['A'].map(lambda _unused: 10)
print(df)


In [None]:
# Display df again to confirm every value in column "A" is now the constant
df

In [None]:
# List the column labels of the Titanic frame
dft.columns

In [None]:
# Preview the first rows of the Titanic frame (head() defaults to 5 rows)
dft.head()

In [None]:
# One-hot encode the 'Embarked' column: get_dummies replaces it with one
# indicator column per category, each prefixed with "Embarked_".
# NOTE: dft is overwritten, so running this cell a second time fails because
# the 'Embarked' column no longer exists.
categorical_cols = ['Embarked']
dft = pd.get_dummies(dft, columns=categorical_cols)
print(dft.head())


In [None]:
# Preview the first rows of dft again, this time with rich (HTML) display instead of print()
dft.head()

One-hot encoding

In [None]:
import pandas as pd

# Toy data: a single categorical column holding three distinct colours
color_values = ['Red', 'Green', 'Blue', 'Green', 'Red']
data = {'Color': color_values}
df = pd.DataFrame(data)

# get_dummies swaps 'Color' for one indicator column per colour (Color_<value>)
one_hot = pd.get_dummies(df, columns=['Color'])
print(one_hot)


In [None]:
# pd.get_dummies(): one-hot encodes the listed categorical columns of a DataFrame

In [None]:
# The same one-hot encoding, this time using scikit-learn's OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
'''Create an instance of OneHotEncoder.
sparse_output=False: ensures the output is a dense NumPy array instead of a sparse matrix.'''
# Note: `sparse_output` is the current name of this option; scikit-learn
# releases before 1.2 called it `sparse`.
encoder = OneHotEncoder(sparse_output=False)

In [None]:
# fit_transform() does two things in one call on the single-column frame:
#   - fit() learns the unique categories found in the "Color" column
#   - transform() converts every row into its one-hot encoded form
# The encoder was created with sparse_output=False, so the result is a NumPy
# array in which each row is the binary vector for that row's category.
color_column = df[['Color']]
encoded = encoder.fit_transform(color_column)
print("Pass")

In [None]:
# get_feature_names_out(['Color']) builds the output column labels from the
# input feature name and each learned category (Color_Blue, Color_Green, ...)
feature_names = encoder.get_feature_names_out(['Color'])

# Wrap the encoded NumPy array in a pandas DataFrame that carries those labels
encoded_df = pd.DataFrame(encoded, columns=feature_names)
print(encoded_df)


## Lesson 4: Feature Scaling

![Formula](Formula.png) 


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Toy dataset with two numeric columns on very different scales
data = {
    'Age': [20, 30, 40, 50, 60],
    'Salary': [30000, 50000, 70000, 90000, 110000],
}
df = pd.DataFrame(data)

In [None]:
# Apply Min-Max Scaling
scaler = MinMaxScaler()  # created with defaults, which rescale each column to the range [0, 1]
scaled_data = scaler.fit_transform(df)  # learn each column's min and max from df, then rescale df with them

In [None]:
# Convert back to DataFrame: scaled_data is the scaler's output (a NumPy array
# by default), so the original column labels are re-attached here
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
print(scaled_df)


![Formula](Formula2.png) 


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Apply Standardization: with default settings StandardScaler subtracts each
# column's mean and divides by its standard deviation
scaler = StandardScaler()  # rebinds the name `scaler` to a new StandardScaler object
standardized_data = scaler.fit_transform(df)  # df is still the unscaled Age/Salary sample


In [None]:

# Convert back to DataFrame, re-attaching the original column labels to the
# standardized values
standardized_df = pd.DataFrame(standardized_data, columns=df.columns)
print(standardized_df)


3️⃣  Robust Scaling (Median-based Scaling)

![Formula3.png](attachment:c6f9eb91-3fdf-49b7-b5ee-64fb1cb4ee9e.png)

In [None]:
from sklearn.preprocessing import RobustScaler

# Apply Robust Scaling: with default settings RobustScaler subtracts each
# column's median and divides by its interquartile range (25th-75th percentile),
# which makes the result less sensitive to outliers than mean/std scaling
scaler = RobustScaler()
robust_scaled_data = scaler.fit_transform(df)

# Convert back to DataFrame, re-attaching the original column labels
robust_scaled_df = pd.DataFrame(robust_scaled_data, columns=df.columns)
print(robust_scaled_df)


![Formula4.png](attachment:1686cf6c-8639-465b-be91-e727b14405f6.png)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Apply Max Abs Scaling: MaxAbsScaler divides each column by its largest
# absolute value, so the scaled values fall within [-1, 1]
scaler = MaxAbsScaler()
maxabs_scaled_data = scaler.fit_transform(df)

# Convert back to DataFrame, re-attaching the original column labels
maxabs_scaled_df = pd.DataFrame(maxabs_scaled_data, columns=df.columns)
print(maxabs_scaled_df)


![image1.png](attachment:720afce5-0124-4eea-b1e0-b6ba460df5b1.png)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Columns of the Titanic frame that are rescaled in place below
numeric_cols = ['Age', 'Fare']

# Step 1 - Min-Max scaling: overwrite both columns with their rescaled values
scaler = MinMaxScaler()
dft[numeric_cols] = scaler.fit_transform(dft[numeric_cols])

# Step 2 - Standardization: applied to the already min-max-scaled columns and
# written back to the same place, so only the standardized values remain in dft
scaler = StandardScaler()
dft[numeric_cols] = scaler.fit_transform(dft[numeric_cols])


In [None]:
# Preview the first rows of dft after the scaling step
dft.head()

## Lesson 5: Feature Creation (New Features from Existing Data)

In [None]:
# New feature: number of relatives aboard plus one.
# SibSp / Parch are the dataset's sibling-spouse and parent-child counts (per
# the Titanic data dictionary); the + 1 presumably counts the passenger too.
relatives_aboard = dft['SibSp'] + dft['Parch']
dft['FamilySize'] = relatives_aboard + 1
print(dft[['FamilySize']].head())


## Lesson 6: Dimensionality Reduction (PCA, LDA)

In [None]:
from sklearn.decomposition import PCA

# Project three numeric features onto their first two principal components
pca_features = ['Age', 'Fare', 'FamilySize']
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(dft[pca_features])

# Show the first five projected rows (each row has two components)
print(reduced_features[:5])


Dimensionality reduction is a technique used to reduce the number of input features (dimensions) while preserving important information. It helps in:

- ✅ Reducing computation time
- ✅ Removing noise and redundancy
- ✅ Avoiding the curse of dimensionality
- ✅ Improving model performance