# Feature Engineering

## Lesson 1: Understanding Feature Engineering

## Lesson 2: Data Preprocessing (Handling Missing Data)

In [1]:
import pandas as pd

# Load dataset
dft = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

# Check missing values
print(dft.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [2]:
#fillna function

In [3]:
dft['Age'].fillna(dft['Age'].median(), inplace=True)
print(dft['Age'].isnull().sum())  # Should print 0


0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dft['Age'].fillna(dft['Age'].median(), inplace=True)


## Lesson 3: Encoding Categorical Variables

In [4]:
dft['Sex'] = dft['Sex'].map({'male': 0, 'female': 1})
print(dft[['Sex']].head())


   Sex
0    0
1    1
2    1
3    1
4    0


 Using map() with a Function

In [5]:
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3, 4]})
df["A"] = df["A"].map(lambda x: x * 2)  # Multiply each value by 2
print(df)


   A
0  2
1  4
2  6
3  8


 Using map() with a Dictionary

In [6]:
df = pd.DataFrame({"Gender": ["M", "F", "M", "F"]})
df["Gender"] = df["Gender"].map({"M": "Male", "F": "Female"})
print(df)


   Gender
0    Male
1  Female
2    Male
3  Female


Using map() with a Scalar

In [7]:
df

Unnamed: 0,Gender
0,Male
1,Female
2,Male
3,Female


In [8]:
df = pd.DataFrame({"A": [1, 2, 3, 4]})
df['A'] = df['A'].map(lambda x : 10)
print(df)


    A
0  10
1  10
2  10
3  10


In [9]:
df

Unnamed: 0,A
0,10
1,10
2,10
3,10


In [10]:
dft.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
dft.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [12]:
dft = pd.get_dummies(dft, columns=['Embarked'])
print(dft.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare Cabin  Embarked_C  Embarked_Q  Embarked_S  
0         A/5 21171   7.2500   NaN       False       False        True  
1          PC 17599  71.2833   C85        True       False       False  
2  STON/O2. 3101282   7.9250   NaN       False       False        True  
3   

In [13]:
dft.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,False,False,True


One hot coding 

In [14]:
import pandas as pd

# Sample dataset
data = {'Color': ['Red', 'Green', 'Blue', 'Green', 'Red']}

# Create DataFrame
df = pd.DataFrame(data)

# Apply one-hot encoding
one_hot = pd.get_dummies(df, columns=['Color'])

print(one_hot)


   Color_Blue  Color_Green  Color_Red
0       False        False       True
1       False         True      False
2        True        False      False
3       False         True      False
4       False        False       True


In [15]:
#get_dummies function 

In [16]:
#using the sklearn lib

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
'''creates an instance of OneHotEncoder.
sparse=False: Ensures the output is a dense NumPy array instead of a sparse matrix.'''
encoder = OneHotEncoder(sparse_output=False)

In [19]:
# Transform categorical data
encoded = encoder.fit_transform(df[['Color']])

'''
fit_transform(df[['Color']]):
fit() learns the unique categories from the "Color" column.
transform() converts them into one-hot encoded format.
The result is a NumPy array where each row represents a category as a binary vector.
'''
print("Pass")

Pass


In [20]:
# Convert to DataFrame
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['Color']))
'''
converts the NumPy array into a pandas DataFrame.
encoder.get_feature_names_out(['Color']):
Generates proper column names like Color_Blue, Color_Green, etc.
'''
print(encoded_df)


   Color_Blue  Color_Green  Color_Red
0         0.0          0.0        1.0
1         0.0          1.0        0.0
2         1.0          0.0        0.0
3         0.0          1.0        0.0
4         0.0          0.0        1.0


## Lesson 4: Feature Scaling

![Formula](Formula.png) 


In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Sample Data
data = {'Age': [20, 30, 40, 50, 60], 'Salary': [30000, 50000, 70000, 90000, 110000]}
df = pd.DataFrame(data)

In [23]:
# Apply Min-Max Scaling
scaler = MinMaxScaler() #MinMaxScaler obj 'scaler' created 
scaled_data = scaler.fit_transform(df) #using the fit_transform() function on df with the scale obj

In [24]:
# Convert back to DataFrame
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
print(scaled_df)


    Age  Salary
0  0.00    0.00
1  0.25    0.25
2  0.50    0.50
3  0.75    0.75
4  1.00    1.00


![Formula](Formula2.png) 


In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
# Apply Standardization
scaler = StandardScaler()
standardized_data = scaler.fit_transform(df)


In [27]:

# Convert back to DataFrame
standardized_df = pd.DataFrame(standardized_data, columns=df.columns)
print(standardized_df)


        Age    Salary
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


3️⃣  Robust Scaling (Median-based Scaling)

![Formula3.png](attachment:c6f9eb91-3fdf-49b7-b5ee-64fb1cb4ee9e.png)

In [28]:
from sklearn.preprocessing import RobustScaler

# Apply Robust Scaling
scaler = RobustScaler()
robust_scaled_data = scaler.fit_transform(df)

# Convert back to DataFrame
robust_scaled_df = pd.DataFrame(robust_scaled_data, columns=df.columns)
print(robust_scaled_df)


   Age  Salary
0 -1.0    -1.0
1 -0.5    -0.5
2  0.0     0.0
3  0.5     0.5
4  1.0     1.0


![Formula4.png](attachment:1686cf6c-8639-465b-be91-e727b14405f6.png)

In [29]:
from sklearn.preprocessing import MaxAbsScaler

# Apply Max Abs Scaling
scaler = MaxAbsScaler()
maxabs_scaled_data = scaler.fit_transform(df)

# Convert back to DataFrame
maxabs_scaled_df = pd.DataFrame(maxabs_scaled_data, columns=df.columns)
print(maxabs_scaled_df)


        Age    Salary
0  0.333333  0.272727
1  0.500000  0.454545
2  0.666667  0.636364
3  0.833333  0.818182
4  1.000000  1.000000


![image1.png](attachment:720afce5-0124-4eea-b1e0-b6ba460df5b1.png)

In [30]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-Max Scaling
scaler = MinMaxScaler()
dft[['Age', 'Fare']] = scaler.fit_transform(dft[['Age', 'Fare']])

# Standardization
scaler = StandardScaler()
dft[['Age', 'Fare']] = scaler.fit_transform(dft[['Age', 'Fare']])


In [31]:
dft.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,-0.565736,1,0,A/5 21171,-0.502445,,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.663861,1,0,PC 17599,0.786845,C85,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",1,-0.258337,0,0,STON/O2. 3101282,-0.488854,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.433312,1,0,113803,0.42073,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",0,0.433312,0,0,373450,-0.486337,,False,False,True


## Lesson 5: Feature Creation (New Features from Existing Data)

In [32]:
dft['FamilySize'] = dft['SibSp'] + dft['Parch'] + 1
print(dft[['FamilySize']].head())


   FamilySize
0           2
1           2
2           1
3           2
4           1


## Lesson 6: Dimensionality Reduction (PCA, LDA)

In [33]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
reduced_features = pca.fit_transform(dft[['Age', 'Fare', 'FamilySize']])
print(reduced_features[:5])


[[ 0.11825055 -0.75253139]
 [ 0.09623265  1.02894653]
 [-0.90446456 -0.54294622]
 [ 0.0778821   0.60437115]
 [-1.04747036 -0.06702416]]


Dimensionality reduction is a technique used to reduce the number of input features (dimensions) while preserving important information. It helps in:
✅ Reducing computation time
✅ Removing noise and redundancy
✅ Avoiding the curse of dimensionality
✅ Improving model performance