<a href="https://colab.research.google.com/github/JamshedAli18/Feature-Engineering-Techniques/blob/main/basic_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures



In [5]:
# Load the California housing dataset and wrap its feature matrix in a
# DataFrame whose columns carry the dataset's own feature names.
california = fetch_california_housing()
feature_matrix, feature_names = california.data, california.feature_names
df = pd.DataFrame(data=feature_matrix, columns=feature_names)


In [6]:
# Preview the first rows of the freshly loaded feature frame.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [7]:
# 1. Handling Missing Values
# Blank out rows 5-9 of the first two columns to simulate missing data, then
# fill every NaN in those columns with the column mean via SimpleImputer.
first_two_cols = slice(0, 2)
df.iloc[5:10, first_two_cols] = np.nan  # Introduce some missing values
imputer = SimpleImputer(strategy="mean")
imputed_block = imputer.fit_transform(df.iloc[:, first_two_cols])
df.iloc[:, first_two_cols] = imputed_block



In [8]:
# 2. Scaling and Normalization
# Rescale only the original numeric features with MinMaxScaler. Selecting the
# columns explicitly (instead of passing all of `df`) keeps this cell
# re-runnable after later cells add derived columns such as the categorical
# 'Binned_MEDINC', whose string labels MinMaxScaler cannot convert to float.
feature_cols = list(california.feature_names)
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index,  # keep row labels aligned with `df`
)



In [10]:
# Preview the first rows of the min-max scaled features.
df_scaled.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.539668,0.784314,0.043512,0.020469,0.008941,0.001499,0.567481,0.211155
1,0.538027,0.392157,0.038224,0.018929,0.06721,0.001141,0.565356,0.212151
2,0.466028,1.0,0.052756,0.02194,0.013818,0.001698,0.564293,0.210159
3,0.354699,1.0,0.035241,0.021929,0.015555,0.001493,0.564293,0.209163
4,0.230776,1.0,0.038534,0.022166,0.015752,0.001198,0.564293,0.209163


In [12]:

# 3. Binning
# Cut MedInc into as many intervals as there are labels (three) and store the
# resulting category for each row in a new 'Binned_MEDINC' column.
income_labels = ["Low", "Medium", "High"]
binned_income = pd.cut(df['MedInc'], bins=len(income_labels), labels=income_labels)
df['Binned_MEDINC'] = binned_income



In [13]:
# Display the frame with the new 'Binned_MEDINC' column (pandas elides the middle rows).
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Binned_MEDINC
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,Medium
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,Medium
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,Medium
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,Medium
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,Low
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,Low
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,Low
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,Low
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,Low


In [14]:
# 4. Log Transform
# Store log(1 + MedInc) in a new 'Log_MedInc' column; the original MedInc
# column is left unchanged.
median_income = df['MedInc']
df['Log_MedInc'] = np.log1p(median_income)



In [15]:
# Preview the first rows, now including the 'Log_MedInc' column.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Binned_MEDINC,Log_MedInc
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,Medium,2.23272
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,Medium,2.230165
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,Medium,2.11111
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,Medium,1.893579
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,Low,1.578195


In [17]:
# Final status message for the notebook.
print("Basic Feature Engineering completed!")

Basic Feature Engineering completed!
