In [None]:
# Feature engineering = tạo features mới từ features cũ
# để model "thấy" pattern rõ hơn

# Ví dụ:
# - Feature gốc: diện tích nhà, số phòng
# - Feature mới: diện tích/phòng (m² mỗi phòng)
#   → có thể dự đoán giá tốt hơn


In [2]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Data mẫu: 1 feature
X_simple = np.array([[1], [2], [3]])

# Tạo polynomial degree 2
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_simple)

print("Original X:\n", X_simple)
print("\nPolynomial features (degree=2):\n", X_poly)
print("Feature names:", poly.get_feature_names_out())


Original X:
 [[1]
 [2]
 [3]]

Polynomial features (degree=2):
 [[1. 1. 1.]
 [1. 2. 4.]
 [1. 3. 9.]]
Feature names: ['1' 'x0' 'x0^2']


In [3]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
X, y = data.data, data.target

# Chọn top 5 features quan trọng nhất
selector = SelectKBest(score_func=f_regression, k=5)
X_selected = selector.fit_transform(X, y)

print("Original features:", X.shape[1])
print("Selected features:", X_selected.shape[1])
print("Selected feature indices:", selector.get_support(indices=True))
print("Feature scores:", selector.scores_)


Original features: 8
Selected features: 5
Selected feature indices: [0 1 2 3 6]
Feature scores: [1.85565716e+04 2.32841479e+02 4.87757462e+02 4.51085756e+01
 1.25474103e+01 1.16353421e+01 4.38005453e+02 4.36989761e+01]


In [4]:
from sklearn.impute import SimpleImputer
import numpy as np

# Data có missing (NaN)
X_missing = np.array([
    [1, 2],
    [np.nan, 3],
    [7, 6],
    [4, np.nan]
])

# Impute bằng mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_missing)

print("Before imputation:\n", X_missing)
print("\nAfter imputation (mean):\n", X_imputed)


Before imputation:
 [[ 1.  2.]
 [nan  3.]
 [ 7.  6.]
 [ 4. nan]]

After imputation (mean):
 [[1.         2.        ]
 [4.         3.        ]
 [7.         6.        ]
 [4.         3.66666667]]


In [5]:
import pandas as pd

# Load California Housing
data = fetch_california_housing(as_frame=True)
df = data.frame

# Feature 1: Rooms per household
df['RoomsPerHousehold'] = df['AveRooms'] * df['AveOccup']

# Feature 2: Bedrooms per room ratio
df['BedroomsPerRoom'] = df['AveBedrms'] / df['AveRooms']

# Feature 3: Population density
df['PopulationDensity'] = df['Population'] / df['AveOccup']

print(df[['RoomsPerHousehold', 'BedroomsPerRoom', 'PopulationDensity']].head())


   RoomsPerHousehold  BedroomsPerRoom  PopulationDensity
0          17.848325         0.146591              126.0
1          13.161483         0.155797             1138.0
2          23.225510         0.129516              177.0
3          14.822293         0.184458              219.0
4          13.703657         0.172096              259.0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Tạo X mới (features gốc + features mới)
X_new = df.drop(columns=['MedHouseVal']).values
y_new = df['MedHouseVal'].values

X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

# Scale và train
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("R² with new features:", r2_score(y_test, y_pred))


R² with new features: 0.6305323199048851
