In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import numpy as np

# Fetch the California housing dataset and separate features from the target.
data = fetch_california_housing()
X = data.data
y = data.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize (goal: mean=0, std=1 per feature). The scaler is fitted on the
# training rows only; the test rows are transformed with that same fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the mean and std of the first three feature columns of the training
# data, before and after scaling.
for heading, features in (
    ("Before scaling:", X_train),
    ("\nAfter scaling:", X_train_scaled),
):
    print(heading)
    print("Mean:", features.mean(axis=0)[:3])
    print("Std:", features.std(axis=0)[:3])


array([[-1.15508475, -0.28632369, -0.52068576, ...,  0.06740798,
         0.1951    ,  0.28534728],
       [-0.70865905,  0.11043502, -0.16581537, ..., -0.03602975,
        -0.23549054,  0.06097472],
       [-0.21040155,  1.85617335, -0.61076476, ..., -0.14998876,
         1.00947776, -1.42487026],
       ...,
       [ 2.80902421, -0.28632369,  0.75501156, ..., -0.02646898,
         0.78014149, -1.23041404],
       [-0.57542978,  0.58654547, -0.06124296, ..., -0.04390537,
         0.52740357, -0.08860699],
       [-0.17259111, -0.92113763, -0.6058703 , ...,  0.05466644,
        -0.66608108,  0.60445493]], shape=(4128, 8))

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: fit on the training rows only, then apply the fitted
# scaler to both the training and the test rows.
# Note: this rebinds `scaler`, replacing whatever it referred to before this cell.
scaler = MinMaxScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Per-feature extremes of the scaled training data (first three columns).
train_min = X_train_normalized.min(axis=0)
train_max = X_train_normalized.max(axis=0)
print("Min:", train_min[:3])
print("Max:", train_max[:3])


Min: [0. 0. 0.]
Max: [1. 1. 1.]


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode string class labels as integer codes.
labels = ['cat', 'dog', 'cat', 'bird', 'dog']
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# Show the raw labels, their integer codes, and the classes the encoder learned.
for caption, value in (
    ("Original:", labels),
    ("Encoded:", y_encoded),
    ("Classes:", le.classes_),
):
    print(caption, value)


Original: ['cat', 'dog', 'cat', 'bird', 'dog']
Encoded: [1 2 1 0 2]
Classes: ['bird' 'cat' 'dog']


In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Sample data: a small frame with two categorical columns.
colors = ['red', 'blue', 'green', 'red']
sizes = ['S', 'M', 'L', 'M']
df = pd.DataFrame({'color': colors, 'size': sizes})

# One-hot encode both columns; sparse_output=False asks for a dense array.
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(df[['color', 'size']])

# Inspect the encoded shape, the generated column names, and the matrix itself.
feature_names = encoder.get_feature_names_out()
print("Encoded shape:", X_encoded.shape)
print("Feature names:", feature_names)
print("\nEncoded:\n", X_encoded)


Encoded shape: (4, 6)
Feature names: ['color_blue' 'color_green' 'color_red' 'size_L' 'size_M' 'size_S']

Encoded:
 [[0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0.]
 [0. 0. 1. 0. 1. 0.]]


In [9]:
from sklearn.model_selection import train_test_split

# Compare two train/test ratios (80-20, then 70-30) using the same seed.
# Each iteration rebinds X_train / X_test / y_train / y_test, so once the loop
# finishes these names hold the LAST split listed here (70-30).
for caption, test_fraction in (("80-20:", 0.2), ("70-30:", 0.3)):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=42
    )
    print(caption, X_train.shape, X_test.shape)

# Stratified split (use when classes are imbalanced)
# train_test_split(..., stratify=y)


80-20: (16512, 8) (4128, 8)
70-30: (14448, 8) (6192, 8)


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Pipeline: scale -> train, chained into a single estimator.
steps = [
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps)

# Fitting the pipeline fits the scaler first, then fits the model.
pipeline.fit(X_train, y_train)

# Predicting transforms the inputs with the fitted scaler, then predicts.
y_pred = pipeline.predict(X_test)

# Score the fitted pipeline on the held-out rows.
test_r2 = pipeline.score(X_test, y_test)
print("R² score:", test_r2)


R² score: 0.5957702326061665
