In [None]:
# Lý do lưu model:
# 1. Không cần train lại mỗi lần dùng
# 2. Deploy model vào production
# 3. Chia sẻ model với team
# 4. Version control models

# 2 thư viện phổ biến:
# - joblib: tối ưu cho models sklearn (nén array numpy tốt)
# - pickle: Python built-in, dùng cho mọi object


In [7]:
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import os

os.makedirs("./models", exist_ok=True)  # Tạo thư mục nếu chưa có

# Train model nhanh
data = fetch_california_housing()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train_scaled, y_train)

# Lưu model
joblib.dump(model, './models/house_price_model.pkl')

# Lưu scaler (quan trọng!)
joblib.dump(scaler, './models/scaler.pkl')

print("✅ Model và scaler đã lưu!")


✅ Model và scaler đã lưu!


In [8]:
# Load model
loaded_model = joblib.load('./models/house_price_model.pkl')
loaded_scaler = joblib.load('./models/scaler.pkl')

# Predict trên test data
X_test_scaled = loaded_scaler.transform(X_test)
predictions = loaded_model.predict(X_test_scaled)

print("First 5 predictions:", predictions[:5])
print("First 5 actual:", y_test[:5])

# Đánh giá
from sklearn.metrics import r2_score
r2 = r2_score(y_test, predictions)
print(f"R² score: {r2:.4f}")


First 5 predictions: [0.56803151 0.81136037 4.83575176 2.45484476 2.04452869]
First 5 actual: [0.477   0.458   5.00001 2.186   2.78   ]
R² score: 0.7739


In [9]:
import numpy as np

# Giả lập 1 nhà mới cần dự đoán giá
# Features: [MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude]
new_house = np.array([[
    8.3252,    # MedInc: thu nhập trung vị
    41.0,      # HouseAge: tuổi nhà
    6.98,      # AveRooms: số phòng trung bình
    1.02,      # AveBedrms: phòng ngủ trung bình
    322.0,     # Population
    2.55,      # AveOccup
    37.88,     # Latitude
    -122.23    # Longitude
]])

# Scale và predict
new_house_scaled = loaded_scaler.transform(new_house)
predicted_price = loaded_model.predict(new_house_scaled)

print(f"Predicted house price: ${predicted_price[0] * 100000:,.0f}")


Predicted house price: $434,876


In [10]:
from sklearn.pipeline import Pipeline

# Tạo pipeline hoàn chỉnh
full_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
])

# Train
full_pipeline.fit(X_train, y_train)

# Lưu 1 file duy nhất
joblib.dump(full_pipeline, './models/full_pipeline.pkl')

# Load và dùng
loaded_pipeline = joblib.load('./models/full_pipeline.pkl')
predictions_pipeline = loaded_pipeline.predict(X_test)  # không cần scale thủ công!

print("✅ Pipeline saved and loaded successfully!")
print(f"R² from pipeline: {r2_score(y_test, predictions_pipeline):.4f}")


✅ Pipeline saved and loaded successfully!
R² from pipeline: 0.7739


In [11]:
import pickle
import os

# Lưu bằng pickle
with open('models/model_pickle.pkl', 'wb') as f:
    pickle.dump(model, f)

# Lưu bằng joblib
joblib.dump(model, 'models/model_joblib.pkl')

# So sánh kích thước file
size_pickle = os.path.getsize('models/model_pickle.pkl')
size_joblib = os.path.getsize('models/model_joblib.pkl')

print(f"Pickle size: {size_pickle:,} bytes")
print(f"Joblib size: {size_joblib:,} bytes")
print(f"Joblib nhỏ hơn: {(1 - size_joblib/size_pickle)*100:.1f}%")


Pickle size: 10,211,903 bytes
Joblib size: 10,219,777 bytes
Joblib nhỏ hơn: -0.1%


In [12]:
# Best practice: thêm metadata
import datetime

model_metadata = {
    'model_type': 'RandomForestRegressor',
    'train_date': str(datetime.datetime.now()),
    'train_samples': X_train.shape[0],
    'features': list(data.feature_names),
    'performance': {
        'train_r2': model.score(X_train_scaled, y_train),
        'test_r2': r2_score(y_test, predictions)
    }
}

# Lưu model + metadata
joblib.dump({
    'model': model,
    'scaler': scaler,
    'metadata': model_metadata
}, 'models/model_v1.0.pkl')

# Load
loaded = joblib.load('models/model_v1.0.pkl')
print("Model metadata:")
print(loaded['metadata'])


Model metadata:
{'model_type': 'RandomForestRegressor', 'train_date': '2025-12-07 16:05:30.716803', 'train_samples': 16512, 'features': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'performance': {'train_r2': 0.8719225011198944, 'test_r2': 0.7738887441938533}}
