In [14]:
import pandas as pd
import numpy as np

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import lightning as L
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
import optuna

In [17]:
from sklearn.model_selection import KFold

In [18]:
#Try DNN
train_processed = pd.read_csv('train_processed.csv')
test_processed = pd.read_csv('test_processed.csv')
test_id = pd.read_csv('test.csv')['Id']

# define features and target variable
X = train_processed.drop(['SalePrice'], axis=1).to_numpy()
y = train_processed['SalePrice'].to_numpy()  # log transformed
X_test = test_processed.copy().to_numpy()

# split train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# PyTorch 定义数据集类,将特征和目标转换为张量
class HousePriceDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

# 创建 DataLoader
train_dataset = HousePriceDataset(X_train, y_train)
val_dataset = HousePriceDataset(X_val, y_val)
test_dataset = HousePriceDataset(X_test)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=4)

In [None]:
class HousePriceDNN(L.LightningModule):
    def __init__(self, input_dim, hidden_dims=[2048, 1024, 512, 256, 128, 64, 32], dropout=0.2):
        super(HousePriceDNN, self).__init__()
        layers = []
        prev_dim = input_dim
        for dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, dim),
                nn.ReLU(),
                nn.BatchNorm1d(dim),
                nn.Dropout(dropout)
            ])
            prev_dim = dim
        layers.append(nn.Linear(prev_dim, 1))  # 回归输出层
        self.model = nn.Sequential(*layers)
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.model(x).squeeze()

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        rmse = torch.sqrt(loss)
        self.log('val_rmse', rmse, prog_bar=True)
        return rmse

    def test_step(self, batch, batch_idx):
        x = batch
        y_pred = self(x)
        self.predictions.append(y_pred.cpu().numpy())

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_rmse'
            }
        }

    def on_test_epoch_start(self):
        self.predictions = []

# 初始化模型
input_dim = X_train.shape[1]
model = HousePriceDNN(input_dim)

# 设置训练器
trainer = L.Trainer(
    max_epochs=200,
    accelerator='auto',  # 自动选择 GPU/CPU
    callbacks=[
        EarlyStopping(monitor='val_rmse', patience=20, mode='min'),
        ModelCheckpoint(monitor='val_rmse', save_top_k=1, mode='min')
    ]
)

# 训练模型
trainer.fit(model, train_loader, val_loader)

# 测试集预测
trainer.test(model, test_loader)
test_pred = np.concatenate(model.predictions)
test_pred = np.expm1(test_pred)  # 逆对数变换

# 生成提交文件
submission = pd.DataFrame({'Id': test_id, 'SalePrice': test_pred})
submission.to_csv('submission_dnn.csv', index=False)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 1.9 M  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
1.9 M     Trainable params
0         Non-trainable params
1.9 M     Total params
7.405     Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


Epoch 80: 100%|██████████| 37/37 [00:00<00:00, 44.87it/s, v_num=55, train_loss=1.650, val_rmse=0.220]
Testing DataLoader 0: 100%|██████████| 46/46 [00:00<00:00, 228.49it/s]


In [2]:
def objective(trial):
    hidden_dims = [
        trial.suggest_int('layer1', 128, 512),
        trial.suggest_int('layer2', 64, 512),
        trial.suggest_int('layer3', 32, 512),
        trial.suggest_int('layer4', 16, 512)
    ]
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    model = HousePriceDNN(input_dim, hidden_dims, dropout)
    trainer = L.Trainer(max_epochs=50, callbacks=[EarlyStopping(monitor='val_rmse', patience=10)])
    trainer.fit(model, train_loader, val_loader)
    return trainer.callback_metrics['val_rmse'].item()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)


NameError: name 'optuna' is not defined

In [1]:
# 检查最佳参数
best_params = study.best_params
best_hidden_sizes = [best_params[f'layer{i}'] for i in range(1, 5)]
best_dropout = best_params['dropout']
print("Best Parameters:")
print(f"  Hidden Layer Sizes: {best_hidden_sizes}")
print(f"  Dropout Rate: {best_dropout}")
print(f"  Best RMSE from Optimization: {study.best_value:.5f}")

NameError: name 'study' is not defined

In [29]:
# 用最佳参数重训
best_model = HousePriceDNN(input_dim, best_hidden_sizes, best_dropout)
trainer = L.Trainer(max_epochs=500, callbacks=[EarlyStopping(monitor='val_rmse', patience=10)])
trainer.fit(best_model, train_loader, val_loader)

# Evaluate on validation set
best_model.eval()
val_preds = []
val_true = []
with torch.no_grad():
    for batch in val_loader:
        x, y = batch
        preds = best_model(x)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(y.cpu().numpy())

# Calculate RMSE on validation set
val_rmse = np.sqrt(mean_squared_error(val_true, val_preds))
print(f"Final Validation RMSE after Training: {val_rmse:.5f}")

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 1.6 M  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.205     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode


Epoch 26: 100%|██████████| 37/37 [00:00<00:00, 53.89it/s, v_num=106, train_loss=0.807, val_rmse=0.538]
Final Validation RMSE after Training: 0.54575


In [None]:
# DNN feature importances

In [31]:
# DNN cross-validation
# catboost_cv_scores = cross_val_score(catboost_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"CatBoost cross-validation RMSE: {-catboost_cv_scores.mean():.5f} (+/- {catboost_cv_scores.std() * 2:.5f})")

# Kfold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    train_data = HousePriceDataset(X[train_idx], y[train_idx])
    val_data = HousePriceDataset(X[val_idx], y[val_idx])
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=32)
    model = HousePriceDNN(input_dim)
    trainer = pl.Trainer(max_epochs=100, callbacks=[pl.callbacks.EarlyStopping(monitor='val_rmse', patience=20)])
    trainer.fit(model, train_loader, val_loader)
    test_preds += np.expm1(np.concatenate(trainer.test(model, test_loader)))
test_preds /= 5
submission = pd.DataFrame({'Id': test_id, 'SalePrice': test_preds})
submission.to_csv('submission_cv.csv', index=False)

IndexError: index 4 is out of bounds for dimension 0 with size 4

In [None]:
# predict on test set
# xgb_test_pred = xgb_best.predict(X_test)
# lgb_test_pred = lgb_best.predict(X_test)
# catboost_test_pred = catboost_best.predict(X_test)

# simple ensemble (average predictions)
final_pred = test_pred

# expm1 transformation to reverse log transformation
final_pred = np.expm1(final_pred)

# save submission file
submission = pd.DataFrame({'Id': test_id, 'SalePrice': final_pred})
submission.to_csv('submission_baseline.csv', index=False)

In [None]:
# define base models for stacking
base_models = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('xgb', XGBRegressor(**xgb_grid.best_params_, random_state=42)),
    ('lgb', LGBMRegressor(**lgb_grid.best_params_, random_state=42)),
    ('catboost', CatBoostRegressor(**catboost_grid.best_params_, cat_features=categorical_features, random_state=42, verbose=0))
]

# define meta learner
# meta_learner = Ridge()
# first try use normal parameters for LGBMRegressor
meta_learner = LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=3, 
                              num_leaves=15, random_state=42)
# initia Stacking
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_learner, cv=5)

# train Stacking model
stacking_model.fit(X_train, y_train)

# validate Stacking model
stacking_pred = stacking_model.predict(X_val)
stacking_rmse = np.sqrt(mean_squared_error(y_val, stacking_pred))
print(f"Stacking RMSE: {stacking_rmse:.5f}")

# cross-validation
stacking_cv_scores = cross_val_score(stacking_model, X, y, cv=5, scoring='neg_root_mean_squared_error')
print(f"Stacking cross-validation RMSE: {-stacking_cv_scores.mean():.5f} (+/- {stacking_cv_scores.std() * 2:.5f})")

# predict on test set
stacking_test_pred = stacking_model.predict(X_test)
stacking_test_pred = np.expm1(stacking_test_pred)

# save submission file
submission_stacking = pd.DataFrame({'Id': test['Id'], 'SalePrice': stacking_test_pred})
submission_stacking.to_csv('submission_stacking.csv', index=False)