# Kaggle Competition 

In [1]:
import pandas as pd
import numpy as np
import joblib
import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# setup

In [70]:
# Read the validation split from its parquet file into a DataFrame.
val_df = pd.read_parquet(path="data/validate_data.parquet")

In [11]:
# Model inputs, in order: the two identifier columns, the 79 `feature_XX`
# columns, then the 9 `responder_X_lag_1` columns (90 columns in total).
feature_cols = ["symbol_id", "time_id"]
feature_cols.extend(f"feature_{idx:02d}" for idx in range(79))
feature_cols.extend(f"responder_{idx}_lag_1" for idx in range(9))

# Column to predict.
target_cols = ["responder_6"]

# Per-row sample weight column.
weight_cols = ["weight"]

In [12]:
# Split the validation frame into inputs, target and sample weights.
# Every selection uses a list of column names, so all three are DataFrames.
X_val, y_val, w_val = (
    val_df[cols] for cols in (feature_cols, target_cols, weight_cols)
)

In [71]:
# Largest date_id present in the validation split.
val_df["date_id"].max()

1698.0

In [13]:
class MyModel(pl.LightningModule):
    """Feed-forward regressor trained with a sample-weighted MSE loss.

    The network is one ``BatchNorm1d -> [SiLU] -> [Dropout] -> Linear`` group
    per entry of ``hidden_dims`` (SiLU is skipped for the first group, Dropout
    for groups beyond ``len(dropouts)``), followed by a single-unit ``Linear``
    head and a ``Tanh``. ``forward`` multiplies the result by 5, so
    predictions are bounded to [-5, 5].

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Output width of each hidden ``Linear`` layer, in order.
        dropouts: Dropout probabilities; entry ``i`` is applied in group ``i``.
        lr: Learning rate passed to Adam.
        weight_decay: Weight decay passed to Adam.
    """

    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        # Stores the constructor arguments with the checkpoint so that
        # load_from_checkpoint can rebuild the module without being given them.
        self.save_hyperparameters()
        # NOTE: the order of the layers fixes the state_dict keys; changing it
        # would break loading of existing checkpoints.
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))  # output layer
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        # Per-batch (y_hat, y, w) tuples of the validation epoch in progress;
        # emptied in on_validation_epoch_end.
        self.validation_step_outputs = []

    def forward(self, x):
        """Return one prediction per row of ``x``, scaled to [-5, 5]."""
        return 5 * self.model(x).squeeze(-1)

    def _weighted_mse(self, batch):
        """Compute the weighted MSE of one ``(x, y, w)`` batch.

        Returns:
            ``(loss, y_hat, y, w)`` where ``y`` and ``w`` are flattened to 1-D.
        """
        x, y, w = batch
        y_hat = self(x)
        y = y.reshape(-1)
        # Flatten the weights like the targets: a (N, 1) weight tensor would
        # otherwise broadcast against the (N,) per-sample loss into (N, N).
        w = w.reshape(-1)
        loss = (F.mse_loss(y_hat, y, reduction='none') * w).mean()
        return loss, y_hat, y, w

    def training_step(self, batch):
        """Return the weighted MSE of the batch and log it as ``train_loss``."""
        loss, _, _, _ = self._weighted_mse(batch)
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=batch[0].size(0))
        return loss

    def validation_step(self, batch):
        """Return the weighted MSE of the batch and log it as ``val_loss``.

        The predictions, targets and weights are also buffered in
        ``validation_step_outputs`` for epoch-level metrics.
        """
        loss, y_hat, y, w = self._weighted_mse(batch)
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=batch[0].size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Release the tensors buffered during the validation epoch.

        Without this the buffer keeps every validation batch of every epoch
        alive. An epoch-level weighted R² used to be computed here from the
        buffer; it relied on an ``r2_val`` helper that is not defined in this
        notebook, so only the clean-up is performed.
        """
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau schedule driven by ``val_loss``.

        The learning rate is halved after 5 epochs without improvement.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        # NOTE(review): `verbose` is reported as deprecated by recent PyTorch
        # releases - confirm against the installed version before upgrading.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    # Disabled: print the logged metrics at the end of every training epoch.
    # def on_train_epoch_end(self):
    #     if self.trainer.sanity_checking:
    #         return
    #     epoch = self.trainer.current_epoch
    #     metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
    #     formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
    #     print(f"Epoch {epoch}: {formatted_metrics}")

    def save_model(self, path):
        """Save the model's state dictionary to the specified path."""
        torch.save(self.state_dict(), path)

In [60]:
import statistics
def cv(pred, n_folds=10):
    """Weighted R² of ``pred`` on ``n_folds`` consecutive validation slices.

    The validation rows are cut, in their stored order, into ``n_folds``
    equally sized contiguous slices of ``len(y_val) // n_folds`` rows; rows
    left over after the last full slice are not scored.

    Reads the notebook-level ``y_val`` and ``w_val`` and uses ``r2_score``.

    Args:
        pred: Predictions aligned row-for-row with ``y_val``.
        n_folds: Number of slices to score.

    Returns:
        A list with one weighted R² value per slice.

    Raises:
        ValueError: If there are fewer validation rows than ``n_folds``.
    """
    batch_size = len(y_val) // n_folds
    if batch_size == 0:
        raise ValueError(
            f"n_folds={n_folds} exceeds the number of validation rows ({len(y_val)})"
        )
    r2 = []
    # Iterate over exactly n_folds slices; the previous stop condition was
    # hard-coded to 10 and ignored the n_folds argument.
    for fold in range(n_folds):
        start = fold * batch_size
        stop = start + batch_size
        r2.append(
            r2_score(y_val[start:stop], pred[start:stop], sample_weight=w_val[start:stop])
        )
    return r2

# Load the trained models

In [19]:
# Restore the trained network from its Lightning checkpoint.
# (Per-fold checkpoints, `nn__fold{i}.model.ckpt`, were loaded here previously.)
# Use the GPU when one is available and fall back to the CPU otherwise;
# `map_location` lets a checkpoint saved on GPU load on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyModel.load_from_checkpoint(
    f"lightning_logs/version_{32}/checkpoints/models/nn.model.ckpt",
    map_location=device,
)
model = model.to(device)
# eval() puts BatchNorm and Dropout into inference mode; as the cell's last
# expression it also displays the architecture.
model.eval()

MyModel(
  (model): Sequential(
    (0): BatchNorm1d(90, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.1, inplace=False)
    (2): Linear(in_features=90, out_features=512, bias=True)
    (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): SiLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): SiLU()
    (9): Linear(in_features=512, out_features=256, bias=True)
    (10): Linear(in_features=256, out_features=1, bias=True)
    (11): Tanh()
  )
)

In [20]:
# Restore the saved XGBoost voting ensemble.
# joblib files are pickles: only load artifacts from a trusted source.
XGB_vote_model = joblib.load(filename="model/XGBoost_voting_ensemble6.pkl")

# Check each model's validation score

In [16]:
from sklearn.metrics import r2_score

In [21]:
# Score the network on the whole validation set in a single forward pass.
# The inputs go to whichever device the model's parameters are on, so the
# cell works for a model on GPU or CPU.
model_device = next(model.parameters()).device
with torch.no_grad():
    pred = model(torch.FloatTensor(X_val.values).to(model_device))
# No detach() needed: tensors created under no_grad carry no autograd graph.
pred_nn = pred.cpu().numpy()
r2 = r2_score(y_val, pred_nn, sample_weight=w_val)
print(f"R2 score: {r2:.6f}")

R2 score: 0.007622


In [22]:
# Score the XGBoost voting ensemble on the same validation rows,
# using the same weighted R² as for the network.
pred_XGB = XGB_vote_model.predict(X_val)
r2 = r2_score(y_true=y_val, y_pred=pred_XGB, sample_weight=w_val)
print(f"R2 score: {r2:.6f}")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




R2 score: 0.007910


In [61]:
# Weighted R² of the network on each of the 10 consecutive validation slices.
cv(pred_nn, n_folds=10)

[0.0033342838287353516,
 0.0171239972114563,
 0.004226744174957275,
 0.003619074821472168,
 0.016565322875976562,
 0.007086396217346191,
 0.005833983421325684,
 0.005219638347625732,
 0.006827950477600098,
 0.004177272319793701]

In [62]:
# Weighted R² of the XGBoost ensemble on each of the 10 consecutive validation slices.
cv(pred_XGB, n_folds=10)

[0.00596308708190918,
 0.01482301950454712,
 0.005225837230682373,
 0.003408968448638916,
 0.016702592372894287,
 0.007123231887817383,
 0.007173597812652588,
 0.003072500228881836,
 0.007205665111541748,
 0.0062503814697265625]

# Weighted ensemble

In [63]:
# Grid-search the XGBoost blend weight from 0.00 to 1.00 in steps of 0.01,
# keeping the weight with the highest mean per-slice weighted R².
# Caveat: the weight is chosen on the same validation rows it is scored on,
# so the reported R² is an optimistic estimate.
ratio = [x / 100 for x in range(101)]
# Start from -inf so that a best weight is always selected, however low the
# scores are (R² has no lower bound).
max_r2 = float("-inf")
max_i = None
# Dedicated loop names keep the notebook-level `pred`, `r2` and `i` intact.
for xgb_weight in ratio:
    blended = pred_XGB * xgb_weight + pred_nn * (1 - xgb_weight)
    mean_fold_r2 = statistics.mean(cv(blended))
    if mean_fold_r2 > max_r2:
        max_r2 = mean_fold_r2
        max_i = xgb_weight
print(f"max_r2: {max_r2:.6f} , XGB *{max_i} + nn{1-max_i:.2f}")

max_r2: 0.008407 , XGB *0.54 + nn0.46


# Score the selected blend on the full validation set

In [68]:
# Blend the two models with fixed weights (0.54 XGBoost, 0.46 network) and
# compute the weighted R² over all validation rows at once.
predb = 0.54 * pred_XGB + 0.46 * pred_nn
r2 = r2_score(y_true=y_val, y_pred=predb, sample_weight=w_val)
print(f"R2 score: {r2:.6f}")

R2 score: 0.008594


# Ensemble model shared online by another participant

In [64]:
# Load the downloaded artifact and pull the fitted model out of it.
# joblib files are pickles and can execute code on load: only open files from
# a source you trust.
xgb = joblib.load(filename="download_model/result.pkl")
xgb_dmodel = xgb["model"]

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

