# Kaggle Competition 

In [1]:
import pandas as pd
import polars
import numpy as np

In [2]:
import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [3]:
train_df =  polars.read_parquet("data/train_data.parquet")

In [4]:
train_df.shape

(23871848, 101)

In [3]:
valid_df =  polars.read_parquet("data/validate_data.parquet")

In [4]:
valid_df.shape

(1082224, 101)

In [5]:
feature_cols = ["symbol_id", "time_id"] +  [f"feature_{idx:02d}" for idx in range(79)]+ [f"responder_{idx}_lag_1" for idx in range(9)]

#select target values
target_cols = ["responder_6"]

# select the weight values
weight_cols = ["weight"]

In [6]:
#finsh
# work fine with sample data
# work fine with all train data set and validation data set
class CustomDataset(Dataset):
    def __init__(self, dataframe, accerlerator):
        # Store each part of the dataframe that needed as a tensor
        self.features = torch.FloatTensor(dataframe.select(feature_cols).to_numpy()).to(accerlerator)
        self.labels = torch.FloatTensor(dataframe.select(target_cols).to_numpy()).to(accerlerator)
        self.weights = torch.FloatTensor(dataframe.select(weight_cols).to_numpy()).to(accerlerator)


    def __len__(self):
        # Returb the length of the dataset
        return len(self.labels)

    def __getitem__(self, idx):
        # return the data at a given index
        # x = all features defined in the feature_cols
        # y = the target values that needed to be predicted definded in the target_cols
        # w = the weight values for calculating the loss defined in the weight_cols
        x = self.features[idx]
        y = self.labels[idx]
        w = self.weights[idx]
        return x, y, w

In [7]:
dataset = CustomDataset(valid_df,accerlerator="cpu")

In [11]:
dataset[1]

(tensor([ 1.0000e+00,  0.0000e+00,  3.8219e-01,  4.5869e-01,  5.4435e-01,
          9.2550e-02, -9.1184e-01,  1.3097e-01, -9.0515e-01, -3.0100e-01,
         -1.8795e-01,  1.1000e+01,  7.0000e+00,  7.6000e+01, -9.2035e-01,
          2.4371e+00, -7.5225e-02, -4.6374e-01, -5.0446e-01, -4.4867e-01,
         -1.6828e+00, -1.3096e+00,  1.3573e+00, -7.4740e-03,  1.1079e+00,
          3.7456e-01,  1.0593e+00,  1.1514e+00, -1.9860e+00, -4.6567e-01,
          8.2513e-01, -7.5664e-01, -7.3525e-01, -6.6193e-03,  7.8764e-01,
         -2.9642e-02,  4.9586e-01,  5.5345e-01, -1.5847e+00, -5.8390e-02,
          8.6217e-02,  5.1014e-02,  5.1481e-01,  7.5502e-02, -1.1175e-01,
         -6.5288e-01, -1.4616e-01, -1.5985e+00,  1.1257e+00, -4.9231e-02,
          4.2109e-01, -1.8579e-01, -1.4100e-01,  1.5677e+00, -1.9486e-01,
          1.3818e-01, -1.1379e+00,  1.6245e-01, -1.8266e+00,  1.2302e+00,
         -2.9080e-02,  1.2527e+00,  4.3781e-01,  1.0275e+00, -3.4252e-01,
         -4.2132e-01, -4.2403e-01, -1.

In [9]:
# finish
#have to test with train_data loader and val_dataloader
# the __init and setup seem to work fine
class DataModule(pl.LightningDataModule):
    def __init__(self, dataframe, batch_size, valid_df=None, accelerator='cuda'):
        super().__init__()
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.dates = dataframe["date_id"].unique().to_list()
        self.accelerator = accelerator
        self.train_dataset = None
        self.valid_df = valid_df
        self.val_dataset = None

    def setup(self, fold=0, N_fold=5, stage=None):
        selected_dates = [date for ii, date in enumerate(self.dates) if ii % N_fold != fold]  # leave one section out
        df_train = self.dataframe.filter(polars.col("date_id").is_in(selected_dates))
        
        if df_train.is_empty():
            raise ValueError("Training dataset is empty after filtering.")
        
        self.train_dataset = CustomDataset(df_train, self.accelerator)
        
        if self.valid_df is not None:
            df_valid = self.valid_df
            if df_valid.is_empty():
                raise ValueError("Validation dataset is empty.")
            self.val_dataset = CustomDataset(df_valid, self.accelerator)

    def train_dataloader(self, Pw=False, n_workers=0):
        if self.train_dataset is None or len(self.train_dataset) == 0:
            raise ValueError("Training dataset is empty.")
        return DataLoader(self.train_dataset, batch_size=self.batch_size, persistent_workers=Pw, shuffle=True, num_workers=n_workers)

    def val_dataloader(self, Pw=False, n_workers=0):
        if self.val_dataset is None or len(self.val_dataset) == 0:
            raise ValueError("Validation dataset is empty.")
        return DataLoader(self.val_dataset, batch_size=self.batch_size, persistent_workers=Pw, shuffle=False, num_workers=n_workers)

In [10]:
# def r2_val(y_true, y_pred, weights, axis=None):
#     """
#     Calculate the weighted R^2 value.
    
#     Parameters:
#     y_true (np.ndarray): True values.
#     y_pred (np.ndarray): Predicted values.
#     weights (np.ndarray): Weights for each sample.
#     axis (int, optional): Axis along which to calculate the weighted R^2 value.
    
#     Returns:
#     float: Weighted R^2 value.
#     """
#     # Ensure weights are normalized
#     # weights = weights / np.sum(weights, axis=axis, keepdims=True)
    
#     # Calculate weighted mean of y_true
#     y_true_mean = np.average(y_true, weights=weights, axis=axis)
    
#     # Calculate total sum of squares (SST)
#     sst = np.sum(weights * (y_true - y_true_mean)**2, axis=axis)
    
#     # Calculate residual sum of squares (SSR)
#     ssr = np.sum(weights * (y_true - y_pred)**2, axis=axis)
    
#     # Calculate R^2 value
#     r2 = 1 - (ssr / sst)
    
#     return r2

In [11]:
# finish
class MyModel(pl.LightningModule):
    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        self.save_hyperparameters()
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            # layers.append(nn.ReLU())
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))  # 输出层
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        self.validation_step_outputs = []
    
    def forward(self, x):
        return 5 * self.model(x).squeeze(-1)
    
    def training_step(self, batch):
        x, y, w = batch
        y_hat = self(x)
        y = y.view(-1)
        loss = F.mse_loss(y_hat, y, reduction='none') * w  
        loss = loss.mean()
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        x, y, w = batch
        y_hat = self(x)
        y = y.view(-1)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    # def on_validation_epoch_end(self):
    #     """Calculate validation WRMSE at the end of the epoch."""
    #     y = torch.cat([x[1] for x in self.validation_step_outputs]).cpu().numpy()
    #     if self.trainer.sanity_checking:
    #         prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
    #     else:
    #         prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
    #         weights = torch.cat([x[2] for x in self.validation_step_outputs]).cpu().numpy()
    #         # Ensure the shapes match by specifying the axis
    #         val_r_square = r2_val(y, prob, weights, axis=0)
    #         self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
    #     self.validation_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")


In [12]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

In [13]:
n_hidden = [512, 512, 256]
dropout = [0.1, 0.1]
lr = 1e-3
max_epochs = 100
weight_decay = 5e-4
patience = 25
# try to increase batch size 
# 512   4.40
# 1024  3.30  have some bottle neck during dataloader # try to fix it 
# 8192 is fine
batch_size = 1024
pw = False # set to false when numloader = 0
num_loader = 0
if num_loader > 0: pw = True # set to false when numloader > 0

In [14]:
data_module = DataModule(train_df, batch_size = batch_size , valid_df= valid_df, accelerator= "cpu") 
data_module.setup(1,1)
del train_df
input_dim = data_module.train_dataset.features.shape[1]

In [15]:
model = MyModel(
    input_dim= input_dim,
    hidden_dims= n_hidden,
    dropouts = dropout,
    lr = lr,
    weight_decay = weight_decay
)      
early_stopping = EarlyStopping('val_loss', patience=patience, mode='min', verbose=False)
checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1, verbose=False, filename=f"./models/nn.model") 
timer = Timer()
trainer = pl.Trainer(
    max_epochs= max_epochs,
    accelerator="gpu",
    devices = 1,
    logger = None,
    callbacks=[early_stopping, checkpoint_callback, timer],
    enable_progress_bar=True
)
trainer.fit(model, data_module.train_dataloader(n_workers = num_loader), data_module.val_dataloader(n_workers = num_loader))

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 443 K  | train
---------------------------------------------
443 K     Trainable params
0         Non-trainable params
443 K     Total params
1.772     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Asus\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\Asus\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0: {'val_loss': '1.13350', 'train_loss': '1.58756'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1: {'val_loss': '1.13217', 'train_loss': '1.58113'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2: {'val_loss': '1.13193', 'train_loss': '1.58103'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3: {'val_loss': '1.13166', 'train_loss': '1.58075'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4: {'val_loss': '1.13143', 'train_loss': '1.58088'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5: {'val_loss': '1.13416', 'train_loss': '1.58076'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6: {'val_loss': '1.13359', 'train_loss': '1.58083'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7: {'val_loss': '1.13325', 'train_loss': '1.58074'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8: {'val_loss': '1.13370', 'train_loss': '1.58065'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9: {'val_loss': '1.13333', 'train_loss': '1.58075'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10: {'val_loss': '1.13399', 'train_loss': '1.58081'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11: {'val_loss': '1.13312', 'train_loss': '1.57918'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12: {'val_loss': '1.13361', 'train_loss': '1.57916'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13: {'val_loss': '1.13332', 'train_loss': '1.57902'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 14: {'val_loss': '1.13143', 'train_loss': '1.57892'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 15: {'val_loss': '1.13582', 'train_loss': '1.57883'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 16: {'val_loss': '1.13188', 'train_loss': '1.57862'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 17: {'val_loss': '1.13222', 'train_loss': '1.57764'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 18: {'val_loss': '1.13296', 'train_loss': '1.57754'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 19: {'val_loss': '1.13126', 'train_loss': '1.57734'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 20: {'val_loss': '1.13276', 'train_loss': '1.57732'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 21: {'val_loss': '1.13275', 'train_loss': '1.57732'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 22: {'val_loss': '1.13191', 'train_loss': '1.57745'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 23: {'val_loss': '1.13106', 'train_loss': '1.57732'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 24: {'val_loss': '1.13164', 'train_loss': '1.57732'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 25: {'val_loss': '1.13291', 'train_loss': '1.57714'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 26: {'val_loss': '1.13396', 'train_loss': '1.57711'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 27: {'val_loss': '1.13366', 'train_loss': '1.57718'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 28: {'val_loss': '1.13285', 'train_loss': '1.57711'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 29: {'val_loss': '1.13151', 'train_loss': '1.57713'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 30: {'val_loss': '1.13211', 'train_loss': '1.57642'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 31: {'val_loss': '1.13350', 'train_loss': '1.57634'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 32: {'val_loss': '1.13159', 'train_loss': '1.57619'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 33: {'val_loss': '1.13337', 'train_loss': '1.57621'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 34: {'val_loss': '1.13265', 'train_loss': '1.57615'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 35: {'val_loss': '1.13270', 'train_loss': '1.57639'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 36: {'val_loss': '1.13220', 'train_loss': '1.57562'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 37: {'val_loss': '1.13267', 'train_loss': '1.57569'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 38: {'val_loss': '1.13262', 'train_loss': '1.57561'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 39: {'val_loss': '1.13321', 'train_loss': '1.57572'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 40: {'val_loss': '1.13331', 'train_loss': '1.57574'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 41: {'val_loss': '1.13273', 'train_loss': '1.57564'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 42: {'val_loss': '1.13240', 'train_loss': '1.57534'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 43: {'val_loss': '1.13228', 'train_loss': '1.57529'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 44: {'val_loss': '1.13239', 'train_loss': '1.57525'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 45: {'val_loss': '1.13292', 'train_loss': '1.57512'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 46: {'val_loss': '1.13309', 'train_loss': '1.57515'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 47: {'val_loss': '1.13363', 'train_loss': '1.57526'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 48: {'val_loss': '1.13308', 'train_loss': '1.57506'}


In [None]:
# right now data loader is not work when num_loader > 0
N_fold = 5
for fold in range(N_fold):
    data_module = DataModule(train_df, batch_size = batch_size , valid_df= valid_df, accelerator= "cpu") 
    data_module.setup(fold,N_fold)
    input_dim = data_module.train_dataset.features.shape[1]
    model = MyModel(
        input_dim= input_dim,
        hidden_dims= n_hidden,
        dropouts = dropout,
        lr = lr,
        weight_decay = weight_decay
    )      
    early_stopping = EarlyStopping('val_loss', patience=patience, mode='min', verbose=False)
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1, verbose=False, filename=f"./models/nn__fold{fold}.model") 
    timer = Timer()
    trainer = pl.Trainer(
        max_epochs= max_epochs,
        accelerator="gpu",
        devices = 1,
        logger = None,
        callbacks=[early_stopping, checkpoint_callback, timer],
        enable_progress_bar=True
    )
    trainer.fit(model, data_module.train_dataloader(n_workers = num_loader), data_module.val_dataloader(n_workers = num_loader))