# Kaggle Competition 

In [2]:
import pandas as pd
import numpy as np

In [3]:
import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [4]:
train_df =  pd.read_parquet("data/train_data.parquet")

In [5]:
train_df.shape

(11774752, 101)

In [6]:
valid_df =  pd.read_parquet("data/validate_data.parquet")

In [7]:
valid_df.shape

(639848, 101)

In [8]:
train_sample = train_df[:10000]
valid_sample = valid_df[:1000]
train_sample.shape, valid_sample.shape

((10000, 101), (1000, 101))

In [9]:
feature_cols = [f"feature_{idx:02d}" for idx in range(79)]+ [f"responder_{idx}_lag_1" for idx in range(9)]

#select target values
target_cols = ["responder_6"]

# select the weight values
weight_cols = ["weight"]

In [10]:
#finsh
# work fine with sample data
# work fine with all train data set and validation data set
class CustomDataset(Dataset):
    def __init__(self, dataframe, accerlerator):
        # Store each part of the dataframe that needed as a tensor
        self.features = torch.FloatTensor(dataframe[feature_cols].values).to(accerlerator)
        self.labels = torch.FloatTensor(dataframe[target_cols].values).to(accerlerator)
        self.weights = torch.FloatTensor(dataframe[weight_cols].values).to(accerlerator)


    def __len__(self):
        # Returb the length of the dataset
        return len(self.labels)

    def __getitem__(self, idx):
        # return the data at a given index
        # x = all features defined in the feature_cols
        # y = the target values that needed to be predicted definded in the target_cols
        # w = the weight values for calculating the loss defined in the weight_cols
        x = self.features[idx]
        y = self.labels[idx]
        w = self.weights[idx]
        return x, y, w

In [11]:
# finish
#have to test with train_data loader and val_dataloader
# the __init and setup seem to work fine
class DataModule(pl.LightningDataModule):
    def __init__(self, dataframe,  batch_size, valid_df = None, accelerator='cpu'):
        super().__init__()
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.dates = dataframe["date_id"].unique()
        self.accelerator = accelerator
        self.train_dataset = None
        self.valid_df = None
        if valid_df is not None:
            self.valid_df = valid_df
        self.val_dataset = None
        
    def setup(self, fold=0, N_fold=5, stage=None):
        selected_dates = [date for ii, date in enumerate(self.dates) if ii % N_fold != fold] #leave one section out
        df_train = self.dataframe.loc[self.dataframe['date_id'].isin(selected_dates)]
        self.train_dataset = CustomDataset(df_train, self.accelerator)
        if self.valid_df is not None:
            df_valid = self.valid_df
            self.val_dataset = CustomDataset(df_valid, self.accelerator)

    def train_dataloader(self, Pw =False , n_workers=0):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, persistent_workers=Pw ,shuffle=True, num_workers=n_workers)

    def val_dataloader(self, Pw =False ,n_workers=0):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, persistent_workers=Pw, shuffle=False, num_workers=n_workers)


In [12]:
def r2_val(y_true, y_pred, weights, axis=None):
    """
    Calculate the weighted R^2 value.
    
    Parameters:
    y_true (np.ndarray): True values.
    y_pred (np.ndarray): Predicted values.
    weights (np.ndarray): Weights for each sample.
    axis (int, optional): Axis along which to calculate the weighted R^2 value.
    
    Returns:
    float: Weighted R^2 value.
    """
    # Ensure weights are normalized
    # weights = weights / np.sum(weights, axis=axis, keepdims=True)
    
    # Calculate weighted mean of y_true
    y_true_mean = np.average(y_true, weights=weights, axis=axis)
    
    # Calculate total sum of squares (SST)
    sst = np.sum(weights * (y_true - y_true_mean)**2, axis=axis)
    
    # Calculate residual sum of squares (SSR)
    ssr = np.sum(weights * (y_true - y_pred)**2, axis=axis)
    
    # Calculate R^2 value
    r2 = 1 - (ssr / sst)
    
    return r2

In [13]:
# finish
class MyModel(pl.LightningModule):
    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        self.save_hyperparameters()
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            # layers.append(nn.ReLU())
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))  # 输出层
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        self.validation_step_outputs = []
    
    def forward(self, x):
        return 5 * self.model(x).squeeze(-1)
    
    def training_step(self, batch):
        x, y, w = batch
        y_hat = self(x)
        y = y.view(-1)
        loss = F.mse_loss(y_hat, y, reduction='none') * w  
        loss = loss.mean()
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        x, y, w = batch
        y_hat = self(x)
        y = y.view(-1)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    # def on_validation_epoch_end(self):
    #     """Calculate validation WRMSE at the end of the epoch."""
    #     y = torch.cat([x[1] for x in self.validation_step_outputs]).cpu().numpy()
    #     if self.trainer.sanity_checking:
    #         prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
    #     else:
    #         prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
    #         weights = torch.cat([x[2] for x in self.validation_step_outputs]).cpu().numpy()
    #         # Ensure the shapes match by specifying the axis
    #         val_r_square = r2_val(y, prob, weights, axis=0)
    #         self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
    #     self.validation_step_outputs.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")


In [14]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

In [15]:
n_hidden = [512, 512, 256]
dropout = [0.1, 0.1]
lr = 1e-3
max_epochs = 100
weight_decay = 5e-4
patience = 25
# try to increase batch size 
# 512   4.40
# 1024  3.30  have some bottle neck during dataloader # try to fix it 
# 8192 is fine
batch_size = 1024
pw = False # set to false when numloader = 0
num_loader = 0
if num_loader > 0: pw = True # set to false when numloader > 0

In [16]:
data_module = DataModule(train_df, batch_size = batch_size , valid_df= valid_df, accelerator= "cpu") 
data_module.setup(1,1)

In [17]:
# right now data loader is not work when num_loader > 0
input_dim = data_module.train_dataset.features.shape[1]
model = MyModel(
    input_dim= input_dim,
    hidden_dims= n_hidden,
    dropouts = dropout,
    lr = lr,
    weight_decay = weight_decay
)      
early_stopping = EarlyStopping('val_loss', patience=patience, mode='min', verbose=False)
checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1, verbose=False, filename=f"./models/nn.model") 
timer = Timer()
trainer = pl.Trainer(
    max_epochs= max_epochs,
    accelerator="gpu",
    devices = 1,
    logger = None,
    callbacks=[early_stopping, checkpoint_callback, timer],
    enable_progress_bar=True
)
trainer.fit(model, data_module.train_dataloader(n_workers = num_loader), data_module.val_dataloader(n_workers = num_loader))

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 442 K  | train
---------------------------------------------
442 K     Trainable params
0         Non-trainable params
442 K     Total params
1.768     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Asus\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\Asus\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0: {'val_loss': '1.13274', 'train_loss': '1.80325'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1: {'val_loss': '1.13607', 'train_loss': '1.78794'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2: {'val_loss': '1.13318', 'train_loss': '1.78543'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3: {'val_loss': '1.13380', 'train_loss': '1.78410'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4: {'val_loss': '1.13384', 'train_loss': '1.78313'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5: {'val_loss': '1.13414', 'train_loss': '1.78283'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6: {'val_loss': '1.13406', 'train_loss': '1.78212'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7: {'val_loss': '1.13296', 'train_loss': '1.77935'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8: {'val_loss': '1.13308', 'train_loss': '1.77863'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9: {'val_loss': '1.13424', 'train_loss': '1.77864'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10: {'val_loss': '1.13300', 'train_loss': '1.77865'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11: {'val_loss': '1.13417', 'train_loss': '1.77807'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12: {'val_loss': '1.13217', 'train_loss': '1.77805'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13: {'val_loss': '1.13213', 'train_loss': '1.77778'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 14: {'val_loss': '1.13337', 'train_loss': '1.77759'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 15: {'val_loss': '1.13245', 'train_loss': '1.77727'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 16: {'val_loss': '1.13257', 'train_loss': '1.77722'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 17: {'val_loss': '1.13204', 'train_loss': '1.77702'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 18: {'val_loss': '1.13150', 'train_loss': '1.77656'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 19: {'val_loss': '1.13237', 'train_loss': '1.77639'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 20: {'val_loss': '1.13147', 'train_loss': '1.77629'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 21: {'val_loss': '1.13249', 'train_loss': '1.77589'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 22: {'val_loss': '1.13275', 'train_loss': '1.77567'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 23: {'val_loss': '1.13220', 'train_loss': '1.77536'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 24: {'val_loss': '1.13103', 'train_loss': '1.77565'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 25: {'val_loss': '1.13254', 'train_loss': '1.77474'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 26: {'val_loss': '1.13416', 'train_loss': '1.77501'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 27: {'val_loss': '1.13232', 'train_loss': '1.77431'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 28: {'val_loss': '1.13311', 'train_loss': '1.77447'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 29: {'val_loss': '1.13231', 'train_loss': '1.77400'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 30: {'val_loss': '1.13388', 'train_loss': '1.77387'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 31: {'val_loss': '1.13397', 'train_loss': '1.77186'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 32: {'val_loss': '1.13299', 'train_loss': '1.77142'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 33: {'val_loss': '1.13289', 'train_loss': '1.77138'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 34: {'val_loss': '1.13390', 'train_loss': '1.77140'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 35: {'val_loss': '1.13592', 'train_loss': '1.77121'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 36: {'val_loss': '1.13258', 'train_loss': '1.77089'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 37: {'val_loss': '1.13310', 'train_loss': '1.77004'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 38: {'val_loss': '1.13194', 'train_loss': '1.76973'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 39: {'val_loss': '1.13280', 'train_loss': '1.76995'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 40: {'val_loss': '1.13239', 'train_loss': '1.76945'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 41: {'val_loss': '1.13292', 'train_loss': '1.76943'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 42: {'val_loss': '1.13210', 'train_loss': '1.76940'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 43: {'val_loss': '1.13243', 'train_loss': '1.76859'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 44: {'val_loss': '1.13206', 'train_loss': '1.76878'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 45: {'val_loss': '1.13334', 'train_loss': '1.76823'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 46: {'val_loss': '1.13160', 'train_loss': '1.76839'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 47: {'val_loss': '1.13185', 'train_loss': '1.76818'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 48: {'val_loss': '1.13301', 'train_loss': '1.76868'}


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 49: {'val_loss': '1.13225', 'train_loss': '1.76783'}
