In [None]:
!pip install -r requirements.txt

In [10]:
import boto3
import os 

# Mirror the objects under `patient_retention/data` from the S3 bucket into ./data,
# skipping files that were already downloaded by a previous run.
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket('sgn-sagemaker-dev')
os.makedirs("data", exist_ok=True)
for obj in bucket.objects.filter(Prefix='patient_retention/data'):
    file_name = os.path.basename(obj.key)
    # Keys whose last path component has no "." are skipped (only names with an
    # extension are treated as downloadable files).
    if "." not in file_name:
        continue
    local_path = os.path.join('data', file_name)
    # Check the real destination (data/<name>), not the bare file name in the
    # current directory, so existing files are not downloaded again.
    if not os.path.exists(local_path):
        bucket.download_file(obj.key, local_path)

In [14]:
from datetime import datetime  # required by `datetime.now()` below

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
import torch

from data_module import PatientDataModule
from model_module import PatientModelModule

from train_util.utils import load_hparams_from_yaml

# Run configuration: the model name selects the YAML config file to load.
model_name = 'crossformer'
# Timestamp taken once per run; it is embedded in the checkpoint and log directory names.
cur_date = datetime.now().isoformat()
hparams_path = f"configs/{model_name}.yaml"
hparams = load_hparams_from_yaml(hparams_path)
# Both Lightning modules are constructed from the config *path*, not the parsed dict.
datamodule = PatientDataModule(hparams_path)
model = PatientModelModule(hparams_path)

In [15]:
# Everything produced by this run (checkpoints and CSV logs) lives under one directory.
run_dir = f"log/{model_name}-{cur_date}"
# Metric that drives both checkpoint selection and early stopping (lower is better).
monitored_metric = "val/mse"

# Checkpoint file name: "<model name>_epoch(..)_step(..)_val_<metric>".
# The brace placeholders are left for Lightning to fill in, so they must stay a plain string.
ckpt_prefix = f'{hparams["model"]["name"]}'
ckpt_filename = ckpt_prefix + "_epoch({epoch:02d})_step({step:04d})_val_{val/mse:.4f}"

checkpoint_cb = ModelCheckpoint(
    dirpath=f"{run_dir}/checkpoints",
    filename=ckpt_filename,
    auto_insert_metric_name=False,
    monitor=monitored_metric,
    mode="min",
    save_top_k=3,
    save_last=True,
    save_weights_only=True,
)

csv_logger = CSVLogger(save_dir=f"{run_dir}/logs")

early_stopping_cb = EarlyStopping(
    monitor=monitored_metric,
    mode="min",
    min_delta=0,
    patience=5,
)

training_callbacks = [checkpoint_cb, early_stopping_cb]

# Single-GPU (device index 0), full-precision training; validation runs after every epoch.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    precision=32,
    max_epochs=hparams["training_param"]["max_epochs"],
    check_val_every_n_epoch=1,
    log_every_n_steps=5,
    logger=csv_logger,
    callbacks=training_callbacks,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# The same model / datamodule pair is used for training and for both evaluations.
run_kwargs = dict(model=model, datamodule=datamodule)

# Train, then evaluate the test set twice: once with the "last" checkpoint
# and once with the "best" checkpoint.
trainer.fit(**run_kwargs)
trainer.test(ckpt_path="last", **run_kwargs)
trainer.test(ckpt_path="best", **run_kwargs)

Missing logger folder: log/crossformer-2024-10-21T08:07:21.247517/logs/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params | Mode 
--------------------------------------------------
0 | net       | Crossformer | 44.3 M | train
1 | loss_func | MSELoss     | 0      | train
--------------------------------------------------
44.3 M    Trainable params
0         Non-trainable params
44.3 M    Total params
177.160   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]