<a href="https://colab.research.google.com/github/NH0917/petfinder/blob/main/base_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Experiment identifiers. Both are handed to save_mlflow() by the entry-point cell:
# `no` becomes the MLflow experiment name and `description` is passed as its `desc` argument.
no = "PetFineder"
description = "ベンチーマーク"
# NOTE(review): presumably an alternative backbone name kept for reference; it is not used by the visible code.
#swin_large_patch4_window7_224

In [None]:
!pip install timm
!pip install albumentations==0.4.6
!pip install transformers
!pip install mlflow
!pip install ttach
!pip install dotmap





---



In [None]:
from pandas.core.algorithms import value_counts
import torch
import torchvision
import cv2
from torch.utils.data import Dataset
import os

class petfinder(Dataset):
    """Dataset yielding ``(image, label, features)`` for one row of ``df``.

    Args:
        df: DataFrame with an "Id" column (image file stem), a "Pawpularity"
            column and every column named in ``feature_col``.
        data_dir: Directory that contains the ``<Id>.jpg`` files.
        feature_col: List of column names returned as the feature vector.
        transforms: Optional callable invoked as ``transforms(image=img)`` that
            returns a mapping with an "image" entry. Any falsy value (the
            default) returns the RGB image unchanged.
    """
    def __init__(self,df,data_dir,feature_col,transforms=False):
        self.df = df
        self.data_dir = data_dir
        self.feature_col = feature_col
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        """Load sample number ``idx`` (positional, 0-based).

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        # DataLoader indices are positions, so use .iloc: label-based lookup
        # (df[col][idx]) only works when the frame carries a default RangeIndex.
        img_name = self.df["Id"].iloc[idx]
        img_path = os.path.join(self.data_dir,img_name) + ".jpg"

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)

        label = self.df["Pawpularity"].iloc[idx]
        # Select the columns first so the row keeps the columns' numeric dtype.
        feature = self.df[self.feature_col].iloc[idx].values

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return img,label,feature

In [None]:
import torch
import numpy as np

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result.

        Saves a checkpoint when ``val_loss`` improves on the best value seen so
        far by at least ``delta``; otherwise increments the patience counter and
        sets ``early_stop`` once the counter reaches ``patience``.
        """
        # Work with the negated loss so that "larger is better".
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we got is the best so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
def mixup_augmentation(x:torch.Tensor, y:torch.Tensor, alpha:float = 1.0):
    """
    Blend every sample of a batch with another sample of the same batch (Mixup).

    A single mixing weight is drawn from Beta(alpha, alpha) and a random
    permutation of the batch picks each sample's partner.

    Returns:
        (mixed_x, target_a, target_b, lam): the blended inputs, the original
        targets, the partners' targets and the mixing weight applied to ``x``.
    """
    assert alpha > 0, "Alpha must be greater than 0"
    batch_size = x.shape[0]
    assert batch_size > 1, "Need more than 1 sample to apply mixup"

    # Draw order matters for reproducibility: NumPy weight first, then the torch permutation.
    lam = np.random.beta(alpha, alpha)
    partner = torch.randperm(batch_size)

    shuffled_x = x[partner, :]
    mixed_x = lam * x + (1 - lam) * shuffled_x

    return mixed_x, y, y[partner], lam

In [None]:
from albumentations.augmentations.transforms import ShiftScaleRotate
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import albumentations as A
import cv2
from tqdm.notebook import tqdm
import  matplotlib.pylab as plt
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import StratifiedKFold
import timm
import os
from albumentations.pytorch import ToTensorV2
import gc
import mlflow
import shutil
import torch.optim as optim
from torch.optim import optimizer
from torch.optim import lr_scheduler
from torch.optim.adam import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import ttach as tta
from dotmap import DotMap


# Run configuration, wrapped in a DotMap so that keys can be read as attributes (e.g. cfg.MODEL_NAME).
# NOTE(review): several keys below are declared but never read by the visible code; see the per-key notes.
CFG = DotMap({
    "IMG_SIZE":384,  # side length given to A.Resize in both augmentation pipelines
    "batch_size":7,  # training batch size; main() uses twice this value for validation
    "LR":1e-5,  # NOTE(review): not read; main() builds AdamW with a literal lr=2e-5
    "epochs":50,  # maximum number of epochs per fold handed to trainer()
    "patience":2,  # NOTE(review): not read; trainer() constructs EarlyStopping(3)
    "MODEL_NAME":"swin_large_patch4_window12_384_in22k",  # timm model name used by PetModel
    "pretrained":True,  # forwarded to timm.create_model
    "device":"cuda",  # device the model and every batch are moved to
    "nfolds":10,  # exclusive upper bound of the fold loop in main()
    "grad_accum_steps":0,  # NOTE(review): only referenced from a commented-out line in trainer()
    "use_amp":True  # NOTE(review): not read; trainer() always uses autocast(enabled=True)
})

# Training-time image pipeline: resize, light photometric jitter, ImageNet
# normalisation, then conversion to a CHW torch tensor.
train_augmentations = A.Compose([
    A.Resize(CFG.IMG_SIZE,CFG.IMG_SIZE),
    A.RandomBrightness(p=0.3),
    A.RandomContrast(p=0.3),
    # The probability must be passed by keyword: the first positional parameter
    # of these two transforms is a limit (brightness_limit / hue_shift_limit),
    # so a bare 0.3 changed the limit and left p at its default.
    A.RandomBrightnessContrast(p=0.3),
    A.HueSaturationValue(p=0.3),
    A.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]), #mean ,std
    ToTensorV2()
])

# Validation/test pipeline: deterministic resize + normalisation only.
test_augmentions = A.Compose([
    A.Resize(CFG.IMG_SIZE,CFG.IMG_SIZE),
    A.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]), #mean ,std
    ToTensorV2()
])

# Test-time augmentation set. main() passes it to trainer() as `tta_trans`, where it is
# wrapped around the model with tta.ClassificationTTAWrapper for the validation pass.
# NOTE(review): this module-level `transforms` is unrelated to the `transforms`
# constructor argument of the petfinder dataset; the shared name is easy to confuse.
transforms = tta.Compose(
    [
        tta.HorizontalFlip(),
        tta.VerticalFlip(),
        tta.Rotate90(angles=[0, 180]),
    ]
)

def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed (int): Seed value shared by all generators. Default: 42
    """
    import random  # local import: `random` is not among the notebook's imports

    # The variable is PYTHONHASHSEED. It only influences Python processes
    # started after this point (e.g. spawned subprocesses), not the current one.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

# Seed the RNGs before any data is loaded or any model is built.
seed_everything()

# Input and output locations (paths under /content/drive).
train_path = "/content/drive/MyDrive/ColabNotebooks/kaggle/petfinder/data/petfinder/input/train_skf10.csv"
data_dir = "/content/drive/MyDrive/ColabNotebooks/kaggle/petfinder/data/petfinder/input/train"
save_dir = "/content/drive/MyDrive/ColabNotebooks/kaggle/petfinder/data/petfinder/output"
mlflow_dir = "/content/drive/MyDrive/ColabNotebooks/kaggle/petfinder/data/petfinder/mlflow"

# Load the training table and divide the target by 100; loss_fn multiplies it back by 100.
train = pd.read_csv(train_path)
train["Pawpularity"] = train["Pawpularity"].div(100)


# df_col: every column except the id and the target (still contains "fold").
# feature_col: the same list without "fold"; these columns are fed to the model.
df_col = [name for name in train.columns if name not in ("Id", "Pawpularity")]
feature_col = [name for name in df_col if name != "fold"]


class PetModel(nn.Module):
    """Image backbone plus tabular features feeding a single-logit linear head.

    Args:
        cfg: Configuration object exposing ``MODEL_NAME`` and ``pretrained``.
        n_features: Width of the tabular feature vector concatenated to the
            image embedding. Defaults to ``len(feature_col)`` (the module-level
            feature list), which is what the original code used.
    """
    def __init__(self,cfg,n_features=None):
        super().__init__()
        if n_features is None:
            n_features = len(feature_col)
        # num_classes=0 asks timm for the pooled feature vector without a
        # classifier, instead of overwriting `.head` by hand (whose layout
        # depends on the installed timm version).
        self.model = timm.create_model(cfg.MODEL_NAME,pretrained=cfg.pretrained,num_classes=0)
        # Read the embedding width from the backbone instead of hard-coding
        # 1536, so other MODEL_NAME values work as well.
        self.head = nn.Linear(self.model.num_features+n_features,1)

    def forward(self,x,features):
        """Return one logit per sample, shape ``(batch,)``."""
        x = self.model(x)
        # The dataset yields the feature columns in their stored dtype; match
        # the embedding's dtype before concatenating.
        features = features.to(dtype=x.dtype)
        x = torch.cat([x,features],dim=1)
        x = self.head(x)
        # squeeze(-1) rather than squeeze(): a batch of one must stay 1-D so it
        # still lines up with a (1,)-shaped label tensor.
        return x.squeeze(-1)

def trainer(train_loader,valid_loader,model,loss_fn,device,optimizer,lr_scheduler,epochs,fold,tta_trans):
    """Train ``model`` on one fold with early stopping and return the RMSE history.

    Each epoch runs a mixed-precision training pass optimised with
    BCEWithLogitsLoss, then a validation pass with test-time augmentation.
    The weights of the epoch with the best validation metric are written to
    ``<save_dir>/model_state_fold_<fold>.pth`` (``save_dir`` is the module-level
    output directory).

    Args:
        train_loader / valid_loader: Loaders yielding ``(img, labels, features)``.
        model: Module called as ``model(img, features)`` returning logits.
        loss_fn: Metric called as ``loss_fn(labels, logits)``; its value is what
            is reported as "rmse" and what drives early stopping.
        device: Device the batches are moved to.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per epoch.
        epochs: Maximum number of epochs.
        fold: Fold identifier, used in the checkpoint file name.
        tta_trans: ttach transform set used for validation.

    Returns:
        (train_rmse_list, valid_rmse_list): one metric value per completed epoch.

    Notes:
        * After returning, ``model`` holds the weights of the LAST epoch; the
          best weights are the ones stored in the checkpoint file.
        * NOTE(review): the scheduler is stepped per epoch here, while main()
          sizes it in batches (num_training_steps=10*len(train_dl)); confirm
          which granularity is intended.
        * NOTE(review): patience is fixed at 3 and autocast is always enabled,
          regardless of CFG.patience / CFG.use_amp.
    """
    train_rmse_list = []
    valid_rmse_list = []

    criterion = torch.nn.BCEWithLogitsLoss()

    # Let early stopping write straight to the per-fold file so that the file
    # always holds the BEST weights. Previously the best weights went to
    # ./checkpoint.pt and the per-fold file was overwritten with the weights of
    # the last (already degraded) epoch.
    os.makedirs(save_dir, exist_ok=True)
    best_path = os.path.join(save_dir,f"model_state_fold_{fold}.pth")
    earlystopping = EarlyStopping(3, path=best_path)

    scaler = torch.cuda.amp.GradScaler()
    # The wrapper only holds a reference to `model`; build it once, not per batch.
    tta_model = tta.ClassificationTTAWrapper(model,tta_trans)

    for epoch in tqdm(range(epochs)):
        print(f"Start Epcoh{epoch}")

        # ---- training pass -------------------------------------------------
        train_loss = 0.0
        train_targets, train_logits = [], []

        model.train()
        for img,labels,features in tqdm(train_loader):
            optimizer.zero_grad()
            img,labels,features = img.to(device),labels.to(device),features.to(device)

            with torch.cuda.amp.autocast(enabled=True):
                outputs = model(img,features)
                loss = criterion(outputs,labels)
            scaler.scale(loss).backward()

            #if ((i + 1) % CFG["grad_accum_steps"] == 0 ) or ((i + 1) == len(train_loader)):
            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item()
            # Keep detached CPU copies so the metric can be computed over the
            # whole epoch instead of averaging per-batch RMSE values.
            train_targets.append(labels.detach().float().reshape(-1).cpu())
            train_logits.append(outputs.detach().float().reshape(-1).cpu())

        lr_scheduler.step()

        # ---- validation pass (with TTA) -------------------------------------
        valid_targets, valid_logits = [], []

        model.eval()
        with torch.no_grad():
            for img,labels,features in tqdm(valid_loader):
                img,labels,features = img.to(device),labels.to(device),features.to(device)
                outputs = tta_model(img,features)
                valid_targets.append(labels.float().reshape(-1).cpu())
                valid_logits.append(outputs.float().reshape(-1).cpu())

            avg_train_loss = train_loss / len(train_loader)
            # A mean of per-batch RMSEs is not the RMSE of the data set (the
            # square root is non-linear and the last batch may be smaller), so
            # evaluate the metric once over all collected predictions.
            avg_train_rmse = float(loss_fn(torch.cat(train_targets),torch.cat(train_logits)))
            avg_valid_rmse = float(loss_fn(torch.cat(valid_targets),torch.cat(valid_logits)))

        train_rmse_list.append(avg_train_rmse)
        valid_rmse_list.append(avg_valid_rmse)

        print(f"train_loss_is{avg_train_loss}")
        print(f"train_rmse_is{avg_train_rmse}")
        print(f"valid_rmse_is{avg_valid_rmse}")
        earlystopping(avg_valid_rmse,model)

        # Drop the references to the last batch so its GPU memory can be reused.
        del img,labels,features,outputs
        if earlystopping.early_stop:
            print("Early_stopping")
            break

    if earlystopping.best_score is None:
        # No epoch ran (epochs == 0): still leave a state file behind, as before.
        torch.save(model.state_dict(),best_path)

    gc.collect()
    return train_rmse_list,valid_rmse_list
     
def loss_fn(y_true,y_hat):
    """Root-mean-squared error on the 0-100 scale.

    Args:
        y_true: Targets scaled to [0, 1].
        y_hat: Raw logits; they are passed through a sigmoid, scaled by 100 and
            rounded to the nearest integer before being compared.

    Returns:
        Scalar tensor holding the RMSE between rounded predictions and targets.
    """
    predicted = torch.round(torch.sigmoid(y_hat) * 100)
    target = y_true * 100
    squared_error = (predicted - target) ** 2
    return torch.sqrt(torch.mean(squared_error))


def _log_cuda_memory(stage):
    """Print a one-line CUDA memory summary for ``stage`` (no-op without CUDA)."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 2**20
        reserved = torch.cuda.memory_reserved() / 2**20
        print(f"[cuda memory] {stage}: allocated={allocated:.0f}MiB reserved={reserved:.0f}MiB")


def main(cfg, start_fold=3):
    """Run cross-validation training and collect the per-fold RMSE histories.

    For every fold in ``range(start_fold, cfg.nfolds)`` a fresh PetModel is
    trained on the rows whose ``fold`` differs and validated on the rows whose
    ``fold`` matches.

    Args:
        cfg: Configuration exposing ``nfolds``, ``device``, ``batch_size``,
            ``epochs`` and the keys PetModel needs.
        start_fold: First fold to train. Defaults to 3, the value the loop was
            previously hard-coded to; pass 0 to train every fold.

    Returns:
        ``[train_histories, valid_histories]``, each holding one per-epoch
        metric list per trained fold, in training order.

    Notes:
        NOTE(review): the learning rate (2e-5) and the scheduler horizon
        (10 epochs' worth of batches) are literals here; cfg.LR is not used.
    """
    train_loss_list_all = []
    valid_loss_list_all = []

    for fold in range(start_fold,cfg.nfolds):
        print(f"Start Fold{fold}")
        print(f"UseCol{feature_col}")
        # A compact summary replaces print(torch.cuda.memory_snapshot()), which
        # dumped every allocator segment into the cell output.
        _log_cuda_memory("fold start")
        gc.collect()
        model = PetModel(cfg)
        model.to(cfg.device)

        is_val = train["fold"] == fold
        train_df = train.loc[~is_val].reset_index(drop=True)
        val_df = train.loc[is_val].reset_index(drop=True)

        train_ds = petfinder(train_df,data_dir,feature_col,transforms=train_augmentations)
        train_dl = DataLoader(train_ds,batch_size=cfg.batch_size,shuffle=True,num_workers=2,pin_memory=True,drop_last=True)
        val_ds = petfinder(val_df,data_dir,feature_col,transforms=test_augmentions)
        val_dl = DataLoader(val_ds,batch_size=cfg.batch_size*2,shuffle=False,num_workers=2,pin_memory=True,drop_last=False)

        optimizer = torch.optim.AdamW(model.parameters(),lr=2e-5)
        scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=10*len(train_dl))

        train_history,valid_history = trainer(train_dl,val_dl,model,loss_fn,cfg.device,optimizer,scheduler,cfg.epochs,fold,transforms)

        train_loss_list_all.append(train_history)
        valid_loss_list_all.append(valid_history)

        _log_cuda_memory("after training")
        # The scheduler must be deleted too: it references the optimizer, which
        # references every parameter and its AdamW state. Leaving it alive kept
        # the previous fold's model on the GPU while the next one was created.
        del model,optimizer,scheduler,train_ds,train_dl,val_ds,val_dl
        print("del complete")
        gc.collect()
        torch.cuda.empty_cache()
        _log_cuda_memory("after cleanup")


    return [train_loss_list_all,valid_loss_list_all]


def ploting_and_output(total_loss_and_acc_list,save_dir):
    """Save one line chart per metric, with one curve per fold.

    Args:
        total_loss_and_acc_list: Sequence ``[train_histories, valid_histories]``;
            each entry is a list holding one per-epoch value list per fold.
        save_dir: Existing directory that receives ``train_loss.png`` and
            ``valid_loss.png``.
    """
    metrics_name = ["train_loss","valid_loss"]

    for metric,fold_histories in zip(metrics_name,total_loss_and_acc_list):
        fig,ax = plt.subplots()

        ax.set_xlabel("Epoch")
        # This was a second set_xlabel call, which overwrote "Epoch" and left
        # the y axis unlabelled.
        ax.set_ylabel(metric)

        for fold_idx,history in enumerate(fold_histories):
            ax.plot(range(len(history)),history,label="Fold_{}".format(fold_idx))
        ax.legend()
        fig.savefig(os.path.join(save_dir,"{}.png".format(metric)))
        # Release the figure; otherwise every call leaves open figures behind.
        plt.close(fig)

def save_mlflow(mlflow_dir,save_dir,total_loss_and_acc_list,cfg,desc,no):
    """Log the configuration, best per-fold RMSE values and outputs to MLflow.

    Args:
        mlflow_dir: Tracking URI handed to ``mlflow.set_tracking_uri``.
        save_dir: File or directory logged as the run's artifact.
        total_loss_and_acc_list: ``[train_histories, valid_histories]``; each is
            a list holding one per-epoch RMSE list per fold.
        cfg: Run configuration (mapping or DotMap) logged as parameters.
        desc: Free-text description, stored as the run tag "description".
        no: Experiment name.

    Notes:
        The ``fold{i}`` suffix in metric names is the position of the history
        in the list, not necessarily the cross-validation fold number (main()
        may start at a later fold).
    """
    train_histories,val_histories = total_loss_and_acc_list

    per_fold_metrics = {}
    summary_metrics = {}
    for prefix,histories in (("train",train_histories),("val",val_histories)):
        best_values = []
        for i,history in enumerate(histories):
            if len(history) == 0:
                # A fold that completed no epoch has no minimum to report.
                continue
            best = float(np.min(history))
            per_fold_metrics[f"{prefix}_rmse_fold{i}"] = best
            best_values.append(best)
        if best_values:
            summary_metrics[f"{prefix}_rmse_mean"] = float(np.mean(best_values))

    mlflow.set_tracking_uri(mlflow_dir)
    mlflow.set_experiment(no)

    # Log a plain dict, not the DotMap wrapper itself.
    params = cfg.toDict() if hasattr(cfg,"toDict") else dict(cfg)

    with mlflow.start_run():
        if desc:
            # `desc` used to be accepted but never recorded anywhere.
            mlflow.set_tag("description",desc)
        mlflow.log_params(params)
        mlflow.log_metrics(per_fold_metrics)
        mlflow.log_metrics(summary_metrics)
        mlflow.log_artifact(save_dir)


# Entry point: run the cross-validation training, then log the results to MLflow.
# NOTE(review): inside a notebook kernel __name__ is "__main__", so this block runs
# every time the cell is executed.
if __name__ == "__main__":
    total_loss_and_acc_list = main(CFG)
    # Plotting of the per-fold curves is currently disabled.
    #ploting_and_output(total_loss_and_acc_list,save_dir)
    save_mlflow(mlflow_dir,save_dir,total_loss_and_acc_list,CFG,description,no)
    # Disabled copy of the training script into save_dir.
    #shutil.copyfile("train.py",os.path.join(save_dir,train.py))

Start Fold3
UseCol['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']
[]


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


  0%|          | 0/50 [00:00<?, ?it/s]

Start Epcoh0


  0%|          | 0/1115 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

train_loss_is0.646696990929377
train_rmse_is17.4333671068363
valid_rmse_is17.456398672478247
Start Epcoh1


  0%|          | 0/1115 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

train_loss_is0.6319909749912106
train_rmse_is15.519297667174675
valid_rmse_is17.79237594785387
EarlyStopping counter: 1 out of 3
Start Epcoh2


  0%|          | 0/1115 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

train_loss_is0.6140557757873643
train_rmse_is13.00655082452474
valid_rmse_is17.559181161971807
EarlyStopping counter: 2 out of 3
Start Epcoh3


  0%|          | 0/1115 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

train_loss_is0.5959477082020002
train_rmse_is10.119923638730597
valid_rmse_is17.710149352939922
EarlyStopping counter: 3 out of 3
Early_stopping
[{'device': 0, 'address': 139853060636672, 'total_size': 509607936, 'allocated_size': 0, 'active_size': 0, 'segment_type': 'large', 'blocks': [{'size': 509607936, 'state': 'inactive'}]}, {'device': 0, 'address': 139853570244608, 'total_size': 20971520, 'allocated_size': 0, 'active_size': 0, 'segment_type': 'large', 'blocks': [{'size': 20971520, 'state': 'inactive'}]}, {'device': 0, 'address': 139854637694976, 'total_size': 127926272, 'allocated_size': 0, 'active_size': 0, 'segment_type': 'large', 'blocks': [{'size': 127926272, 'state': 'inactive'}]}, {'device': 0, 'address': 139854771912704, 'total_size': 127926272, 'allocated_size': 0, 'active_size': 0, 'segment_type': 'large', 'blocks': [{'size': 127926272, 'state': 'inactive'}]}, {'device': 0, 'address': 139855174565888, 'total_size': 255852544, 'allocated_size': 0, 'active_size': 0, 'segme

  0%|          | 0/50 [00:00<?, ?it/s]

Start Epcoh0


  0%|          | 0/1115 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

RuntimeError: ignored