In [2]:
import os
os.chdir('/content/drive/MyDrive/apple/DATA')

In [None]:
!pip install timm
!pip install -U scikit-learn
!pip install pytorch-lightning
!pip install torchtext
!pip install adamp
!pip install wandb

In [4]:
import torch
import os
import copy
import random
import timm
import wandb
import cv2
import torchmetrics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import pytorch_lightning as pl
import torchvision.transforms as transforms
import torch.optim.lr_scheduler as lr_scheduler

from adamp import AdamP
from datetime import datetime, timezone, timedelta
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from ipywidgets import interact

from torch.utils.data import DataLoader, Dataset
from torchmetrics.functional import accuracy, f1_score, precision, recall
# from sklearn.metrics import accuracy_score, f1_score

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from pytorch_lightning.loggers import WandbLogger

In [5]:
# 프로젝트 경로
PROJECT_DIR = '/content/drive/MyDrive/apple'
os.chdir(PROJECT_DIR)

# 데이터 경로
DATA_DIR = os.path.join(PROJECT_DIR, 'DATA')
TRAIN_IMG_DIR = os.path.join(DATA_DIR, 'train')
TRAIN_LABEL_DIR = os.path.join(DATA_DIR, 'train.csv')
TEST_IMG_DIR = os.path.join(DATA_DIR, 'test')
TEST_LABEL_DIR = os.path.join(DATA_DIR, 'test.csv')
SAMPLE_DIR = os.path.join(DATA_DIR, 'sample_submission.csv')

In [6]:
# 파일 수 확인 
print(len(os.listdir(TRAIN_IMG_DIR)))
print(len(os.listdir(TEST_IMG_DIR)))

10000
5000


In [7]:
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [8]:
# GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# 학습 파일
traindf = pd.read_csv(TRAIN_LABEL_DIR)

traindf.head()

Unnamed: 0,img_id,a_type
0,yVvIyEOqz8.jpg,HJ
1,HLdFN3ULg2.jpg,HJ
2,bAm1CUZtO8.jpg,HJ
3,n5hMUvCCV6.jpg,HR
4,9RiwdmWGSq.jpg,SG


In [10]:
# 이미지 예시
@interact(index=(0, len(traindf)-1))
def show_images(index=0):
    image_path = os.path.join(TRAIN_IMG_DIR, traindf.iloc[index].img_id)
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10,6))
    plt.title(traindf.iloc[index].a_type)
    plt.imshow(image)

interactive(children=(IntSlider(value=0, description='index', max=9999), Output()), _dom_classes=('widget-inte…

In [11]:
# 80% / 20% 비율로 분할
train,val = train_test_split(traindf, test_size=0.2, shuffle=True)

# train / validation로 나눈 데이터 프레임 저장
train.to_csv(os.path.join(DATA_DIR, 'train_labels.csv'), index=False)
val.to_csv(os.path.join(DATA_DIR, 'val_labels.csv'),index=False)

In [12]:
def make_transform():
    train_transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomRotation(degrees=180),
         
        ]
    )

    return train_transform

In [13]:
class AppleDataset(Dataset):
    def __init__(self, img_folder, label_path, transform=None):
        # self.df = pd.read_csv(label_path, usecols=['img_id','a_type'], dtype={'a_type':str})
        self.df = label_path
        self.label_encoding = {'HJ':0, 'HR':1, 'SG':2, 'AR':3}
        self.img_folder = img_folder
        self.transform = transform
        self.img_ids = list(self.df['img_id'])
        self.labels = list(self.df['a_type'])

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, index):
        image_path = os.path.join(self.img_folder, self.img_ids[index])
        img = Image.open(image_path)

        label = self.labels[index]
        label = self.label_encoding[label]
                
        if self.transform:
            img = self.transform(img)

        return img, label

In [14]:
# Test
train_transform = make_transform()
train_dataset = AppleDataset(img_folder = TRAIN_IMG_DIR, label_path = traindf, transform=train_transform)

In [15]:
@interact(index=(0, 7999))
def show_images(index=0):
    img, label = train_dataset[index]
    plt.figure(figsize=(10,6))
    plt.axis=("off")
    plt.imshow(img.permute(1,2,0))

interactive(children=(IntSlider(value=0, description='index', max=7999), Output()), _dom_classes=('widget-inte…

In [36]:
class ClassificationModel(pl.LightningModule):
    def __init__(self, args=None, optimizer='adam', scheduler='reducelr'):
        super().__init__()
        self.model = timm.create_model('efficientnet_b4', pretrained=True)
        self.model.classifier = nn.Sequential(
            nn.Linear(in_features = 1792, out_features = 625),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(625,256),
            nn.ReLU(),
            nn.Linear(256, 4)
        )
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.classifier.parameters():
            param.requires_grad = True

        self.args = args
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.accuracy = torchmetrics.Accuracy()
    

    def forward(self, x):
        output = self.model(x)
        return output


    def configure_optimizers(self):
        if self.optimizer == 'adam':
            optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        elif self.args.optimizer == 'adamw':
            optimizer = torch.optim.AdamW(self.parameters(), lr=0.001)
        elif self.args.optimizer == 'adamp':
            optimizer = AdamP(self.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=1e-2)

        if self.scheduler == "reducelr":
            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5, mode="max", verbose=True)
            return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val/f1"}

        elif self.args.scheduler == "cosineanneal":
            scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=1e-5,
                                                                 last_epoch=-1, verbose=True)

        return {"optimizer": optimizer, "lr_scheduler": scheduler}


    def training_step(self, train_batch, batch_idx):
        image, label = train_batch

        outputs = self.model(image)
        loss = self.criterion(outputs, label)
        acc = accuracy(outputs, label)
        f1 = f1_score(outputs, label, num_classes=4, average="macro")
        self.log("train/acc", acc, on_epoch=True, on_step=True, prog_bar=True)
        self.log("train/loss", loss, on_epoch=True, on_step=True, prog_bar=True, sync_dist=True)
        self.log("train/f1", f1, on_epoch=True, on_step=True, prog_bar=True, sync_dist=True)

        return {"acc":acc, "loss":loss, "f1-score":f1}


    def validation_step(self, train_batch, batch_idx):
        image, label = train_batch

        outputs = self.model(image)
        loss = self.criterion(outputs, label)
        acc = accuracy(outputs, label)
        f1 = f1_score(outputs, label, num_classes=4, average="macro")
        self.log("val/acc", acc, on_epoch=True, on_step=True, prog_bar=True)
        self.log("val/loss", loss, on_epoch=True, on_step=True, prog_bar=True, sync_dist=True)
        self.log("val/f1", f1, on_epoch=True, on_step=True, prog_bar=True, sync_dist=True)

        return {"acc":acc, "loss":loss, "f1-score":f1}

In [None]:
kf = KFold(n_splits=3)
for idx, (train_index, val_index) in enumerate(kf.split(X=traindf['img_id'])):
    wandb_logger = WandbLogger(project="apple_classification", name=f'eff_b4_drop_fold{idx + 1:02d}', entity="leehm")
    checkpoint_callback = ModelCheckpoint(
            monitor="val/f1",
            dirpath="/content/drive/MyDrive/apple/results",
            filename=f"eff_b4_drop_fold{idx + 1:02d}_" + "{val/loss:.4f}",
            save_top_k=3,
            mode="max",
            # save_weights_only=True
        )

    early_stop_callback = EarlyStopping(monitor="val/loss", min_delta=0.00, patience=50, verbose=True,
                                            mode="min")
    model = ClassificationModel()
    train_transform= make_transform()

    train_ds = AppleDataset(TRAIN_IMG_DIR, traindf.iloc[train_index], train_transform)
    train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=32, num_workers=2, shuffle=True, drop_last=True)

    val_ds = AppleDataset(TRAIN_IMG_DIR, traindf.iloc[val_index], train_transform)
    val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=32, num_workers=2)

    trainer = pl.Trainer(accelerator='gpu',
                      devices=1,
                      precision=32,
                      max_epochs=30,
                      log_every_n_steps=10,
                      logger=wandb_logger,
                      callbacks=[checkpoint_callback, early_stop_callback])
    
    trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

    wandb.finish()

  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | model     | EfficientNet     | 18.8 M
1 | criterion | CrossEntropyLoss | 0     
2 | accuracy  | Accuracy         | 0     
-----------------------------------------------
1.3 M     Trainable params
17.5 M    Non-trainable params
18.8 M    Total params
75.322    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved. New best score: 0.209


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.048 >= min_delta = 0.0. New best score: 0.161


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.001 >= min_delta = 0.0. New best score: 0.160


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.020 >= min_delta = 0.0. New best score: 0.140


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.001 >= min_delta = 0.0. New best score: 0.139


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.011 >= min_delta = 0.0. New best score: 0.129


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.010 >= min_delta = 0.0. New best score: 0.119


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.007 >= min_delta = 0.0. New best score: 0.112


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.001 >= min_delta = 0.0. New best score: 0.111


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val/loss improved by 0.002 >= min_delta = 0.0. New best score: 0.109


In [20]:
class TestDataset(Dataset):
    def __init__(self, img_folder, label_path):
        self.df = pd.read_csv(label_path, usecols=['img_id'], dtype={'a_type':str})
        # self.df = label_path
        self.img_folder = img_folder
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
        ])
        self.img_ids = list(self.df['img_id'])

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, index):
        image_path = os.path.join(self.img_folder, self.img_ids[index])
        img = Image.open(image_path)
        img = self.transform(img)
        img_ids = self.img_ids[index]

        return img, img_ids

In [None]:
model = ClassificationModel()
# model.load_state_dict(torch.load('/content/drive/MyDrive/apple/results/eff_b4_fold01_val/loss=0.0866.ckpt'))
model = model.load_from_checkpoint('/content/drive/MyDrive/apple/results/eff_b4_fold01_val/loss=0.0866.ckpt')
model.to(device)
model.eval()

In [28]:
test_dataset = TestDataset(img_folder = TEST_IMG_DIR, label_path = os.path.join(TEST_LABEL_DIR))

# DataLoader
test_dataloader = DataLoader(dataset = test_dataset,
                              batch_size = 1,
                              num_workers = 2,
                              shuffle = False,
                              pin_memory = True,
                              drop_last = False)

In [29]:
# 예측 진행
y_preds = []
img_ids = []

for batch_index, (x, img_id) in enumerate(tqdm(test_dataloader)):
    x = x.to(device, dtype=torch.float)
    y_logits = model(x).cpu()
    y_pred = torch.argmax(y_logits, dim=1)
    y_pred = y_pred.cpu().tolist()
    img_ids.extend(img_id)
    y_preds.extend(y_pred)


  0%|          | 0/5000 [00:00<?, ?it/s][A
  0%|          | 1/5000 [00:00<15:56,  5.23it/s][A
  0%|          | 4/5000 [00:01<21:33,  3.86it/s][A
  0%|          | 6/5000 [00:02<31:30,  2.64it/s][A
  0%|          | 8/5000 [00:02<32:15,  2.58it/s][A
  0%|          | 10/5000 [00:04<38:55,  2.14it/s][A
  0%|          | 12/5000 [00:04<37:42,  2.20it/s][A
  0%|          | 14/5000 [00:06<39:30,  2.10it/s][A
  0%|          | 16/5000 [00:06<37:28,  2.22it/s][A
  0%|          | 18/5000 [00:07<39:40,  2.09it/s][A
  0%|          | 20/5000 [00:08<38:26,  2.16it/s][A
  0%|          | 22/5000 [00:09<39:48,  2.08it/s][A
  0%|          | 24/5000 [00:10<38:43,  2.14it/s][A
  1%|          | 26/5000 [00:11<40:47,  2.03it/s][A
  1%|          | 28/5000 [00:12<38:13,  2.17it/s][A
  1%|          | 30/5000 [00:13<40:04,  2.07it/s][A
  1%|          | 32/5000 [00:14<39:27,  2.10it/s][A
  1%|          | 34/5000 [00:15<37:58,  2.18it/s][A
  1%|          | 36/5000 [00:16<39:38,  2.09it/s][A
  1%|

In [30]:
pred_df = pd.DataFrame(list(zip(img_ids, y_preds)), columns=['img_id','a_type'])
label_decoding = {0:'HJ', 1:'HR', 2:'SG', 3:'AR'}
pred_df['a_type'] = pred_df['a_type'].replace(label_decoding)

In [31]:
sample_df = pd.read_csv(SAMPLE_DIR)
sorter = list(sample_df['img_id'])
resdf = pred_df.set_index('img_id')
result = resdf.loc[sorter].reset_index()

In [32]:
result

Unnamed: 0,img_id,a_type
0,VXKQzkmgbf.jpg,HJ
1,2JFr2uJU79.jpg,HJ
2,twROnuoPeB.jpg,HR
3,b8mY8Szmhb.jpg,HJ
4,tcPc4JOUkG.jpg,HR
...,...,...
4995,kUAVadtfy1.jpg,HJ
4996,qXGjcfOaj4.jpg,HJ
4997,xLnjiHkkp9.jpg,SG
4998,Hvf9rxz3FM.jpg,HJ


In [33]:
result.to_csv(os.path.join('/content/drive/MyDrive/apple/results','result.csv'),index=False)