### 사전 준비

In [1]:
# Install the pinned Hugging Face transformers release together with torch (torch is left unpinned here).
!pip install transformers==4.29 torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.29
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.29)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.29)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4

In [2]:
!pip install pycocoevalcap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [3]:
!pip install tensorboardX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorboardX
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6


In [4]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import json
import os
from PIL import Image
from torch import nn
import torch
import time

In [5]:
class ImageCaptioningDataset(Dataset):
    """Map-style dataset that converts (image, caption) records into model inputs.

    Args:
        dataset: Indexable, sized collection whose items provide ``item["image"]``
            and ``item["text"]``.
        processor: Callable invoked as ``processor(images=..., text=...,
            padding="max_length", return_tensors="pt")``; it must return a mapping
            of tensors. Each tensor is assumed to carry a leading batch axis of
            size 1 because the processor is called on a single sample.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return a dict of per-sample tensors."""
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], text=item["text"], padding="max_length", return_tensors="pt")

        # Remove only the leading batch axis. A bare ``squeeze()`` would also
        # collapse any other size-1 dimension (e.g. a single-channel image or a
        # length-1 sequence), giving samples of inconsistent rank.
        return {k: v.squeeze(0) for k, v in encoding.items()}

In [6]:
# Mount Google Drive at /content/drive so the Drive paths used in later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 모델 및 processor 준비

In [7]:
from transformers import  AutoProcessor
from transformers import AutoModelForCausalLM
# Model identifier passed to `from_pretrained` for the captioning model.
mode = "microsoft/git-base"
# The processor is restored from a copy stored on the mounted Drive,
# while the model weights are loaded from the `mode` identifier above.
processor = AutoProcessor.from_pretrained('/content/drive/MyDrive/재원/model/caption/git/preprocessor')
model = AutoModelForCausalLM.from_pretrained(mode)

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

### 옵션

In [8]:
# Run configuration shared by the data-preparation and training cells.
option = {
    # Index of the first (shuffled) caption record to use.
    'start': 0,
    # Number of caption/image records to load starting at 'start'.
    'num': 5000,
    # Batch size passed to both DataLoaders.
    'batch_size':4,
}

### 데이터셋 준비 (path는 환경에 맞게 조정하기, 1000개당 3분 정도 걸림)

In [9]:
# Extract the image archive from Drive into /content (-qq keeps unzip quiet).
!unzip -qq '/content/drive/MyDrive/data/imagedata.zip' -d '/content'

In [10]:
labelpath="/content/caption.json"

# Read the caption annotations. The original `encoding='utf-8' or 'cp949'`
# always evaluated to 'utf-8', so the cp949 fallback never ran. Try UTF-8
# first and only fall back to cp949 when the file cannot be decoded as UTF-8.
try:
    with open(labelpath, 'r', encoding='utf-8') as f:
        captions = json.load(f)
except UnicodeDecodeError:
    with open(labelpath, 'r', encoding='cp949') as f:
        captions = json.load(f)

# Shuffle with a fixed seed so the record order (and therefore the later
# train/validation split) is the same on every run.
import random
random.seed(777)
random.shuffle(captions)

In [14]:
def image_list(captions,dir,n,m):
    """Load the images referenced by ``captions[n]`` .. ``captions[n + m - 1]``.

    Args:
        captions: Sequence of records; each record's ``'image'`` entry is the
            file name of the image relative to ``dir``.
        dir: Directory containing the image files.
        n: Index of the first record to load.
        m: Number of consecutive records to load.

    Returns:
        A list of ``m`` PIL images in record order, so element ``k``
        corresponds to ``captions[n + k]``.

    Raises:
        IndexError: If ``n + m`` exceeds ``len(captions)``.
    """
    imagelist=[]
    for i in range(n,n+m):
        path = dir+'/'+captions[i]['image']
        # Image.open is lazy and keeps the file open until the pixels are read.
        # Decode now and let the context manager release the handle, otherwise
        # thousands of images mean thousands of open file descriptors.
        with Image.open(path) as image:
            image.load()
        imagelist.append(image)
    return imagelist

In [12]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [15]:
# Open the images for the configured record window: option['num'] records starting at option['start'].
ImageList = image_list(captions,'/content/imagedata',option['start'],option['num'])

In [16]:
# Pair every caption label with its image. `ImageList` is 0-based (its first
# element belongs to captions[option['start']]), so the absolute caption index
# has to be shifted back by option['start'] when indexing it; using the raw
# index only works when option['start'] == 0.
data = [
    {'text': captions[i]['label'], 'image': ImageList[i - option['start']]}
    for i in range(option['start'], option['start'] + option['num'])
]

In [17]:
# Hold-out split: the first 80% of the records train the model, the rest validate it.
train_dataset, val_dataset = (
    ImageCaptioningDataset(subset, processor)
    for subset in (data[:int(0.8*option['num'])], data[int(0.8*option['num']):])
)
# Order is kept (no shuffling): the evaluation cells pair generated captions
# with ground-truth captions purely by position.
train_dataloader, val_dataloader = (
    DataLoader(split, batch_size=option['batch_size'], shuffle=False)
    for split in (train_dataset, val_dataset)
)

In [18]:
# Batch counts implied by the 80/20 split and the configured batch size
# (truncated to an integer). Not referenced by the other cells visible in this notebook.
num_train = int(0.8*option['num']/option['batch_size'])
num_test = int(0.2*option['num']/option['batch_size'])

### 성능 평가 코드

In [19]:
def gen_captions(captions,filename):
    """Write generated captions to ``filename`` as a JSON results list.

    Every caption becomes ``{'image_id': k, 'caption': text}`` where ``k`` is
    the caption's 1-based position in ``captions``.

    Args:
        captions: Sequence of caption strings.
        filename: Path of the JSON file to (over)write.
    """
    results = [
        {'image_id': position, 'caption': text}
        for position, text in enumerate(captions, start=1)
    ]
    with open(filename, 'w') as out_file:
        json.dump(results, out_file)

In [20]:
def make_gt(captions,filename):
    """Write ground-truth captions to ``filename`` as a JSON annotation file.

    The file holds a dict with two lists: ``'annotations'`` (one
    ``{'image_id', 'caption', 'id'}`` entry per caption) and ``'images'``
    (one ``{'id'}`` entry per caption). All ids are the caption's 1-based
    position in ``captions``.

    Args:
        captions: Sequence of reference caption strings.
        filename: Path of the JSON file to (over)write.
    """
    annotations = []
    images = []
    for image_id, text in enumerate(captions, start=1):
        annotations.append({'image_id': image_id, 'caption': text, 'id': image_id})
        images.append({'id': image_id})

    with open(filename, 'w') as out_file:
        json.dump({'annotations': annotations, 'images': images}, out_file)

In [21]:
import json
import os
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

def coco_caption_eval(annotation_file, results_file):
    """Score a results file against a ground-truth annotation file.

    Args:
        annotation_file: Path to the ground-truth JSON passed to ``COCO``.
        results_file: Path to the generated-caption JSON passed to ``loadRes``.

    Returns:
        The ``COCOEvalCap`` instance after ``evaluate()`` has run; its ``eval``
        mapping holds one score per metric.
    """
    ground_truth = COCO(annotation_file)
    predictions = ground_truth.loadRes(results_file)

    evaluator = COCOEvalCap(ground_truth, predictions)
    evaluator.evaluate()

    # Print each metric with three decimals.
    for metric in evaluator.eval:
        score = evaluator.eval[metric]
        print(f'{metric}: {score:.3f}')

    return evaluator

### finetuning 진행

In [22]:
import torch
# Folders receiving one generated-caption JSON per epoch ('<epoch>.json').
train_cpath = '/content/drive/MyDrive/재원/captions/none/train'
val_cpath = '/content/drive/MyDrive/재원/captions/none/val'
# Ground-truth caption files for the two splits.
train_rpath = '/content/drive/MyDrive/재원/captions/none/train_real.json'
val_rpath = '/content/drive/MyDrive/재원/captions/none/val_real.json'

# Create the output folders up front. Without this, a fresh Drive only fails
# when the first epoch's captions are written, i.e. after a full epoch of training.
os.makedirs(train_cpath, exist_ok=True)
os.makedirs(val_cpath, exist_ok=True)

# Per-epoch history: mean losses and caption-metric results for each split.
train_hist=[]
val_hist = []
train_eval = []
val_eval = []

In [23]:
# Ground-truth captions for the training split (first 80% of the records).
make_gt([record['text'] for record in data[:int(0.8*option['num'])]], train_rpath)
# Ground-truth captions for the validation split (remaining records).
make_gt([record['text'] for record in data[int(0.8*option['num']):]], val_rpath)

In [24]:
# Fine-tune the captioning model. For every epoch this:
#   1. trains on train_dataloader and generates captions for each training batch,
#   2. evaluates loss and generates captions on val_dataloader,
#   3. checkpoints the model when the validation loss is the best so far,
#   4. writes the generated captions to JSON and scores them with coco_caption_eval.
lr = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
E = 10  # number of epochs

for epoch in range(E):
    Loss = 0

    train_caption = []
    for idx, batch in enumerate(train_dataloader):
        # The caption generation at the end of each step switches to eval mode,
        # so training mode has to be re-enabled at the start of every step.
        model.train()
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        # NOTE(review): the dataset pads with padding="max_length" and the padded
        # ids are passed as labels unchanged, so padded positions also contribute
        # to the loss. Consider masking them; left as-is to keep loss values
        # comparable with earlier runs.
        outputs = model(input_ids=input_ids,pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Generate captions for this batch so training-set metrics can be computed.
        with torch.no_grad():
            model.eval()
            train_caption+=processor.batch_decode(model.generate(pixel_values=pixel_values,max_length = 100),skip_special_tokens=True)
        Loss+=loss.item()

    # Mean training loss per batch for this epoch.
    train_hist.append(Loss/len(train_dataloader))

    # Validation pass
    val = 0
    val_caption =[]
    with torch.no_grad():
        model.eval()
        for idx, batch in enumerate(val_dataloader):
            input_ids = batch.pop("input_ids").to(device)
            pixel_values = batch.pop("pixel_values").to(device)
            outputs = model(input_ids=input_ids,pixel_values=pixel_values, labels=input_ids)

            # Generate captions for this batch so validation metrics can be computed.
            val_caption+=processor.batch_decode(model.generate(pixel_values=pixel_values,max_length = 100),skip_special_tokens=True)
            val+=outputs.loss.item()

    # Mean validation loss per batch for this epoch.
    val_hist.append(val/len(val_dataloader))

    # Checkpoint: keep the model whenever this epoch has the lowest validation loss so far.
    # NOTE(review): torch.save(model, ...) pickles the whole module object; saving
    # a state_dict is more portable. Kept because existing loaders may expect
    # this file format.
    if val_hist[-1]==min(val_hist):
        torch.save(model,'/content/drive/MyDrive/재원/model/caption/git/git_none_all.pt')

    # Report the same per-batch mean that was stored in val_hist. The previous
    # hand-derived denominator (0.2*num/batch_size) only matched it when the
    # validation split was an exact multiple of the batch size.
    print("Epoch {}회차 - val_Loss:{}, ".format(epoch+1,val_hist[-1]))

    # Save this epoch's generated captions and compute the caption metrics.
    train_json = train_cpath+'/'+str(epoch+1)+'.json'
    val_json = val_cpath+'/'+str(epoch+1)+'.json'
    gen_captions(train_caption,train_json)
    gen_captions(val_caption,val_json)
    train_eval.append(coco_caption_eval(train_rpath,train_json).eval.items())
    val_eval.append(coco_caption_eval(val_rpath,val_json).eval.items())

Epoch 1회차 - val_Loss:0.6750589841604233, 
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...
{'testlen': 21467, 'reflen': 21653, 'guess': [21467, 19867, 18267, 16667], 'correct': [17048, 11617, 7954, 5693]}
ratio: 0.991409966286381
Bleu_1: 0.787
Bleu_2: 0.676
Bleu_3: 0.582
Bleu_4: 0.508
computing METEOR score...
METEOR: 0.400
computing Rouge score...
ROUGE_L: 0.748
computing CIDEr score...
CIDEr: 3.309
computing SPICE score...
SPICE: 0.508
Bleu_1: 0.787
Bleu_2: 0.676
Bleu_3: 0.582
Bleu_4: 0.508
METEOR: 0.400
ROUGE_L: 0.748
CIDEr: 3.309
SPICE: 0.508
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating in

In [26]:
import json
# Bundle the loss curves and per-epoch metric scores and persist them for plotting.
# Each stored evaluation is an items view of (metric, score) pairs; dict() accepts
# it directly, which avoids the `.mapping` attribute that only exists on
# Python 3.10+ and also works if plain dicts are stored instead.
total_list =[train_hist,[dict(item) for item in train_eval],val_hist,[dict(item) for item in val_eval]]
with open('/content/drive/MyDrive/재원/eval/loss/git_none_plot.json','w') as f:
    json.dump(total_list,f)