In [None]:
# Mount Google Drive at /content/drive. Colab asks for access permission.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the libraries used by this notebook (-q keeps pip output quiet).
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases.
%pip install -q transformers datasets accelerate
%pip install -q peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 KB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import os, gc
from tqdm.auto import tqdm
from datetime import datetime, timezone, timedelta

import torch
from torch.utils.data import DataLoader,Dataset

import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# HuggingFace peft 라이브러리
from peft import get_peft_model, PeftModel, TaskType, LoraConfig

In [None]:
# Load the tokenizer for 'kakaobrain/kogpt' at the pinned float16 revision.
tokenizer = AutoTokenizer.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)oat16/tokenizer.json:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

In [None]:
# Load the training data (train.csv) as a HuggingFace Dataset.
data_path = '/content/drive/MyDrive/GPT_Competition/train.csv'
train_df = pd.read_csv(data_path)
train_set = datasets.Dataset.from_pandas(train_df)
# The intermediate DataFrame is not used again once the Dataset exists.
del train_df

In [None]:
def train_batch_preprocess(batch):
    """Tokenize a batch of (text, summary) pairs into causal-LM training rows.

    Each row is the tokenized prompt, followed by the tokenized summary,
    followed by the tokenizer's EOS token. In `labels` the prompt positions
    are set to -100, while the summary and EOS positions keep their token ids.

    Args:
        batch: mapping with 'text' and 'summary' lists of equal length.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list
        holding one list per example.
    """
    template = "{text}를 한 줄로 요약해주세요"
    prompts = [template.format(text=document) for document in batch['text']]
    prompt_enc = tokenizer(prompts)
    summary_enc = tokenizer(batch['summary'])
    eos_id = tokenizer.eos_token_id

    input_ids, attention_mask, labels = [], [], []
    rows = zip(
        prompt_enc['input_ids'], prompt_enc['attention_mask'],
        summary_enc['input_ids'], summary_enc['attention_mask'],
    )
    for prompt_ids, prompt_mask, summary_ids, summary_mask in rows:
        input_ids.append(prompt_ids + summary_ids + [eos_id])
        # The appended EOS token is a real token, so its mask entry is 1.
        attention_mask.append(prompt_mask + summary_mask + [1])
        # -100 marks the prompt positions; only summary + EOS keep real ids.
        labels.append([-100] * len(prompt_ids) + summary_ids + [eos_id])

    # These keys become the new columns of the mapped dataset.
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [None]:
# Run the preprocessing in batches of 1000 rows.
# id, text and summary are not needed after tokenization, so they are dropped.
train_set = train_set.map(
    train_batch_preprocess,
    remove_columns = ['id', 'text', 'summary'],
    batched = True,
    batch_size = 1000,
)

Map:   0%|          | 0/40400 [00:00<?, ? examples/s]

In [None]:
# Inspect the result: dataset summary, then the first row's length and columns.
# (eos_token_id = 1, ignore_index = -100)
print(train_set)
print(len(train_set[0]['input_ids']))
print(train_set[0]['input_ids'])
print(train_set[0]['attention_mask'])
print(train_set[0]['labels'])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 40400
})
672
[10181, 13817, 465, 657, 746, 1421, 1141, 779, 3292, 6883, 7337, 10160, 60219, 633, 56154, 10716, 413, 12418, 1341, 2125, 530, 3862, 1403, 57039, 1951, 20774, 120, 63997, 327, 4169, 14180, 4800, 387, 642, 776, 779, 3292, 6883, 7337, 10160, 402, 7634, 3025, 413, 1114, 529, 374, 120, 63997, 327, 15248, 1842, 2154, 13574, 409, 5996, 379, 14565, 2383, 376, 930, 408, 2238, 708, 26442, 720, 409, 16615, 24673, 374, 120, 63997, 327, 4869, 4595, 387, 8161, 4795, 56624, 376, 7775, 395, 779, 641, 800, 118, 516, 3035, 16704, 762, 451, 393, 4595, 379, 2348, 387, 5700, 697, 10160, 378, 7634, 53554, 376, 2991, 374, 120, 63997, 327, 22462, 534, 2020, 2460, 696, 1573, 580, 534, 27589, 1997, 728, 511, 605, 387, 1431, 427, 3883, 511, 22794, 1859, 413, 52713, 385, 478, 1300, 2238, 374, 120, 63997, 327, 4579, 376, 3788, 1987, 1092, 19103, 35761, 120, 63997, 327, 23248, 2806, 10043, 4595, 387, 30001, 1575, 378, 191

In [None]:
def left_pad(sequence, value, max_len):
    """Return a new list: `sequence` prefixed with `value` up to `max_len` items.

    A sequence that already has `max_len` or more items comes back with no
    padding added.
    """
    padding = [value for _ in range(max_len - len(sequence))]
    return padding + sequence

def collate_fn(batch, device='cuda'):
    """Left-pad a list of tokenized rows to a common length and build tensors.

    Args:
        batch: list of rows, each holding 'input_ids', 'attention_mask' and
            'labels' as lists of ints of equal length.
        device: device on which the resulting tensors are created.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' as torch.long
        tensors, one row per example, each padded to the longest 'input_ids'
        in the batch.
    """
    length = max(len(row['input_ids']) for row in batch)
    input_ids = [
        left_pad(row['input_ids'], tokenizer.pad_token_id, length)
        for row in batch
    ]
    attention_mask = [
        left_pad(row['attention_mask'], 0, length)
        for row in batch
    ]
    # Pad the precomputed labels, not input_ids. Using input_ids here would
    # throw away the -100 prompt masking built during preprocessing, so the
    # loss would also be computed over the prompt tokens.
    labels = [
        left_pad(row['labels'], -100, length)
        for row in batch
    ]
    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long, device=device),
        'attention_mask': torch.tensor(attention_mask, dtype=torch.long, device=device),
        'labels': torch.tensor(labels, dtype=torch.long, device=device),
    }

In [None]:
# Shuffled training loader with 2 rows per batch; collate_fn pads each batch
# and builds the tensors.
train_loader = DataLoader(
    train_set, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=collate_fn,
)

In [None]:
# Load the KoGPT base model in float16 at the same revision as the tokenizer;
# device placement is delegated to device_map='auto'.
base_model = AutoModelForCausalLM.from_pretrained(
    'kakaobrain/kogpt', revision = 'KoGPT6B-ryan1.5b-float16',
    torch_dtype = torch.float16,
    device_map = 'auto',
)

Downloading (…)-float16/config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/12.3G [00:00<?, ?B/s]

In [None]:
# LoRA configuration for causal-LM fine-tuning: rank 8, alpha 64, dropout 0.15,
# applied to the modules named 'q_proj' and 'v_proj'.
peft_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM,
    r=8, lora_alpha=64, lora_dropout=0.15,
    target_modules = ['q_proj', 'v_proj'],
    # target_modules = r".*(q_proj|v_proj)",
)

In [None]:
# Print the full LoRA configuration, including fields left at their defaults.
print(peft_config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=['q_proj', 'v_proj'], lora_alpha=64, lora_dropout=0.1, merge_weights=False, fan_in_fan_out=False, enable_lora=None, bias='none', modules_to_save=None)


In [None]:
# Wrap the base model with the LoRA adapters described by peft_config.
peft_model = get_peft_model(base_model, peft_config)
peft_model.to('cuda')
# Switch to training mode; as the cell's last expression this also displays
# the module tree.
peft_model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTJForCausalLM(
      (transformer): GPTJModel(
        (wte): Embedding(64512, 4096)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0): GPTJBlock(
            (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): GPTJAttention(
              (attn_dropout): Dropout(p=0.1, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): Dropout(p=0.15, inplace=False)
                (lora_A): Linear(in_features=4096, out_features=8, bias=False)
                (lora_B): Linear(in_features=8, out_features=4096, bias=False)
              )
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
              

In [None]:
# Tabulate every parameter of the PEFT model (dtype, shape, device, trainable
# flag, name) so frozen and trainable weights can be checked at a glance.
param_columns = ['dtype', 'shape', 'device', 'requires_grad', 'name']
param_rows = [
    (weight.dtype, weight.shape, weight.device, weight.requires_grad, weight_name)
    for weight_name, weight in peft_model.named_parameters()
]
pd.DataFrame(param_rows, columns=param_columns)

Unnamed: 0,dtype,shape,device,requires_grad,name
0,torch.float16,"(64512, 4096)",cuda:0,False,base_model.model.transformer.wte.weight
1,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.h.0.ln_1.weight
2,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.h.0.ln_1.bias
3,torch.float16,"(4096, 4096)",cuda:0,False,base_model.model.transformer.h.0.attn.k_proj.w...
4,torch.float16,"(4096, 4096)",cuda:0,False,base_model.model.transformer.h.0.attn.v_proj.w...
...,...,...,...,...,...
392,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.h.27.mlp.fc_out.bias
393,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.ln_f.weight
394,torch.float16,"(4096,)",cuda:0,False,base_model.model.transformer.ln_f.bias
395,torch.float16,"(64512, 4096)",cuda:0,False,base_model.model.lm_head.weight


In [None]:
learning_rate = 5e-5

# Give the optimizer only the parameters that require gradients. The frozen
# base-model weights (requires_grad=False) never receive gradients, so passing
# them in only makes step()/zero_grad() iterate over tensors they always skip.
trainable_params = [param for param in peft_model.parameters() if param.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)
# Gradient scaler used together with autocast in training_step.
scaler = torch.cuda.amp.GradScaler()

In [None]:
def training_step(model, batch, optimizer, scaler):
    """Run one optimization step under autocast and return the detached loss.

    Args:
        model: callable accepting input_ids / attention_mask / labels keyword
            arguments whose output has the loss at index 0.
        batch: mapping with 'input_ids', 'attention_mask' and 'labels'.
        optimizer: optimizer whose gradients are cleared and which is stepped.
        scaler: gradient scaler used to scale the loss and drive the step.

    Returns:
        The step loss, detached from the autograd graph.
    """
    optimizer.zero_grad()

    forward_kwargs = {
        key: batch[key] for key in ('input_ids', 'attention_mask', 'labels')
    }
    with torch.cuda.amp.autocast():
        loss = model(**forward_kwargs)[0]

    # Backpropagate the scaled loss, then let the scaler drive the optimizer
    # step and refresh its scale factor.
    scaled_loss = scaler.scale(loss)
    scaled_loss.backward()
    scaler.step(optimizer)
    scaler.update()
    return loss.detach()

In [None]:
# Run Python garbage collection and release cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

In [None]:
NUM_EPOCHS = 1

peft_model.train()
for epoch in range(NUM_EPOCHS):
    # Running sum of the per-step losses since the last log line. It is kept
    # as a CUDA tensor so that adding each step loss does not call .item().
    tr_loss = torch.tensor(0.0).to('cuda')
    for batch_idx, batch in enumerate(tqdm(train_loader), start=1):
        step_loss = training_step(peft_model, batch, optimizer, scaler)
        tr_loss += step_loss
        if batch_idx % 100 == 0:
            # The printed value is the SUM over the last 100 steps, not the mean.
            print("{}. tr_loss: {}".format(batch_idx, tr_loss.item()))
            tr_loss = torch.tensor(0.0).to('cuda')

  0%|          | 0/20200 [00:00<?, ?it/s]

100. tr_loss: 246.5784149169922
200. tr_loss: 235.14837646484375
300. tr_loss: 227.2115020751953
400. tr_loss: 221.83424377441406
500. tr_loss: 223.70423889160156
600. tr_loss: 220.0492401123047
700. tr_loss: 217.73651123046875
800. tr_loss: 216.53533935546875
900. tr_loss: 219.5530548095703
1000. tr_loss: 217.95742797851562
1100. tr_loss: 215.09219360351562
1200. tr_loss: 213.79066467285156
1300. tr_loss: 213.30377197265625
1400. tr_loss: 216.6600341796875
1500. tr_loss: 214.36566162109375
1600. tr_loss: 213.4978485107422
1700. tr_loss: 215.2415313720703
1800. tr_loss: 214.16368103027344
1900. tr_loss: 213.9835968017578
2000. tr_loss: 210.60934448242188
2100. tr_loss: 210.1337432861328
2200. tr_loss: 215.92344665527344
2300. tr_loss: 215.62071228027344
2400. tr_loss: 212.16001892089844
2500. tr_loss: 215.33401489257812
2600. tr_loss: 213.6705780029297
2700. tr_loss: 215.19898986816406
2800. tr_loss: 212.7417755126953
2900. tr_loss: 213.78785705566406
3000. tr_loss: 210.13058471679688


In [None]:
# Save the PEFT model under a directory named with the current UTC+9 timestamp.
TIME_SERIAL = datetime.now(timezone(timedelta(hours=9))).strftime("%y%m%d-%H%M%S")
PEFT_MODEL_PATH = f'/content/drive/MyDrive/GPT_Competition/exp_{TIME_SERIAL}'
peft_model.save_pretrained(PEFT_MODEL_PATH)
print(PEFT_MODEL_PATH)

/content/drive/MyDrive/GPT_Competition/exp_230326-072534


In [None]:
# Install the libraries needed for inference (-q keeps pip output quiet).
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases.
%pip install -q transformers datasets accelerate
%pip install -q peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.8/212.8 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os, gc
from datetime import datetime, timezone, timedelta
from tqdm.auto import tqdm

In [None]:
# Google Drive Mount
# Mount Google Drive at /content/drive so the adapter and test data are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory of the saved adapter that PeftModel.from_pretrained loads below.
# The submission CSV is later written into this same directory.
PEFT_MODEL_PATH = "/content/drive/MyDrive/GPT_Competition/exp_230326-024730"

In [None]:
# Load the tokenizer for 'kakaobrain/kogpt' at the pinned float16 revision.
tokenizer = AutoTokenizer.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)oat16/tokenizer.json:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

In [None]:
# Load the KoGPT base model in float16 at the same revision as the tokenizer;
# device placement is delegated to device_map='auto'.
base_model = AutoModelForCausalLM.from_pretrained(
    'kakaobrain/kogpt', revision = 'KoGPT6B-ryan1.5b-float16',
    torch_dtype = torch.float16,
    device_map = 'auto',
)

Downloading (…)-float16/config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/12.3G [00:00<?, ?B/s]

In [None]:
# Attach the adapter stored at PEFT_MODEL_PATH to the base model.
model = PeftModel.from_pretrained(model=base_model, model_id=PEFT_MODEL_PATH)

In [None]:
class SummaryTestDataset(Dataset):
    """Dataset that turns each CSV row's 'text' into a tokenized summary prompt.

    Each item is a dict with the 'input_ids' and 'attention_mask' lists that
    the tokenizer returns for the prompt. `collate_fn` left-pads a list of such
    items and converts them to tensors.
    """

    def __init__(self, data_path, tokenizer):
        """Read the CSV at `data_path` and keep the tokenizer for later use."""
        self._data = pd.read_csv(data_path)
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return len(self._data)

    def __getitem__(self, idx):
        """Tokenize the prompt built from the 'text' value of row `idx`."""
        document = self._data.iloc[idx]['text']
        encoded = self.tokenizer("{text}를 한 줄로 요약해주세요".format(text=document))
        return {name: encoded[name] for name in ('input_ids', 'attention_mask')}

    def _left_pad(self, sequence, value, max_len):
        """Prefix `sequence` with `value` until it holds `max_len` items."""
        missing = max_len - len(sequence)
        return [value] * missing + sequence

    def collate_fn(self, batch, device='cuda'):
        """Left-pad the items of `batch` to equal length and build long tensors.

        Args:
            batch: list of items as produced by `__getitem__`.
            device: device on which the tensors are created.

        Returns:
            dict with 'input_ids' (padded with the tokenizer's pad token id)
            and 'attention_mask' (padded with 0) as torch.long tensors.
        """
        width = max(len(row['input_ids']) for row in batch)
        pad_values = {
            'input_ids': self.tokenizer.pad_token_id,
            'attention_mask': 0,
        }
        return {
            key: torch.tensor(
                [self._left_pad(row[key], pad, width) for row in batch],
                dtype=torch.long, device=device,
            )
            for key, pad in pad_values.items()
        }

In [None]:
# Build the test dataset and an unshuffled loader (2 rows per batch), so the
# predictions come out in the same order as the rows of test.csv.
test_path = '/content/drive/MyDrive/GPT_Competition/test.csv'
test_set = SummaryTestDataset(test_path, tokenizer)
test_loader = DataLoader(test_set, batch_size=2, num_workers=0, shuffle=False, collate_fn=test_set.collate_fn)

In [None]:
def predict():
    """Generate a summary for every batch in `test_loader` and return them all.

    Generation runs with sampling disabled and a single beam, producing at most
    100 new tokens per prompt. Only the tokens generated after the (padded)
    prompt are decoded. Each batch's summaries are also printed as they arrive.

    Returns:
        list of decoded summary strings, in loader order.
    """
    generation_kwargs = dict(
        pad_token_id = tokenizer.pad_token_id,
        max_new_tokens = 100,
        do_sample = False,
        num_beams = 1,
        num_beam_groups = 1,
        penalty_alpha = None,
        use_cache = True,
        temperature = 1.0,
    )

    model.eval()
    summaries = []
    for batch in tqdm(test_loader):
        prompt_ids = batch['input_ids']
        with torch.no_grad(), torch.amp.autocast('cuda'):
            output_ids = model.generate(
                input_ids = prompt_ids,
                attention_mask = batch['attention_mask'],
                **generation_kwargs,
            )
        # Every prompt in the batch is padded to the same width, so slicing
        # that width off the front leaves only the newly generated tokens.
        new_tokens = output_ids[:, prompt_ids.size(-1):]
        decoded = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
        summaries.extend(decoded)
        print(*decoded, sep='\n----------\n', end='\n========\n')
    return summaries

# Run inference over the whole test loader and keep the decoded summaries.
preds = predict()

  0%|          | 0/250 [00:00<?, ?it/s]

 서울시여성가족재단은 세계 어린이날을 맞아 보육·교육기관에서 어린이가 겪는 성차별적 말과 행동을 양성평등 관점에서 바꾼 '서울시 성평등 어린이사전'을 발표했다.
----------
 한국전력의 적자가 누적된 상황에서 전 국민에 대한 전기 요금 인하나 유예는 현실적으로 어려운 만큼 선별적으로 전기 요금 납부를 유예해주는 방안이 검토되고 있다.
 창원 경상대병원 노동조합은 병원 의사 A씨와 B씨로부터 다수의 간호사가 직장 내 괴롭힘을 당했다며 노동부 창원지청에 진정서를 제출했다고 7일 밝혔다.
----------
 17개 과채주스 제품의 200'g당 평균 당류 함량은 17.28g으로 1일 영양성분 기준치 100g의 17% 수준에 해당하게 되겠습니다.
 포항지역 수험생들은 정신보건전문가 상담전화를 마련하여 도움을 받을 수 있도록 지원하고 있으며 수능 연기에 따른 정부의 조치현황을 정확하게 안내하고 국민의 고충을 듣고 신속하게 답변할 계획입니다.
----------
 현재 돌파감염은 모든 접종일정을 완료한 사람들 중에서 14일 정도 경과된 다음에 발생하는 상황을 말씀드리며 백신 간의 비교는 바람직하지 않다.
정부는 가명정보 제도 도입 1년의 성과를 바탕으로 현장의 의견을 반영한 규제혁신과 맞춤형 지원을 통해 안전하고 편리한 가명정보 확산을 본격적으로 추진하고자 한다.
----------
 중앙재난안전대책본부는 대구시의 신천지 고발 조치에 대해 강한 불만을 드러냈으며, 김강립 중대본 1총괄조정관은 2일 정례 브리핑에서 신천지 강제수사 주장에 대해 부정적 입장을 밝혔다.
 예방접종 후 이상반응으로 신고된 사망자를 살펴보면 화이자 백신 접종자에서 이상반응 사망신고가 많은 것은 접종대상자가 75세 이상 어르신과 노인시설 입소자분과 같이 고령층인 것으로 분석되고 있다.
----------
 23일 밤부터 부산 전역에 물폭탄이 쏟아지면서 산사태, 옹벽 붕괴, 주택과 지하차도 등이 침수돼 79명이 고립됐다가 구조됐다.
 노회찬 위원은 군병력이 테러진압이라는 이름으로 군사활동이 아

In [None]:
# Re-read the test CSV and attach the predictions as a 'summary' column.
# The loader was not shuffled, so preds line up with the rows by position.
test_df = pd.read_csv(test_path)
test_df['summary'] = preds

In [None]:
# Write a submission file named with the current UTC+9 timestamp.
# It is saved inside the adapter directory and holds only 'id' and 'summary'.
TIME_SERIAL = datetime.now(timezone(timedelta(hours=9))).strftime("%y%m%d-%H%M%S")
SUBMISSION_PATH = os.path.join(PEFT_MODEL_PATH, f"{TIME_SERIAL}.csv")
test_df[['id', 'summary']].to_csv(SUBMISSION_PATH, index=False)
print(SUBMISSION_PATH)

/content/drive/MyDrive/GPT_Competition/exp_230326-024730/230326-080554.csv


In [None]:
# 자동으로 세션을 종료하고 싶을때 사용하세요.
# from google.colab import runtime
# runtime.unassign()