In [1]:
import pandas as pd
import re
import contractions
import numpy as np

In [2]:
### Load the review data from a CSV file into a DataFrame
filename = '../data/Books_ratings_100k.csv'
reviewDF = pd.read_csv(filename)

In [3]:
### Inspect the data: column dtypes and non-null counts (no missing values)
reviewDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review/summary  100000 non-null  object
 1   review/text     100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [4]:
### Inspect the data: display 10 randomly drawn rows
reviewDF.sample(n=10)

Unnamed: 0,review/summary,review/text
88558,Roads Go Ever Ever On,Having first read The Hobbit almost 20 years a...
54559,Use it to pass the exam and explore JSP and se...,This is a very good book for passing the SCWCD...
96697,"A very easy and comfortable, airplane read",This book has been overly overestimated in its...
66489,AWESOME!,Awesome guide! The best as finally been put ou...
27297,The original dystopia,"Read for a philosophy class in college, but I ..."
90024,My Review on the Hobbit,The Hobbit was a story about a little hobbit n...
78644,Missing quotations limit the value,The vocal duets and quartets by Brahms are per...
87194,A classic for many generations,This may have been the 20th time I've read The...
26915,Well written hokum,Earleys' rendering of a Perfect Past has it's ...
59502,text missing which is present in the project g...,I read a number of reviews pointing out that s...


In [17]:
### SAMPLE_FRAC: fraction of the full dataset that is used.
### 0.01 keeps 1% of the rows (1,000 of the 100,000 reviews); the previous comment
### claimed 0.1 / 10,000 rows, which did not match the code.
SAMPLE_FRAC = 0.01
sampled = reviewDF.sample(frac=SAMPLE_FRAC, random_state=42)

### Split the sampled rows 60% / 20% / 20% into train / valid / test.
### The boundaries are computed from the sample itself with integer arithmetic, so
### changing SAMPLE_FRAC keeps the ratios intact (no float truncation surprises).
### Positional slicing is used instead of np.split, which routes DataFrames through
### the deprecated DataFrame.swapaxes.
### NOTE: a later cell defines a function that is also called `train`, which rebinds
### this name; run the dataset-building cell before that definition.
train_end = len(sampled) * 6 // 10
valid_end = len(sampled) * 8 // 10
train = sampled.iloc[:train_end]
valid = sampled.iloc[train_end:valid_end]
test = sampled.iloc[valid_end:]

In [18]:
### Number of rows in each split (train, valid, test)
tuple(len(split) for split in (train, valid, test))

(600, 200, 200)

In [19]:
### Lowercase the text
sentence = 'I LOVED IT'
sentence.lower()

'i loved it'

In [20]:
### Remove any text enclosed in parentheses (the parentheses included)
pattern = r'\([^)]*\)'
sentence = 'my husband (and myself) for'
re.compile(pattern).sub('', sentence)

'my husband  for'

In [21]:
### Remove any text enclosed in square brackets (the brackets included)
pattern = r'\[[^\]]*\]'
sentence = 'my husband [and myself] for'
re.compile(pattern).sub('', sentence)

'my husband  for'

In [22]:
### Number of known mappings (contractions, slang): 344
len(contractions.contractions_dict)

344

In [23]:
### Expand the contractions in a sentence (here "I've" becomes "I have")
contractions.fix("I've known that.")

'I have known that.'

In [24]:
import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

In [25]:
def make_dataset(data, tokenizer, device, max_length=256, max_target_length=None):
    """Tokenize reviews and their summaries into a ``TensorDataset`` on ``device``.

    Args:
        data: Table with a 'review/text' column (model input) and a
            'review/summary' column (target text), e.g. a pandas DataFrame.
        tokenizer: Object that is callable on a list of texts and provides
            ``encode`` (a Hugging Face ``BartTokenizer`` in this notebook).
        device: Device the resulting tensors are moved to.
        max_length: Maximum number of input tokens kept per review. The original
            author's note: transformers 4.37.2 capped the length without this
            argument, whereas 4.40.0 needs it to be passed explicitly.
        max_target_length: Maximum number of label tokens kept per summary.
            ``None`` leaves the limit to the tokenizer.

    Returns:
        ``TensorDataset`` of ``(input_ids, attention_mask, labels)``. Inputs are
        padded to the longest review in ``data``; labels are right-padded to the
        longest summary with -100 (the default ``ignore_index`` of PyTorch's
        cross-entropy loss).
    """
    encoded = tokenizer(
        text=data['review/text'].tolist(),  # tokenizer expects a plain list of strings
        padding="longest",                  # pad every review to the longest one
        truncation=True,                    # cut reviews that exceed max_length
        return_tensors="pt",                # return PyTorch tensors
        max_length=max_length,
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # encode() returns a (1, n) tensor. squeeze(0) removes only the batch axis, so a
    # one-token summary stays 1-D (a bare squeeze() would turn it into a 0-d tensor
    # that pad_sequence cannot handle). Summaries are truncated like the inputs.
    labels = [
        tokenizer.encode(
            target,
            return_tensors="pt",
            truncation=True,
            max_length=max_target_length,
        ).squeeze(0)
        for target in data['review/summary']
    ]
    labels = pad_sequence(labels, batch_first=True, padding_value=-100).to(device)
    return TensorDataset(input_ids, attention_mask, labels)

def get_datalodader(dataset, sampler, batch_size):
    """Return a ``DataLoader`` over ``dataset`` with the given batch size.

    ``sampler`` is a sampler class (or any callable taking the dataset); it is
    instantiated with ``dataset`` and decides the order in which items are drawn.
    """
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        sampler=sampler(dataset),
    )

In [26]:
### 에폭, 배치사이즈, 디바이스, 토크나이저
epochs = 3
batch_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
)

In [27]:
### Check which tokenizer class was loaded
type(tokenizer)

transformers.models.bart.tokenization_bart.BartTokenizer

In [28]:
### Tokenize each split into its own TensorDataset.
### NOTE: `train` must still be the DataFrame split here; a later cell defines a
### function with the same name, so re-running this cell after that one would
### hand the function to make_dataset instead.
train_dataset, valid_dataset, test_dataset = (
    make_dataset(split, tokenizer, device) for split in (train, valid, test)
)

### Training batches are drawn in random order; validation and test keep their order
train_dataloader = get_datalodader(train_dataset, RandomSampler, batch_size)
valid_dataloader = get_datalodader(valid_dataset, SequentialSampler, batch_size)
test_dataloader = get_datalodader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

(tensor([    0,  2747,   534, 31901,   725,   328,   387, 12807,    42,  1040,
           25,    38,   115,    45,  2067,    13,     5,  1569,     7,    28,
          703,     4,    38,  2333,   109,    45,   101,  4133,   156,    31,
         2799,   734, 32259, 11737,  2520,  8302,  1569,    21,    10,  6906,
           13,  4327,     4, 10462,     6,   143, 19721,     6,  3049,    50,
           45,     6,    16,   357,    87,    42,  1040,     4,   243,   362,
          162,    81,   727,  6052,     7,   120,   341,     7,  9786,    18,
        13430,     6,  2982,   267, 26427,     6,     8,   475, 23521,  2496,
            4,    38,   888,    21,  2008,     9,  2185,  2053,    38,   218,
           75,  1166,   615,    31,  2182,  7601,     4,  1308,  1623, 17261,
          162,     8,    26,    60,  8987,   213,   356,    23,   110,  2799,
          700, 13491,    72,    38,   115,    45,  1413,    69,  2410,  2496,
            4,   264,    16,    45,    10, 26669,   868,  2048,

In [29]:
### Shapes of the first training example: input_ids, attention_mask, labels
x, y, z = train_dataset[0]
for tensor in (x, y, z):
    print(tensor.shape)

torch.Size([256])
torch.Size([256])
torch.Size([37])


In [30]:
### Shapes of the first validation example: input_ids, attention_mask, labels
x, y, z = valid_dataset[0]
for tensor in (x, y, z):
    print(tensor.shape)

torch.Size([256])
torch.Size([256])
torch.Size([22])


In [31]:
### Shapes of the first test example: input_ids, attention_mask, labels
x, y, z = test_dataset[0]
for tensor in (x, y, z):
    print(tensor.shape)

torch.Size([256])
torch.Size([256])
torch.Size([24])


In [32]:
from torch import optim
from transformers import BartForConditionalGeneration


### Load the pretrained "facebook/bart-base" checkpoint and move it to the device
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model = model.to(device)
### AdamW over all model parameters
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5, eps=1e-8)

In [33]:
def _print_children(module, depth=0, max_depth=3):
    """Print the names of ``module``'s children as a tree.

    Top-level children are printed bare; deeper levels get a "└" marker preceded by
    one "│  " per additional level. Children at ``max_depth`` are printed but not
    descended into.
    """
    for child_name, child in module.named_children():
        if depth == 0:
            print(child_name)
        else:
            print("│  " * (depth - 1) + "└", child_name)
        if depth < max_depth:
            _print_children(child, depth + 1, max_depth)


### Show the model's sub-module hierarchy, four levels deep
_print_children(model)

model
└ shared
└ encoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
└ decoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
lm_head


In [34]:
import numpy as np
import evaluate
import rouge_score

In [35]:
def calc_rouge(preds, labels):
    """Return the "rouge2" score of the predicted token ids against the labels.

    Relies on two module-level names: ``tokenizer`` (for ``pad_token_id`` and
    ``batch_decode``) and ``rouge_score`` (an object exposing ``compute``).

    Args:
        preds: Array of per-token scores; the argmax over the last axis is taken
            as the predicted token id.
        labels: Array of label ids in which -100 marks positions to ignore.
    """
    predicted_ids = preds.argmax(axis=-1)
    # -100 is not a real token id, so swap it for the pad token before decoding
    label_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)

    scores = rouge_score.compute(
        predictions=tokenizer.batch_decode(predicted_ids, skip_special_tokens=True),
        references=tokenizer.batch_decode(label_ids, skip_special_tokens=True),
    )
    return scores["rouge2"]

In [36]:
def train(model, optimizer, dataloader):
    """Run one training pass over ``dataloader``.

    Each batch is an ``(input_ids, attention_mask, labels)`` triple. For every batch
    the model's loss is back-propagated and the optimizer takes one step; the batch
    loss and the ROUGE score from ``calc_rouge`` are recorded along the way.

    Returns:
        ``(mean loss, mean ROUGE)``, both averaged over the number of batches.
    """
    model.train()
    batch_losses, batch_rouges = [], []

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Score this forward pass's predictions on CPU-side NumPy copies
        batch_rouges.append(
            calc_rouge(outputs.logits.detach().cpu().numpy(), labels.to("cpu").numpy())
        )
        batch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    num_batches = len(dataloader)
    return sum(batch_losses) / num_batches, sum(batch_rouges) / num_batches

In [37]:
def evaluation(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Each batch is an ``(input_ids, attention_mask, labels)`` triple. The model is
    switched to eval mode and left in that mode on return.

    Args:
        model: Callable returning an object with ``loss`` and ``logits`` when given
            ``input_ids``, ``attention_mask`` and ``labels``.
        dataloader: Sized iterable of batches.

    Returns:
        ``(mean loss, mean ROUGE)`` averaged over the number of batches. Both are
        plain Python numbers, matching what ``train`` returns.
    """
    model.eval()
    total_loss, total_rouge = 0.0, 0.0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )

            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            total_rouge += calc_rouge(logits, label_ids)

            # .item() converts the loss to a float; accumulating the tensor itself
            # made this function return a tensor living on the model's device.
            total_loss += outputs.loss.item()

    val_loss = total_loss / len(dataloader)
    val_rouge = total_rouge / len(dataloader)
    return val_loss, val_rouge

In [38]:
from tqdm import tqdm

In [39]:
epoch_logs_path = '../models/progress.txt'
train_list, val_list = [[], []], [[], []]   ## per split: index 0 = loss history, index 1 = ROUGE-2 history

### NOTE: this rebinds the name of the imported `rouge_score` module to the loaded
### metric object; calc_rouge looks up this global and calls its compute().
rouge_score = evaluate.load("rouge", tokenizer=tokenizer)
### Start from +inf so that the first epoch always counts as an improvement
best_loss, best_epoch = float("inf"), 0
for epoch in tqdm(range(epochs)):
    train_metrics = train(model, optimizer, train_dataloader)
    val_loss, val_rouge = evaluation(model, valid_dataloader)
    ### Coerce the loss to a plain float so the history lists and best_loss never
    ### hold tensors, whatever numeric type evaluation() hands back
    val_metrics = (float(val_loss), val_rouge)
    print(f"Epoch {epoch + 1}: Train Loss: {train_metrics[0]:.4f} Val Loss: {val_metrics[0]:.4f} Val Rouge {val_metrics[1]:.4f}")

    for i in range(2):
        train_list[i].append(train_metrics[i])
        val_list[i].append(val_metrics[i])

    if val_metrics[0] < best_loss:
        ### Save the weights whenever the validation loss improves on the best so far
        best_loss = val_metrics[0]
        best_epoch = epoch
        torch.save(model.state_dict(), "../models/Bart_Amazon_Books.pt")

    ### Overwrite the progress file with the latest state.
    ### `epoch` and `best_epoch` are zero-based here, unlike the console message above.
    with open(epoch_logs_path, 'w') as file:
        file.write(f'Epoch {epoch} Completed\nBest Epoch : {best_epoch}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1: Train Loss: 3.8364 Val Loss: 3.2528 Val Rouge 0.0654


 33%|███▎      | 1/3 [25:04<50:09, 1504.64s/it]


KeyboardInterrupt: 

In [None]:
### 테스트 결과 출력
# Evaluate on the test dataloader; evaluation() returns (mean loss, mean ROUGE-2)
test_loss, test_rouge_score = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss:.4f}")
print(f"Test ROUGE-2 Score : {test_rouge_score:.4f}")

In [None]:
from transformers import pipeline


### Summarization pipeline built around the fine-tuned model and its tokenizer.
### device="cpu" asks the pipeline to run on the CPU; max_length=54 is passed
### through as the generation length limit.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    max_length=54,
    device="cpu"
)

### Compare the reference summary with the generated one for the first test rows.
### head(5) copes with a test split that has fewer than five rows.
sample_rows = test[['review/summary', 'review/text']].head(5)
for summarization, review_text in zip(sample_rows['review/summary'], sample_rows['review/text']):
    ### truncation=True cuts reviews that exceed the model's maximum input length;
    ### without it an over-long review makes the pipeline call fail.
    predicted_summarization = summarizer(review_text, truncation=True)[0]["summary_text"]
    print(f"정답 요약문 : {summarization}")
    print(f"모델 요약문 : {predicted_summarization}\n")