### 아마존 리뷰 데이터를 사용한 텍스트 요약 모델 만들기
- predict 함수를 구현한 코드

In [1]:
import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

In [2]:
from torch import optim
from transformers import BartForConditionalGeneration

In [97]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Restore the saved model: create the model object first, then load its
# state dict on top of it.
# Models under "first" were trained on sentences WITHOUT text preprocessing;
# models under "second" were trained on preprocessed sentences.
# NOTE(review): depending on the torch version, torch.load may unpickle
# arbitrary objects -- only load checkpoints from a trusted source.
model = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
).to(device)
state_dict = torch.load('../models/second/Latest_Bart_Amazon_Books.pt',
                            map_location=torch.device(device))  # a collections.OrderedDict
model.load_state_dict(state_dict)

<All keys matched successfully>

In [98]:
# Inspect the class of the loaded model object.
type(model)

transformers.models.bart.modeling_bart.BartForConditionalGeneration

In [99]:
from transformers import pipeline

# Put the model in evaluation mode before using it for inference.
model.eval()

# Create the tokenizer from the same base checkpoint as the model.
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
)

# Build the summarization pipeline around the fine-tuned model.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    max_length=50,  # can be lowered (e.g. 24) when the input text is short
    # Run on the device selected when the model was loaded. A hard-coded
    # "cpu" here would move the model back to the CPU and leave an available
    # GPU unused.
    device=device,
)

In [58]:
import re
import contractions

def str_preprocessing(x: str):
    """Normalize a sentence before it is handed to the summarizer.

    Values that are not ``str`` are returned untouched. Strings are
    lower-cased, every parenthesized ``(...)`` and bracketed ``[...]`` span
    is removed (delimiters included), and the result is passed through
    ``contractions.fix`` to handle contractions and slang.
    """
    # Anything that is not a string is handed back as-is.
    if not isinstance(x, str):
        return x

    cleaned = x.lower()

    # Remove "(...)" spans first, then "[...]" spans.
    for enclosed_pattern in (r'\([^)]*\)', r'\[[^\]]*\]'):
        cleaned = re.sub(enclosed_pattern, '', cleaned)

    # Handle contractions and slang.
    return contractions.fix(cleaned)

In [24]:
def predict(input_sentence: str, preprocessing=False):
    """Summarize ``input_sentence`` with the module-level ``summarizer``.

    When ``preprocessing`` is truthy the sentence is first passed through
    ``str_preprocessing``; this is meant for the "second" models. The
    ``summary_text`` of the first pipeline result is returned.
    """
    text = str_preprocessing(input_sentence) if preprocessing else input_sentence
    first_result = summarizer(text)[0]
    return first_result["summary_text"]

In [102]:
# Read a text from the user (the prompt is shown in Korean) and summarize it
# with preprocessing left at its default (False).
predict(input("요약할 내용을 입력하세요 : "))

'it is not that bad - it is just pretty boring'

In [25]:
# Sample review text used to try out the summarizer in the cells below.
predict_input = "I'm very amazed by this phone I had the Galaxy. S20, This phone was a huge advancement over my old phone. I read some of the reviews and was a little bit worried but it is the absolute best phone i've ever had. I really like the styling. I have it in a case which covers it up nicely, and it wasn't extremely expensive, the phone was quite expensive, but oh well. Figure I might not live forever so why not enjoy life a little and have the toys I want."

In [26]:
# Call the pipeline directly; the result is a list of dicts, each holding
# the generated text under the 'summary_text' key.
summarizer(predict_input)

[{'summary_text': "THE best phone I've ever had! : D"}]

In [11]:
# Summarize the sample review through predict(), with preprocessing left at
# its default (False).
# NOTE(review): the checkpoint loaded in this notebook lives under
# models/second, and predict() marks preprocessing=True as the setting for
# "second" models -- confirm whether preprocessing=True was intended here.
predict(predict_input)

In [75]:
# Read another text from the user (Korean prompt) and summarize it with
# preprocessing left at its default (False).
predict(input("요약할 내용을 입력하세요 : "))

"Not the best book I've ever listened to."