In [None]:
!pip install transformers
!pip install sentencepiece
!pip install rouge

# # Mecab download
# ! apt-get install -y openjdk-8-jdk python3-dev
# ! pip install konlpy "tweepy<4.0.0"
# ! /bin/bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 63.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 63.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 89.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [None]:
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from itertools import chain
from tqdm import tqdm

from transformers import AutoTokenizer, PreTrainedTokenizerFast, PreTrainedTokenizer
from transformers import BartTokenizerFast, BartTokenizer, BartForConditionalGeneration
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader,TensorDataset,RandomSampler

# from konlpy.tag import Mecab
from rouge import Rouge

import gc

# Colab only: mount Google Drive at /content/gdrive. The notebook later changes
# into a folder under this mount point and reads/writes relative paths from there.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
%cd /content/gdrive/My Drive/summarization

/content/gdrive/My Drive/summarization


In [None]:
# Fix the random seeds (Python, NumPy, PyTorch CPU and all CUDA devices)
# so that runs are reproducible.
seed_val = 42
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
def data_load(path):
    """Load a news-summarization JSON dump into a two-column DataFrame.

    Reads ``datas/{path}``, keeps only the documents whose ``media_sub_type``
    equals '경제지', and pairs the space-joined article sentences with the
    first entry of ``abstractive``.

    Args:
        path: File name relative to the ``datas/`` directory.

    Returns:
        pandas.DataFrame with the columns ``news`` (article text) and
        ``summary`` (first abstractive summary). Documents whose article text
        or summary is empty are skipped.
    """
    with open(f'datas/{path}', 'rb') as f:
        squad_dict = json.load(f)

    data_dict = {'news': [],
                 'summary': []}

    for documents in squad_dict['documents']:
        if documents['media_sub_type'] != '경제지':
            continue

        # A missing or empty 'abstractive' list is treated as "no summary".
        abstractive = documents.get('abstractive') or ['']
        summary = abstractive[0]

        # 'text' is a list of paragraphs, each a list of {'sentence': ...} dicts.
        # Flattening in plain Python also copes with an empty 'text' list, which
        # made np.concatenate raise in the previous implementation.
        news = ' '.join(sentence['sentence']
                        for paragraph in documents['text']
                        for sentence in paragraph)

        # Both sides must be non-empty. The earlier check combined the two
        # conditions with `or` and compared a numpy array against '', so it
        # neither filtered empty records nor evaluated reliably.
        if not summary.strip() or not news.strip():
            continue

        data_dict['summary'].append(summary)
        data_dict['news'].append(news)

    return pd.DataFrame(data_dict, columns=list(data_dict.keys()))

In [None]:
# Build the training and validation frames from the raw JSON dumps.
train_df = data_load('train_news.json')
valid_df = data_load('valid_news.json')

# Alternative source kept for reference (pre-split TSV files):
# train_df = pd.read_csv('datas/sum_train.tsv', delimiter='\t')
# valid_df = pd.read_csv('datas/sum_test.tsv', delimiter='\t')

# Report how many rows each split contains, one line per split.
print(f'Train Data {len(train_df)}개',
      f'Valid Data {len(valid_df)}개',
      sep='\n')

Train Data 65865개
Valid Data 21418개


In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained tokenizer and seq2seq model from the
# 'gogamza/kobart-base-v1' checkpoint and move the model to `device`.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v1').to(device)

Downloading:   0%|          | 0.00/666k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

In [None]:
def add_padding_data(inputs, target_len=None, pad_value=None):
    """Pad or truncate a token-id sequence to a fixed length.

    Args:
        inputs: Sequence of token ids.
        target_len: Length of the returned list. Defaults to the module-level
            ``max_len`` when omitted.
        pad_value: Value appended to short sequences. Defaults to the
            module-level ``pad_index`` when omitted.

    Returns:
        A list of exactly ``target_len`` Python ints: ``inputs`` cut to
        ``target_len`` and right-padded with ``pad_value``.
    """
    if target_len is None:
        target_len = max_len
    if pad_value is None:
        pad_value = pad_index

    # Plain-list padding keeps the element type stable. The numpy-based
    # version returned numpy scalars when padding (floats for an empty input)
    # but an untouched list slice when truncating.
    clipped = [int(token) for token in inputs[:target_len]]
    return clipped + [int(pad_value)] * (target_len - len(clipped))

def add_ignored_data(inputs, target_len=None, ignore_value=None):
    """Pad or truncate a label sequence, filling with the ignore value.

    Args:
        inputs: Sequence of label token ids.
        target_len: Length of the returned list. Defaults to the module-level
            ``max_len`` when omitted.
        ignore_value: Value appended to short sequences. Defaults to the
            module-level ``ignore_index`` when omitted.

    Returns:
        A list of exactly ``target_len`` Python ints: ``inputs`` cut to
        ``target_len`` and right-padded with ``ignore_value``.
    """
    if target_len is None:
        target_len = max_len
    if ignore_value is None:
        ignore_value = ignore_index

    # Single exit point; the previous version carried a second, unreachable
    # `return` after the first one.
    clipped = [int(token) for token in inputs[:target_len]]
    return clipped + [int(ignore_value)] * (target_len - len(clipped))

In [None]:
# Token ids of the training articles (encoder side).
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# later cells read it.
input = tokenizer(train_df['news'].tolist())['input_ids']

# Token ids of the reference summaries, each closed with the EOS token.
labels = [
    summary_ids + [tokenizer.eos_token_id]
    for summary_ids in tokenizer(train_df['summary'].tolist())['input_ids']
]

In [None]:
pad_index = tokenizer.pad_token_id
# Label positions set to -100 are skipped by the cross-entropy loss.
ignore_index = -100
# Every sequence is padded to the longest article/summary in the training set.
max_len = max(max(len(ids) for ids in input), max(len(ids) for ids in labels))

# Encoder inputs and their mask (1 = real token, 0 = padding). The mask is
# derived with one vectorized comparison instead of a per-row Python round
# trip; values and dtype (int64) are the same as before.
input_ids = torch.tensor([add_padding_data(ids) for ids in input])
attention_mask = input_ids.ne(pad_index).long()

# Decoder inputs are the labels shifted right by one, starting with EOS.
decoder_input_ids = torch.tensor(
    [add_padding_data([tokenizer.eos_token_id] + ids[:-1]) for ids in labels]
)
decoder_attention_mask = decoder_input_ids.ne(pad_index).long()

# Targets: padded with `ignore_index` so padding does not contribute to the loss.
labels = torch.tensor([add_ignored_data(ids) for ids in labels])

In [None]:
# Each loader batch holds batch_size // accumulation samples.
batch_size = 128
accumulation = 64

# Named view of the tensors; insertion order defines the order of the fields
# in every batch tuple yielded by the loader.
data = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'decoder_input_ids': decoder_input_ids,
    'decoder_attention_mask': decoder_attention_mask,
    'labels': labels,
}

dataset = TensorDataset(*data.values())
train_dataloader = DataLoader(
    dataset,
    batch_size=batch_size // accumulation,
    num_workers=2,
    shuffle=True,
)

In [None]:
# Fixed 5000-row sample of the validation frame (random_state pinned).
xx = valid_df.sample(5000, random_state=42)

# Tokenize the sampled articles, padded to a common length, and wrap the ids
# in a loader that yields batches of 32.
inputs = torch.tensor(
    tokenizer(xx['news'].tolist(), padding=True)['input_ids']
)
dataset = TensorDataset(inputs)
valid_dataloader = DataLoader(dataset, batch_size=32)

In [None]:
def rouge_score(predictions, trues):
    """Average the ROUGE-1, ROUGE-2 and ROUGE-L F-scores over all pairs.

    Args:
        predictions: List of generated summaries.
        trues: List of reference summaries, aligned with ``predictions``.

    Returns:
        dict with the keys ``rouge_r1``, ``rouge_r2`` and ``rouge_rl``; each
        value is the summed F-score divided by ``len(predictions)``. All three
        are 0.0 when ``predictions`` is empty.
    """
    n_pairs = len(predictions)
    if n_pairs == 0:
        # Nothing to score; avoid dividing by zero below.
        return {'rouge_r1': 0.0, 'rouge_r2': 0.0, 'rouge_rl': 0.0}

    scorer = Rouge()
    totals = {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0}

    for hypothesis, reference in zip(predictions, trues):
        try:
            score = scorer.get_scores(hypothesis, reference)[0]
        except ValueError:
            # The rouge package raises ValueError when a hypothesis or
            # reference has nothing to score (e.g. an empty generated string).
            # Such a pair contributes 0 instead of aborting the evaluation.
            continue
        for metric in totals:
            totals[metric] += score[metric]['f']

    return {'rouge_r1': totals['rouge-1'] / n_pairs,
            'rouge_r2': totals['rouge-2'] / n_pairs,
            'rouge_rl': totals['rouge-l'] / n_pairs}

In [None]:
path = 'bart_test_2'

gc.collect()
torch.cuda.empty_cache()

epochs = 4

optimizer = AdamW(model.parameters(),
                lr = 1e-4, # learning rate
                eps = 1e-8)

# Total number of scheduler steps. The scheduler is stepped once per optimizer
# update, i.e. once every `accumulation` micro-batches, so the micro-batch
# count has to be divided by `accumulation`. Without the division the linear
# decay is stretched over far more steps than are ever taken.
total_steps = (len(train_dataloader) * epochs) // accumulation

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


avg_loss = []
HM_score = []

# `step` counts micro-batches across all epochs. `running_loss` follows the
# same accumulation window, so it is initialised here and not per epoch;
# otherwise a window spanning an epoch boundary would be reported too low.
step = 0
running_loss = 0.0

# Start from clean gradients in case the cell is re-run after an interruption.
optimizer.zero_grad()

for epoch in range(1, epochs+1):
#  =====================================
#               Training
# =====================================
    model.train()

    train_losses = []

    for batchs in tqdm(train_dataloader):
        batch = tuple(b.to(device) for b in batchs)

        inputs = {
            'src_ids' : batch[0],
            'src_mask' : batch[1],
            'tgt_ids' : batch[2],
            'tgt_mask' : batch[3],
            'tgt_label' : batch[4]
        }

        output = model(input_ids = inputs['src_ids'],
                       attention_mask = inputs['src_mask'],
                       decoder_input_ids = inputs['tgt_ids'],
                       decoder_attention_mask = inputs['tgt_mask'],
                       labels  = inputs['tgt_label'])

        # Scale the loss so the accumulated gradient is an average over the window.
        (output.loss/accumulation).backward()
        running_loss += output.loss.item()

        del inputs
        step += 1
        # Keep accumulating until `accumulation` micro-batches have been seen.
        if step % accumulation:
            continue

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_losses.append(running_loss / accumulation)
        running_loss = 0.0

    # Guard against an epoch that is shorter than one accumulation window.
    train_loss = sum(train_losses) / max(len(train_losses), 1)
    print(f"train loss: {train_loss:.3f}")

    model.save_pretrained(f'models/{path}')
    tokenizer.save_pretrained(f'models/{path}')

#  =====================================
#               Validation
# =====================================
    # Switch to eval mode so dropout is disabled while generating; the next
    # epoch switches back with model.train().
    model.eval()

    preds = []
    for i in tqdm(valid_dataloader):
        with torch.no_grad():
            generated = model.generate(i[0].to(device), num_beams=1,  max_length=200,
                                       do_sample=False, num_return_sequences=1,
                                       bos_token_id=model.config.bos_token_id,
                                       eos_token_id=model.config.eos_token_id)
            prediction = [tokenizer.decode(i,skip_special_tokens=True) for i in generated]

        preds.append(prediction)

    preds = list(chain(*preds))

    rouge_f1 = rouge_score(preds, xx['summary'].tolist())
    display(rouge_f1)

# TEST

In [None]:
# Directory of the saved checkpoint to evaluate.
# NOTE(review): the training cell saves to 'models/bart_test_2' while this
# cell loads 'models/bart_test' - confirm this is the intended checkpoint.
path = 'models/bart_test'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Restore the tokenizer and model from `path` and move the model to `device`.
tokenizer = PreTrainedTokenizerFast.from_pretrained(path)
model = BartForConditionalGeneration.from_pretrained(path).to(device)

In [None]:
def bart_sum_test(data, batch_size=32,max_len=100,beams=1,path=''):
    """Generate summaries for a list of texts with a saved checkpoint.

    Args:
        data: List of input texts.
        batch_size: Number of texts generated per forward pass.
        max_len: ``max_length`` passed to ``model.generate``.
        beams: ``num_beams`` passed to ``model.generate``.
        path: Directory the tokenizer and model are loaded from.

    Returns:
        List of decoded summaries (special tokens stripped), one per input
        text and in input order. An empty ``data`` yields an empty list
        without loading the model.
    """
    if len(data) == 0:
        return []

    tokenizer = PreTrainedTokenizerFast.from_pretrained(path)

    model = BartForConditionalGeneration.from_pretrained(path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.eval()  # make sure dropout is off while generating

    inputs = torch.tensor(tokenizer(data,padding=True).input_ids)

    dataset = TensorDataset(inputs)
    dataloader = DataLoader(dataset,batch_size=batch_size)

    prediction = []
    for batch in tqdm(dataloader):
        with torch.no_grad():
            generated = model.generate(batch[0].to(device), num_beams=beams,
                                       max_length=max_len,
                                       bos_token_id=model.config.bos_token_id,
                                       eos_token_id=model.config.eos_token_id,
                                       length_penalty = 2.0)

        # Decode batch by batch. Each generate() call pads only to the longest
        # sequence of its own batch, so concatenating the raw outputs of
        # different batches along dim 0 fails when their widths differ.
        prediction.extend(tokenizer.decode(sequence, skip_special_tokens=True)
                          for sequence in generated.detach().cpu())

    return prediction

In [None]:
# Load the articles to summarize; the article text is read from the 'news'
# column and converted to a plain Python list.
test_data = pd.read_csv('datas/삼성전자_20220311~20220313.csv')
test_input = test_data['news'].tolist()

In [None]:
# Summarize every loaded article with the saved checkpoint (4 beams) and
# store the result next to the source text. The texts live in `test_input`;
# the previously referenced `input_data` is not defined in this notebook.
prediction = bart_sum_test(test_input,max_len=205,beams=4,batch_size=32,path='models/bart_test')
test_data['summary'] = prediction