In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !cd '/content/drive/Othercomputers/내 컴퓨터/meeting/' && pip install -qq -r requirements.txt

In [None]:
import json
import os
import warnings

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_metric, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, BartConfig

from dacon_submit_api import dacon_submit_api

warnings.filterwarnings(action='ignore')

In [None]:
# --- Experiment configuration ------------------------------------------------
# Checkpoint identifier handed to both `from_pretrained` calls below.
pretrained = 'ainize/kobart-news'

# Optimisation settings.
learning_rate = 1e-4
epochs = 3
# Shared by training, evaluation, `Dataset.map` batching and generation.
batch_size = 8

# Token budgets: source texts are padded/truncated to the encoder length,
# target and generated summaries to the decoder length.
encoder_max_length, decoder_max_length = 500, 50

# --- Pretrained artefacts ----------------------------------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained)
tokenizer = AutoTokenizer.from_pretrained(pretrained)

# ROUGE scorer; at present only the commented-out `compute_metrics` refers to it.
metric = load_metric("rouge")

In [None]:
# DIR = "/content/drive/Othercomputers/내 컴퓨터/meeting/data"
DIR = "./data"
TRAIN_SOURCE = os.path.join(DIR, "train.json")
TEST_SOURCE = os.path.join(DIR, "test.json")

# Decode explicitly as UTF-8. Without `encoding=` Python falls back to the
# platform's locale encoding (e.g. cp949/cp1252 on Windows), which garbles or
# rejects non-ASCII JSON text. `json.load` parses straight from the file object.
with open(TRAIN_SOURCE, encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

with open(TEST_SOURCE, encoding="utf-8") as f:
    TEST_DATA = json.load(f)

def _flatten_meetings(meetings, first_uid, with_summary):
    """Turn nested meeting records into one DataFrame row per agenda.

    Each record in ``meetings`` is read through ``record['title']``,
    ``record['region']`` and ``record['context'][agenda]``, the latter being a
    mapping whose values are joined with single spaces to form the agenda's
    ``context`` text. When ``with_summary`` is true the target text is taken
    from ``record['label'][agenda]['summary']``.

    Args:
        meetings: iterable of meeting records as loaded from the JSON files.
        first_uid: uid given to the first row; later rows count up by one.
        with_summary: whether to add the ``summary`` column (training data).

    Returns:
        DataFrame with columns ``uid``, ``title``, ``region``, ``context``
        (plus ``summary``), indexed by uid like the frames built previously.
    """
    columns = ['uid', 'title', 'region', 'context']
    if with_summary:
        columns.append('summary')

    # Collect plain dicts and build the frame once; enlarging a DataFrame one
    # cell at a time through `.loc` re-allocates on every new row.
    records = []
    uid = first_uid
    for meeting in meetings:
        for agenda, lines in meeting['context'].items():
            record = {
                'uid': uid,
                'title': meeting['title'],
                'region': meeting['region'],
                'context': ' '.join(lines.values()),
            }
            if with_summary:
                record['summary'] = meeting['label'][agenda]['summary']
            records.append(record)
            uid += 1

    # Passing `columns` keeps the expected schema even when there are no rows.
    frame = pd.DataFrame(records, columns=columns)
    frame.index = [record['uid'] for record in records]
    return frame


train = _flatten_meetings(TRAIN_DATA, first_uid=1000, with_summary=True)
test = _flatten_meetings(TEST_DATA, first_uid=2000, with_summary=False)

# Model input text: title, region and agenda context joined by single spaces.
train['total'] = train.title + ' ' + train.region + ' ' + train.context
test['total'] = test.title + ' ' + test.region + ' ' + test.context

# Hold out the last VAL_SIZE training rows for validation (no shuffling).
VAL_SIZE = 200
df_train = train.iloc[:-VAL_SIZE]
df_val = train.iloc[-VAL_SIZE:]

In [None]:
def preprocess_function(batch):
    """Tokenize one batch of examples into fixed-length inputs and labels.

    Reads the ``"total"`` (source text) and ``"summary"`` (target text)
    entries of ``batch``. Sources get the ``"summarize: "`` prefix and are
    padded/truncated to ``encoder_max_length``; targets are padded/truncated
    to ``decoder_max_length``.

    Returns:
        The same ``batch`` object with ``input_ids``, ``attention_mask``,
        ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels`` added.
    """
    task_prefix = "summarize: "
    sources = [task_prefix + text for text in batch["total"]]
    encoded_sources = tokenizer(
        sources,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )

    # Targets are encoded inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            batch["summary"],
            padding="max_length",
            truncation=True,
            max_length=decoder_max_length,
        )

    target_ids = encoded_targets.input_ids
    pad_id = tokenizer.pad_token_id

    batch["input_ids"] = encoded_sources.input_ids
    batch["attention_mask"] = encoded_sources.attention_mask
    batch["decoder_input_ids"] = target_ids
    batch["decoder_attention_mask"] = encoded_targets.attention_mask
    # Labels mirror the target ids, except that padding positions become -100,
    # the label value conventionally skipped when the loss is computed.
    batch["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target_ids
    ]
    return batch

In [None]:
train_data = Dataset.from_pandas(df_train)
val_data = Dataset.from_pandas(df_val)
test_data = Dataset.from_pandas(test)


def _tokenize_split(split):
    """Tokenize a labelled split and expose the model columns as torch tensors.

    Runs ``preprocess_function`` over ``split`` in batches of ``batch_size``,
    drops the raw text/metadata columns, and restricts the output format to
    the five columns produced by ``preprocess_function``.

    Args:
        split: a ``datasets.Dataset`` holding the ``total`` and ``summary``
            columns (plus ``uid``, ``title``, ``region``, ``context``).

    Returns:
        The tokenized dataset, formatted as torch tensors.
    """
    tokenized = split.map(
        preprocess_function,
        batched=True,
        batch_size=batch_size,
        remove_columns=[
            'uid', 'title', 'region',
            'context', 'summary', 'total'],
    )
    tokenized.set_format(
        type="torch",
        columns=[
            "input_ids", "attention_mask", "decoder_input_ids",
            "decoder_attention_mask", "labels"],
    )
    return tokenized


# One shared pipeline for both labelled splits, so they cannot drift apart.
# `test_data` stays untokenized here; it is tokenized at generation time.
train_data = _tokenize_split(train_data)
val_data = _tokenize_split(val_data)

In [None]:
# def compute_metrics(pred):
#     labels_ids = pred['labels']
#     pred_ids = pred['input_ids']

#     # all unnecessary tokens are removed
#     pred_str = tokenizer.decode(pred_ids, skip_special_tokens=True)
#     labels_ids[labels_ids == -100] = tokenizer.pad_token_id
#     label_str = tokenizer.decode(labels_ids, skip_special_tokens=True)

#     rouge_output = metric.compute(predictions=nltk.sent_tokenize(pred_str), references=str(label_str), rouge_types=["rouge2"])["rouge2"].mid

#     return {
#         "rouge2_precision": round(rouge_output.precision, 4),
#         "rouge2_recall": round(rouge_output.recall, 4),
#         "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
#     }

In [None]:
# Batch collator. It receives the model as well as the tokenizer; per the
# Transformers documentation this lets it derive decoder inputs from labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./log',
    # What to run.
    do_train=True,
    do_eval=True,
    num_train_epochs=epochs,
    # Batch sizes (same value for training and evaluation).
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Optimiser schedule; no warmup length is set here, so the library
    # default applies.
    learning_rate=learning_rate,
    lr_scheduler_type="constant_with_warmup",
    # Save, evaluate and log once per epoch, then reload the best checkpoint
    # when training finishes.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
def generate_summary(batch):
    """Generate a summary for every example in ``batch``.

    The ``"total"`` texts get the ``"summarize: "`` prefix, are tokenized to
    ``encoder_max_length`` tokens and passed to ``model.generate`` with
    ``max_length=decoder_max_length``. The decoded strings (special tokens
    skipped) are stored under ``batch["summary"]``.

    Args:
        batch: mapping with a ``"total"`` list of source strings.

    Returns:
        The same ``batch`` object with the ``"summary"`` entry set.
    """
    prefix = "summarize: "
    inputs = [prefix + doc for doc in batch["total"]]
    inputs = tokenizer(inputs,
                        padding="max_length",
                        truncation=True,
                        max_length=encoder_max_length,
                        return_tensors="pt")

    # Follow the model's own device instead of assuming a GPU: a hard-coded
    # "cuda" fails on CPU-only machines and whenever the model lives elsewhere.
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=decoder_max_length,)

    output_str = tokenizer.batch_decode(
                                outputs,
                                skip_special_tokens=True)

    batch["summary"] = output_str

    return batch

In [None]:
# Use the GPU when one is available, but keep the cell runnable on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Evaluation mode, so stochastic layers such as dropout are off while generating.
model.eval()

results = test_data.map(
                    generate_summary,
                    batched=True,
                    batch_size=batch_size)

# `generate_summary` stores the model's output under the "summary" column.
predicted_summaries = results["summary"]

SUBMISSION = os.path.join(DIR, "sample_submission.csv")
sample_submission = pd.read_csv(SUBMISSION)

# Predictions are written by position, so the row counts must agree.
# NOTE(review): this also assumes the sample file lists rows in the same order
# as the test set was flattened; confirm against the competition data.
if len(predicted_summaries) != len(sample_submission):
    raise ValueError(
        "Got %d predictions for %d submission rows"
        % (len(predicted_summaries), len(sample_submission)))

sample_submission['summary'] = predicted_summaries
sample_submission.to_csv('sub1.csv', index=False)

In [None]:
# The submission token is a credential and must not live in the notebook.
# It is read from the DACON_API_TOKEN environment variable instead.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and regenerated.
DACON_TOKEN = os.environ.get("DACON_API_TOKEN")
if not DACON_TOKEN:
    raise RuntimeError(
        "Set the DACON_API_TOKEN environment variable to your DACON "
        "submission token before running this cell.")

# Positional arguments after the file and token are presumably the competition
# id, the team name and a submission memo; confirm against dacon_submit_api.
result = dacon_submit_api.post_submission_file(
    'sub1.csv',
    DACON_TOKEN,
    '235813',
    '영락태왕',
    'submission'
)