In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The training output directory configured later in this notebook lives under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Environment setup: install the libraries used by this notebook.
# NOTE(review): none of the packages are version-pinned, and transformers/datasets
# are installed twice (individually and again together with wandb).
!pip install rouge_score
!pip install transformers
!pip install datasets
!pip install parlai
!pip install transformers datasets wandb
!pip install --upgrade accelerate
# NOTE(review): presumably this call is what places the MSC PersonaSummary files under
# the dist-packages path that `folder_path` points to further down — confirm.
!parlai display_data -t msc:PersonaSummary --include-last-session True
!pip install --upgrade numpy
# `names` supplies the random first names used as speaker labels; py7zr is not
# referenced directly anywhere in the visible cells.
!pip install names
!pip install py7zr


In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, load_from_disk
import numpy as np
import nltk
# nltk.sent_tokenize (used when scoring ROUGE) needs the Punkt sentence models.
# Recent NLTK releases look the models up under 'punkt_tab' instead of the pickled
# 'punkt' resource, so fetch both; downloading an unneeded resource is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Tokenisation limits and the base checkpoint to fine-tune.
max_input = 512   # max_length used when tokenising the dialogue text
max_target = 128  # max_length used when tokenising the summary text
batch_size = 3    # NOTE(review): reassigned to 8 in the training-arguments cell before it is ever read
model_checkpoint = "philschmid/flan-t5-base-samsum"  # used for both the tokenizer and the model

In [None]:
# Tokenizer matching the base checkpoint; shared by preprocessing, the collator and metric decoding.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
import names
# Sanity check: print ten first names produced by the `names` package.
for _ in range(10):
    print(names.get_first_name())

In [None]:
from datasets import load_dataset, load_from_disk

In [None]:
# Load the "samsum" dataset.
# NOTE(review): in the visible cells this object is only previewed; the training data
# itself is built from the MSC files by createDataset.
data = load_dataset("samsum")

In [None]:
# Preview the first two training records.
data['train'][:2]

In [None]:
def getSpeakerNames():
  """Return two distinct first names drawn from the `names` package.

  The first name is drawn once; the second is re-drawn until it differs
  from the first, so the returned pair never contains a duplicate.

  Returns:
    tuple[str, str]: (first_speaker, second_speaker).
  """
  first = names.get_first_name()
  while True:
    second = names.get_first_name()
    if second != first:
      return first, second

In [None]:
import re
import json
# Root directory of the per-session JSON-lines files read by createDataset
# (<folder_path>/session_N/<mode>.txt).
# NOTE(review): hard-coded to a python3.10 dist-packages location — presumably where
# the earlier `parlai display_data` call stored the MSC data; confirm on other runtimes.
folder_path = '/usr/local/lib/python3.10/dist-packages/data/msc/msc/msc_personasummary'

def createDataset(mode):
    """Build (dialogue, persona summary) pairs from the session files under `folder_path`.

    Each line of `<folder_path>/session_N/<mode>.txt` is parsed as a JSON object whose
    'dialog' list holds utterances with the keys 'id' ('bot_0' or 'bot_1'), 'text' and
    'agg_persona_list'. For every record a fresh pair of speaker names is drawn with
    getSpeakerNames(); the dialogue becomes "Name: text" lines joined by "\\r\\n", and
    the persona sentences are rewritten to name the speaker, de-duplicated in
    first-seen order, and joined with single spaces.

    Args:
        mode: File stem to read (e.g. 'train' or 'valid'). 'train' reads sessions 1-3,
            any other value reads sessions 1-4.

    Returns:
        (mode_X, mode_y): two parallel lists of strings, dialogues and summaries.

    Raises:
        ValueError: if an utterance id is neither 'bot_0' nor 'bot_1'.
        OSError: if a session file cannot be opened.
    """
    session_count = 3 if mode == 'train' else 4
    data_paths = [f"{folder_path}/session_{n}/{mode}.txt"
                  for n in range(1, session_count + 1)]

    mode_X = []
    mode_y = []

    for data_path in data_paths:
        # Open the data file and read it one JSON record per line.
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                record = json.loads(line)
                name1, name2 = getSpeakerNames()
                turns = []
                personas = []
                for utterance in record['dialog']:
                    speaker_id = utterance['id']
                    if speaker_id not in ('bot_0', 'bot_1'):
                        raise ValueError(
                            f"unexpected speaker id {speaker_id!r} in {data_path}")
                    speaker_name = name1 if speaker_id == 'bot_0' else name2
                    turns.append(f"{speaker_name}: {utterance['text']}")
                    personas.extend(_toThirdPerson(sentence, speaker_name)
                                    for sentence in utterance['agg_persona_list'])
                # Concatenate the turns; rstrip also drops newline characters that end
                # the final utterance itself, as the original concatenation did.
                mode_X.append("\r\n".join(turns).rstrip('\r\n'))
                # dict.fromkeys removes repeated persona sentences but keeps their order.
                mode_y.append(" ".join(dict.fromkeys(personas)))
    return mode_X, mode_y


def _toThirdPerson(sentence, speaker_name):
    """Replace the standalone pronoun "I" (and "I've") in `sentence` with the speaker's name."""
    # Word boundaries keep capital I's inside other words ("Italy", "Iowa") intact, and
    # only "I've" is turned into "<name>'s" so contractions like "we've" are untouched.
    # Callables are used as replacements so the name is never parsed as a regex template.
    sentence = re.sub(r"\bI've\b", lambda _: f"{speaker_name}'s", sentence)
    return re.sub(r"\bI\b", lambda _: speaker_name, sentence)

In [None]:
# Build the training split (sessions 1-3) and the validation split (sessions 1-4).
# NOTE(review): speaker names come from the `names` package, presumably at random and
# unseeded, so re-running this cell likely yields different text — confirm.
train_X, train_y = createDataset(mode='train')
valid_X, valid_y = createDataset(mode='valid')

In [None]:
# Inspect the first training dialogue.
train_X[0]

In [None]:
# Inspect the persona summary paired with the first training dialogue.
train_y[0]

In [None]:
from datasets import Dataset

# Wrap the string lists in `datasets.Dataset` objects: 'text' is the dialogue,
# 'label' is the target summary (the column names preprocess_data reads).
train_dataset = Dataset.from_dict({'text':train_X, 'label':train_y})
valid_dataset = Dataset.from_dict({'text':valid_X, 'label':valid_y})

In [None]:
# Report how many dialogue/summary pairs each split contains.
print('Train Dataset Length : ', len(train_X))
print('Valid Dataset Length : ', len(valid_X))

In [None]:
prefix = 'summarize: '
def preprocess_data(sample):
    """Tokenise one dialogue/summary pair for seq2seq training.

    Args:
        sample: mapping with a 'text' entry (the dialogue) and a 'label' entry
            (the reference summary), both strings.

    Returns:
        The tokenizer output for `prefix + sample['text']` (padded/truncated to
        `max_input`) with an added 'labels' entry: the summary token ids
        padded/truncated to `max_target`, where padding positions hold -100.
    """
    # Tokenise the dialogue.
    model_inputs = tokenizer(prefix + sample['text'], max_length=max_input,
                             padding='max_length', truncation=True)
    # Tokenise the summary; `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    targets = tokenizer(text_target=sample['label'], max_length=max_target,
                        padding='max_length', truncation=True)

    # Because the labels are padded here rather than by the data collator, the pad
    # ids must be swapped for -100 (the index the loss ignores); otherwise the model
    # is also trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        token_id if token_id != pad_id else -100
        for token_id in targets['input_ids']
    ]
    return model_inputs

In [None]:
# Apply preprocess_data to every example (one example per call, since batched is not set).
tokenized_train_dataset = train_dataset.map(preprocess_data)
tokenized_valid_dataset = valid_dataset.map(preprocess_data)

In [None]:
# Inspect one tokenised training example.
tokenized_train_dataset[2]

In [None]:
# Drop the raw string columns so only the tokenizer outputs and 'labels' remain.
# NOTE(review): the names are rebound in place, so running this cell a second time
# presumably fails because the columns no longer exist — confirm.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text', 'label'])
tokenized_valid_dataset = tokenized_valid_dataset.remove_columns(['text', 'label'])

In [None]:
# Load the seq2seq model weights from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Training configuration.
# This overrides the batch_size = 3 assigned in the earlier configuration cell.
batch_size = 8
model_name = "Flan-T5-chat-summary"
# Relative path: it only resolves to the mounted Drive when the working directory
# is the one containing `drive/` (presumably /content on Colab — confirm).
model_dir = f"drive/MyDrive/Colab Notebooks/Metabuddy/Models/{model_name}"
args = Seq2SeqTrainingArguments(
    model_dir,
    # Evaluate, log and checkpoint on the same 100-step cadence.
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    # Evaluation passes generated token ids to compute_metrics.
    predict_with_generate=True,
    fp16=False,
    # "rouge1" is one of the keys returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to='wandb',
    push_to_hub=False,
    logging_dir=f"{model_dir}/runs"
)

In [None]:
# Label value used for padding positions; compute_metrics maps the same value
# back to the pad token before decoding.
label_pad_token_id = -100

# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
    )

In [None]:
from datasets import load_metric

# ROUGE scorer used by compute_metrics (which reads `.mid.fmeasure` from each result).
# NOTE(review): `datasets.load_metric` is presumably unavailable in recent `datasets`
# releases; with the unpinned install above this may need migrating — confirm.
metric = load_metric("rouge")

In [None]:
import wandb
# Authenticate this session with Weights & Biases before a run is created.
wandb.login()

In [None]:
# Generate and print a fresh W&B run id.
# NOTE(review): this assignment shadows the builtin `id`, and the value is not passed
# to wandb.init below, which uses a hard-coded id instead.
id = wandb.util.generate_id()
print(id)

In [None]:
# Start the W&B run that the Trainer reports to (report_to='wandb').
# NOTE(review): the id is hard-coded and no `resume` argument is given — confirm how
# W&B treats an already-existing run id. The run name says "plan-t5" while the model
# directory says "Flan-T5"; possibly a typo.
wandb.init(project='Memory Extraction', # name of the project that groups the experiment logs
           entity='knkim', # user name or team name
           id='mzm2f6b6',  # unique id assigned to the run
           name='plan-t5-chat-summary',    # display name given to the run
          )

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) and mean generated length.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays. `predictions`
            may also be a tuple whose first element is the token-id array. Either
            array may contain -100 at padded positions.

    Returns:
        dict mapping each key returned by `metric.compute` to its mid F-measure
        times 100, plus "gen_len" (mean number of non-pad prediction tokens);
        all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the generated ids.
        predictions = predictions[0]

    # -100 is not a valid token id: map it to the pad token before decoding.
    # This is done for predictions too, because generated sequences may be padded
    # with -100 when batches are gathered, which breaks batch_decode.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put each sentence on its own line before scoring.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Count real tokens only (the -100 fill was already mapped to the pad id).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer from the model, training arguments, tokenised splits,
# collator and ROUGE metric function defined in the cells above.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start a fresh run. Passing resume_from_checkpoint=True unconditionally
# makes the Trainer raise when the directory holds no checkpoint (e.g. the first run).
import os

output_dir = trainer.args.output_dir
last_checkpoint = (
    transformers.trainer_utils.get_last_checkpoint(output_dir)
    if os.path.isdir(output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)