In [None]:
# Mount Google Drive at /content/drive (Colab only). The training-arguments
# cell later uses an output directory under "drive/MyDrive/...", which
# presumably resolves into this mount when the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the notebook's dependencies. Nothing is version-pinned, so a re-run
# may resolve different package versions than the run recorded below.
# NOTE(review): transformers and datasets are each requested twice; the
# repeated installs are redundant.
!pip install rouge_score
!pip install transformers
!pip install datasets
!pip install parlai
!pip install transformers datasets wandb
!pip install --upgrade accelerate
# Run ParlAI's display_data on the msc:PersonaSummary task. Presumably this is
# what downloads the MSC files that folder_path points at later — TODO confirm.
!parlai display_data -t msc:PersonaSummary --include-last-session True 
# NOTE(review): numpy is upgraded after parlai is installed; confirm the
# ordering is intentional.
!pip install --upgrade numpy
!pip install names
!pip install py7zr


In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, load_from_disk
import numpy as np
import nltk
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize, which
# compute_metrics calls when formatting text for ROUGE.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# max_length passed to the tokenizer for the dialogue input / summary target.
max_input = 512
max_target = 128
# NOTE(review): batch_size is reassigned to 8 in the training-arguments cell,
# so this value is not the one the trainer ends up using.
batch_size = 3
# Hugging Face checkpoint name used to load both the tokenizer and the model.
model_checkpoints = "lidiya/bart-large-xsum-samsum"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

In [None]:
import names

# Sanity check of the `names` package: print ten random first names.
for _ in range(10):
    print(names.get_first_name())

Stephanie
George
Harold
Sandra
Rose
Joshua
Robert
Debra
Bertha
Gary


In [None]:
from datasets import load_dataset, load_from_disk

In [None]:
# Load the SAMSum dataset. In the cells shown here it is only used for the
# preview that follows (a reference for the dialogue/summary text format).
data = load_dataset("samsum")

In [None]:
# Preview the first two SAMSum training records (id, dialogue, summary).
data['train'][:2]

{'id': ['13818513', '13728867'],
 'dialogue': ["Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
  'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great'],
 'summary': ['Amanda baked cookies and will bring Jerry some tomorrow.',
  'Olivia and Olivier are voting for liberals in this election. ']}

In [None]:
def getSpeakerNames():
  """Return two different random first names as a ``(speaker_1, speaker_2)`` tuple.

  The second name is re-drawn until it differs from the first.
  """
  first = names.get_first_name()
  while True:
    second = names.get_first_name()
    if second != first:
      return first, second

In [None]:
import re
import json
# Root directory of the MSC persona-summary files; createDataset reads
# session_<n>/<mode>.txt (one JSON record per line) beneath it.
# NOTE(review): hard-coded to a python3.10 dist-packages location, so it will
# not resolve under a different Python version or install layout.
folder_path = '/usr/local/lib/python3.10/dist-packages/data/msc/msc/msc_personasummary'

def _personalize_summary(sentence, speaker_name):
    """Rewrite a first-person persona sentence so it names the speaker.

    Only the standalone pronoun "I" is replaced (word-boundary match), so
    words that merely contain a capital "I" (e.g. "Italy") are left intact.
    "I've" becomes "<name>'s"; other "'ve" contractions are not touched.
    """
    # Callables are used as replacements so the name is inserted literally.
    sentence = re.sub(r"\bI've\b", lambda _m: f"{speaker_name}'s", sentence)
    return re.sub(r"\bI\b", lambda _m: speaker_name, sentence)


def createDataset(mode):
    """Build dialogue/summary pairs from the MSC persona-summary files.

    Reads ``{folder_path}/session_<n>/{mode}.txt`` (one JSON record per line)
    for sessions 1-3 when ``mode == 'train'`` and sessions 1-4 otherwise.
    Each record's dialog is rendered as "<name>: <text>" lines joined with
    "\\r\\n", using two random speaker names per record, and the utterances'
    ``agg_persona_list`` entries are rewritten to name the speaker,
    de-duplicated in order, and joined with spaces.

    Args:
        mode: split name used as the file stem (e.g. 'train', 'valid').

    Returns:
        (mode_X, mode_y): parallel lists of dialogue strings and summary strings.

    Raises:
        ValueError: if an utterance id is neither 'bot_0' nor 'bot_1'.
    """
    session_count = 3 if mode == 'train' else 4
    data_paths = [
        f"{folder_path}/session_{session}/{mode}.txt"
        for session in range(1, session_count + 1)
    ]

    mode_X = []
    mode_y = []

    # Open each session file in turn.
    for data_path in data_paths:
        with open(data_path, 'r', encoding='utf-8') as f:
            # Read the file line by line; each non-empty line is one JSON record.
            for line in f:
                if not line.strip():
                    continue
                data = json.loads(line)
                dialogue_lines = []
                agg_persona_list = []
                name1, name2 = getSpeakerNames()
                for utterance in data['dialog']:
                    speaker_id = utterance['id']
                    if speaker_id not in ('bot_0', 'bot_1'):
                        raise ValueError(f"unexpected speaker id {speaker_id!r} in {data_path}")
                    speaker_name = name1 if speaker_id == 'bot_0' else name2

                    # Append this turn to the dialogue and collect its personas.
                    dialogue_lines.append(f"{speaker_name}: {utterance['text']}")
                    agg_persona_list.extend(
                        _personalize_summary(persona, speaker_name)
                        for persona in utterance['agg_persona_list']
                    )

                train_data = "\r\n".join(dialogue_lines).rstrip('\r\n')
                # dict.fromkeys drops repeated personas while keeping first-seen order.
                joined_summaries = " ".join(dict.fromkeys(agg_persona_list))
                mode_X.append(train_data)
                mode_y.append(joined_summaries)
    return mode_X, mode_y

In [None]:
# Build the (dialogue, summary) lists for the training and validation splits.
train_X, train_y = createDataset(mode='train')
valid_X, valid_y = createDataset(mode='valid')

In [None]:
# Inspect the first generated training dialogue.
train_X[0]

"Allison: I need some advice on where to go on vacation, have you been anywhere lately?\r\nSonia: I have been all over the world. I'm military.\r\nAllison: That is good you have alot of travel experience\r\nSonia: Sure do. And a lot of experience blowing things up! Haha. Bora bora is nice.\r\nAllison: I've been working non stop crazy hours and need a break.\r\nSonia: The best breaks are spent with cute cuddly kittens.\r\nAllison: Bora bora sounds nice, you have been there before?\r\nSonia: Nope... Just sounds nice, and repetitive. Bora... Bora. Ha!\r\nAllison: Kittens really? I rather be at the beach.\r\nSonia: Only if the beach was covered in kittens!\r\nAllison: That would be a sight to see.\r\nSonia: Or maybe brownies... I love chocolate.\r\nAllison: I love brownies too but I haven't quite perfected mine yet.\r\nSonia: Well I'm available to taste test!"

In [None]:
# Inspect the summary paired with the first training dialogue.
train_y[0]

"Sonia served or serve in the military. Sonia's traveled the world. Sonia's blown things up. Allison's been working a lot of extra hours. Allison want to break from my non-stop work. Sonia's never been to Bora Bora. Allison like going to the beach. Sonia love chocolate. Allison love brownies."

In [None]:
from datasets import Dataset

# Wrap the parallel lists as datasets with a 'text' (dialogue) column and a
# 'label' (summary) column; preprocess_data reads these two keys.
train_dataset = Dataset.from_dict({'text':train_X, 'label':train_y})
valid_dataset = Dataset.from_dict({'text':valid_X, 'label':valid_y})

In [None]:
# Report how many examples each split contains.
print('Train Dataset Length : ', len(train_X))
print('Valid Dataset Length : ', len(valid_X))

Train Dataset Length :  10285
Valid Dataset Length :  2000


In [None]:
def preprocess_data(sample):
    """Tokenize one example (or a batch) into model inputs with masked labels.

    Args:
        sample: mapping with 'text' (dialogue) and 'label' (summary); each may
            be a single string or a list of strings.

    Returns:
        The tokenizer output for 'text' (padded/truncated to ``max_input``)
        with an added 'labels' entry: the token ids of 'label' padded/truncated
        to ``max_target``, where every padding id is replaced by -100.
    """
    # Tokenize the dialogues.
    model_inputs = tokenizer(sample['text'], max_length=max_input, padding='max_length', truncation=True)
    # Tokenize the summaries as targets; text_target replaces the deprecated
    # as_target_tokenizer() context manager.
    targets = tokenizer(text_target=sample['label'], max_length=max_target, padding='max_length', truncation=True)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # -100 is the label value the loss ignores; without this the model is
        # also trained to predict the padding that fills each target.
        return [-100 if token == pad_id else token for token in ids]

    label_ids = targets['input_ids']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        labels = [_mask_padding(ids) for ids in label_ids]
    else:
        labels = _mask_padding(label_ids)

    model_inputs['labels'] = labels
    # Contains the tokenizer's fields for the input plus 'labels'.
    return model_inputs

In [None]:
# Tokenize both splits. map is called without batched=True, so
# preprocess_data receives one example at a time.
tokenized_train_dataset = train_dataset.map(preprocess_data)
tokenized_valid_dataset = valid_dataset.map(preprocess_data)



  0%|          | 0/10285 [00:00<?, ?ex/s]



  0%|          | 0/2000 [00:00<?, ?ex/s]

In [None]:
# Inspect the first tokenized training example.
tokenized_train_dataset[0]

In [None]:
# Drop the raw string columns so only the fields produced by preprocess_data
# remain for the trainer.
# NOTE(review): this rebinds the same names, so re-running the cell fails
# because the columns are already gone.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text', 'label'])
tokenized_valid_dataset = tokenized_valid_dataset.remove_columns(['text', 'label'])

In [None]:
# NOTE(review): model_checkpoints is re-assigned here to the same value as in
# the configuration cell; the duplicate definition can drift out of sync.
model_checkpoints = "lidiya/bart-large-xsum-samsum"
# Load the pretrained seq2seq model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
# Overrides the batch_size = 3 set in the configuration cell.
batch_size = 8
model_name = "BART_chat"
# Relative path; presumably resolved against /content, where Drive is
# mounted — confirm the working directory before running.
model_dir = f"drive/MyDrive/Colab Notebooks/Metabuddy/Models/{model_name}"
# Training configuration: evaluate, log and save every 100 steps, keep at most
# 3 checkpoints, train for one epoch with fp16, report to Weights & Biases,
# and reload the checkpoint with the best "rouge1" (as returned by
# compute_metrics) when training ends.
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to='wandb',
    push_to_hub=False,
    logging_dir=f"{model_dir}/runs"
)

In [None]:
# Collator handed to the Seq2SeqTrainer to assemble tokenized examples into batches.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version
# still provides it. compute_metrics reads `.mid.fmeasure` from its results.
from datasets import load_metric

# ROUGE metric object used by compute_metrics.
metric = load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
import wandb
# Authenticate with Weights & Biases; in the recorded run this prompted for
# an API key.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Generate a new W&B run id and show it.
# NOTE(review): the name `id` shadows the Python builtin, and the wandb.init
# cell passes a hard-coded id string rather than this variable.
id = wandb.util.generate_id()
print(id)

pqijg2og


In [None]:
# Start the W&B run that the trainer reports to.
# NOTE(review): the run id is hard-coded and no `resume` argument is passed;
# confirm this is the intended way to keep logging to the existing run.
wandb.init(project='Memory Extraction', # project under which the experiment records are kept
           entity='knkim', # user name or team name
           id='pqijg2og',  # unique id assigned to the experiment
           name='bart_chat',    # display name given to the experiment
          )

0,1
eval/gen_len,▁
eval/loss,▁
eval/rouge1,▁
eval/rouge2,▁
eval/rougeL,▁
eval/rougeLsum,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁█

0,1
eval/gen_len,58.5875
eval/loss,0.83659
eval/rouge1,52.047
eval/rouge2,30.2663
eval/rougeL,41.1271
eval/rougeLsum,51.2717
eval/runtime,781.2396
eval/samples_per_second,2.56
eval/steps_per_second,0.32
train/epoch,1.0


In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and the mean generated length for an evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids. Either array may contain -100 as filler.

    Returns:
        dict with each key of the ROUGE result mapped to its mid F-measure
        scaled to 0-100, plus ``gen_len`` (mean count of non-padding tokens
        per prediction); every value is rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a filler value, not a token id; swap it for the pad id so that
    # decoding does not choke on it and it is skipped as a special token.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put each sentence on its own line before scoring (needed for rougeLsum).
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Keep the mid F-measure of every score, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Length is measured after the -100 replacement so filler is not counted.
    prediction_lens = [np.count_nonzero(pred != pad_id)
                       for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer from the model, training arguments, tokenized splits,
# collator, tokenizer and the ROUGE-based compute_metrics callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start a fresh run. Passing resume_from_checkpoint=True
# unconditionally fails on a first run, when no checkpoint has been saved yet.
last_checkpoint = (
    get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1200,0.7083,0.83659,52.047,30.2663,41.1271,51.2717,58.5875


TrainOutput(global_step=1286, training_loss=0.10319082992814747, metrics={'train_runtime': 952.0842, 'train_samples_per_second': 10.803, 'train_steps_per_second': 1.351, 'total_flos': 1.114433541636096e+16, 'train_loss': 0.10319082992814747, 'epoch': 1.0})