# 1. Prepare some packages

## 1.1 Install packages

In [None]:
# Shell out to nvidia-smi to show which GPU (if any) this runtime has attached.
!nvidia-smi

Tue May 16 11:06:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the Hugging Face data/model libraries and the evaluation helpers.
# -qq silences pip's output. NOTE(review): no versions are pinned, so a re-run
# may pull different releases than the ones this notebook was written against.
!pip install datasets -qq
!pip install transformers -qq
!pip install evaluate nltk rouge_score -qq

In [None]:
# Install accelerate from the package index.
# NOTE(review): two later cells install accelerate again (from GitHub, then with
# --upgrade); presumably only one of the three is needed - confirm and remove the rest.
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Install accelerate straight from the GitHub repository's default branch.
# NOTE(review): this is unpinned, so the installed commit changes over time.
!pip install git+https://github.com/huggingface/accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-vsei99q_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-vsei99q_
  Resolved https://github.com/huggingface/accelerate to commit dcde1e93d09abea02a8e7f4a07a2c5734b87b60e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Upgrade accelerate to the newest release available on the package index.
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 1.2 Import libraries

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import evaluate
import nltk
import torch
import pandas as pd 
import numpy as np
import random
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from nltk.tokenize import word_tokenize
import tensorflow as tf
import os
import math

In [None]:
import argparse
from tqdm import tqdm

In [None]:
from transformers import MvpForConditionalGeneration, MvpTokenizerFast

In [None]:
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

# 2. Prepare the data

## 2.1 Read the files and lightly clean the text

In [None]:
# Lightly clean the text: re-attach punctuation and contractions to the preceding word
def cleanpunctuation(s):
    """Re-attach detached punctuation and contractions, and expand newline markers.

    The input text has punctuation marks and contraction suffixes separated
    from the preceding word by a space (e.g. ``"do n't"``, ``"Hello , world"``).
    Each rule below removes that space. Finally every literal ``<newline>``
    marker becomes a real newline character.

    Args:
        s: The raw text to clean.

    Returns:
        The cleaned text as a new string.
    """
    # Ordered (old, new) pairs. The order is significant: a later rule can act
    # on text produced by an earlier one (which is why " 'm" appears twice).
    rewrites = [(' ' + mark, mark) for mark in '!,.:;?']
    rewrites += [
        (" n't", "n't"),
        (" 's", "'s"),
        (" 're", "'re"),
        (" 've", "'ve"),
        (" 'll", "'ll"),
        (" 'am", "'am"),
        (" 'm", "'m"),
        (" ' m", "'m"),
        (" 'm", "'m"),
        (" ' ve", "'ve"),
        (" ' s", "'s"),
        ('<newline>', '\n'),
    ]
    cleaned = s
    for old, new in rewrites:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [None]:
# Open the two raw story files (UTF-8) from the current working directory.
# Note that the "train" handle reads valid.wp_combined.
fTrain = open('valid.wp_combined', encoding='utf8')
fTest = open('test.wp_combined', encoding='utf8')

In [None]:
# Read every line of both files into memory (one list element per line), then
# close the handles so they are not leaked for the lifetime of the kernel.
# Re-running this cell without re-opening the files now fails loudly instead of
# silently producing empty lists from exhausted handles.
try:
    data_train = fTrain.readlines()
    data_test = fTest.readlines()
finally:
    fTrain.close()
    fTest.close()

In [None]:
# Report how many lines were read from each file.
print(f"Train dataset length: {len(data_train)}")
print(f"Test dataset length: {len(data_test)}")

Train dataset length: 15620
Test dataset length: 15138


In [None]:
# Clean every training line with cleanpunctuation.
train_text = [cleanpunctuation(line) for line in data_train]

In [None]:
# Clean every test line with cleanpunctuation.
test_text = [cleanpunctuation(line) for line in data_test]

In [None]:
# Keep only the first 5000 training stories and the first 1000 test stories.
train_text, test_text = train_text[:5000], test_text[:1000]

In [None]:
# Display one cleaned training example (index 9) as a sanity check.
train_text[9]      

"[ WP ] `` On your right, you'll see natives living along the beach, and they are NOT happy with us being in their sacred waters. '' <endprompts> `` On your right, you'll see the natives living along the beach, and they are NOT happy with us being in their sacred waters. '' Marcus muttered into an imaginary microphone as we floated on our shamble of a armored rubber raft, now riddled with darts and primitive arrows. \n \n The emergency escape pod suffered severe damage on the drop down to planet side. The light weight and heat resistant alloys made for makeshift armor for ourselves and our rubber survival raft. Our only hope was to get out of the territory of these natives and get to the Federation outpost downriver. \n \n The ape-like primitives were much akin to gorillas from earth in terms of survival habits. Building `` nests '' in trees, which were more like tree houses that any kid would want, and not being able to swim and just not liking water in general, getting most of their 

In [None]:
# Run configuration exposed as command-line style options. Each entry is
# (flag, value type, default); they are registered in this order.
parser = argparse.ArgumentParser()
_option_specs = (
    ('--seed', int, 88888),
    ("--model_name", str, "gpt2"),
    ("--max_seq_length", int, 512),
    ("--train_batch_size", int, 4),
    ("--valid_batch_size", int, 4),
    ("--num_train_epochs", int, 1),
    ("--warmup", float, 0.1),
    ("--learning_rate", float, 5e-5),
)
for _flag, _value_type, _default in _option_specs:
    parser.add_argument(_flag, type=_value_type, default=_default)
# parser.add_argument("--input_text_path", default='../input/story-text', type=str)
# parse_known_args tolerates the extra arguments the notebook kernel is
# launched with; the unrecognised remainder is discarded.
args, _ = parser.parse_known_args()

## 2.2 Tokenize and load into a dataloader

In [None]:
# Load the pretrained MVP fast tokenizer from the "RUCAIBox/mvp" checkpoint.
tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
# Use the end-of-sequence token for padding as well.
# NOTE(review): this overrides whatever pad token the checkpoint ships with -
# confirm that is intended for this model.
tokenizer.pad_token=tokenizer.eos_token

In [None]:
# Tokenize the stories
def tokenize_stories(stories, max_length=512):
    """Tokenize stories into fixed-length TensorFlow id and mask tensors.

    Every story is encoded with the module-level ``tokenizer``, truncated to
    ``max_length`` tokens and padded up to exactly ``max_length``.

    Args:
        stories: Iterable of story strings.
        max_length: Length every encoded story is truncated/padded to.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of TensorFlow tensors, each
        with one row per story and ``max_length`` columns.
    """
    stories = list(stories)
    if not stories:
        # Nothing to encode: return empty tensors of the right width instead
        # of failing inside the tokenizer.
        empty = tf.zeros((0, max_length), dtype=tf.int32)
        return empty, empty
    # One batched call replaces the per-story loop. `padding='max_length'`
    # is the current spelling of the deprecated `pad_to_max_length=True`, and
    # `truncation=True` makes explicit the truncation that was previously
    # applied implicitly (with a warning).
    encoded = tokenizer(
        stories,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Encode the (already subsetted) training and test stories into id/mask tensors.
train_input_ids, train_attention_masks = tokenize_stories(train_text)
test_input_ids, test_attention_masks = tokenize_stories(test_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Size of an 80/20 train/validation split of the encoded training stories.
train_size = int(0.8 * len(train_input_ids))
# The validation part takes whatever is left, so the two sizes sum to the total.
val_size = len(train_input_ids) - train_size

In [None]:
# Split ids and masks with the same sizes so that rows stay aligned.
# NOTE(review): train_input_ids / train_attention_masks are rebound to the
# smaller training part here, so re-running the size computation or this cell
# afterwards operates on the already-reduced tensors.
train_input_ids, val_input_ids = tf.split(train_input_ids, [train_size, val_size])
train_attention_masks, val_attention_masks = tf.split(train_attention_masks, [train_size, val_size])

In [None]:
# Prepare the training data
train_data = tf.data.Dataset.from_tensor_slices((train_input_ids, train_attention_masks))
train_data = train_data.shuffle(len(train_input_ids)).batch(8, drop_remainder=True)

In [None]:
# Prepare the validation data
val_data = tf.data.Dataset.from_tensor_slices((val_input_ids, val_attention_masks))
val_data = val_data.batch(8, drop_remainder=True)

In [None]:
# Define the training loop
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

In [None]:
#file path
train_path = '/content/valid.wp_combined'
test_path = '/content/test.wp_combined'

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the datasets and collator that the Trainer cell consumes.

    Args:
        train_path: Path of the text file used for training.
        test_path: Path of the text file used for evaluation.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: Block size handed to each ``TextDataset``. Defaults to 128.

    Returns:
        A tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )
    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=block_size,
    )
    # mlm=False: the collator is configured without masked-language-model masking.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator


# The Trainer cell further down reads train_dataset, test_dataset and
# data_collator; they must be defined here or a fresh "Run All" stops with a
# NameError.
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (5893735 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Load the pretrained "RUCAIBox/mvp-story" checkpoint as the model to fine-tune.
# The tokenizer above is loaded from the separate "RUCAIBox/mvp" checkpoint.
model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp-story")

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Training configuration for the Hugging Face Trainer.
# NOTE(review): eval_steps is set but no evaluation strategy is passed - confirm
# that periodic evaluation actually runs with these arguments.
# NOTE(review): the batch size, epoch count and learning-rate settings here are
# independent of the values parsed into `args` earlier in the notebook.
training_args = TrainingArguments(
    output_dir="./mvp_out",  # The output directory
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    eval_steps=400,  # Number of update steps between two evaluations
    save_steps=800,  # After how many steps the model is saved
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
)

# Wire the model, arguments, collator and datasets together. The collator and
# datasets are the names produced by load_dataset(...), not the tf.data
# pipelines built earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Clear GPU memory cache before training
import torch
torch.cuda.empty_cache()

In [None]:
# Clear GPU memory cache before training
# (this repeats the same call made at the end of the previous cell).
import torch
torch.cuda.empty_cache()

# Run the fine-tuning loop configured by `trainer`.
trainer.train()

In [None]:
# Split one test story (index 5) into its writing prompt and the reference
# story that follows the '<endprompts>' marker.
# The split point is taken from the same story that is being split, and the
# whole marker is skipped, so `prompt` is exactly the text before the marker
# and `target` exactly the text after it. If the marker is missing, the whole
# story becomes the prompt and the target is empty.
prompt, _marker, target = test_text[5].partition('<endprompts>')

def generate_story(prompt,target,k=0,p=0.9,output_length=100,temperature=1,num_return_sequences=1,repetition_penalty=1.0):
    """Print a prompt, its reference story, and stories sampled from the model.

    Uses the module-level ``tokenizer`` and ``model``. The model is moved to
    the CPU and put in eval mode before sampling. Nothing is returned; all
    results are printed.

    Args:
        prompt: Text that is encoded and fed to ``model.generate``.
        target: Reference story, printed only for comparison.
        k: Passed to ``generate`` as ``top_k``.
        p: Passed to ``generate`` as ``top_p``.
        output_length: Passed to ``generate`` as ``max_length``.
        temperature: Sampling temperature.
        num_return_sequences: Number of sequences to sample.
        repetition_penalty: Repetition penalty passed to ``generate``.
    """
    sections = (
        ("====prompt====\n", prompt),
        ('====target story is as below===\n', target),
    )
    for banner, body in sections:
        print(banner)
        print(body+"\n")

    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    # The encoded prompt is not moved to any device, so the model is brought
    # to the CPU before sampling.
    model.to('cpu')
    model.eval()

    sampling_options = dict(
        max_length=output_length,
        temperature=temperature,
        top_k=k,
        top_p=p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )
    output_sequences = model.generate(input_ids=encoded_prompt, **sampling_options)

    # Collapse any extra singleton dimensions so iteration yields one
    # sequence of token ids at a time.
    if output_sequences.ndim > 2:
        output_sequences.squeeze_()

    for sequence_number, token_ids in enumerate(output_sequences, start=1):
        print("=== GENERATED SEQUENCE {} ===".format(sequence_number))
        # Special tokens are left in the decoded text.
        print(tokenizer.decode(token_ids.tolist(), clean_up_tokenization_spaces=True))

In [None]:
# Sample one story for the prompt with the default sampling settings and print
# it next to the reference story.
generate_story(prompt,target)

====prompt====

[ TT ] `` Shut the dog up. '' <endprompts> “ Shut the dog 

====target story is as below===

 shouted my head officer from the jeep. The dog was running circles around our vehicle, barking at the people inside. The officer tapped my shoulder and pointed to the yellow, skinny animal circling our jeep. 
 
 “ But sir.., ” I managed to spit out before he took both his hands and pushed me out of the vehicle. I went tumbling out, and landed on the rough sandy ground. I stood up adjusting the gun hanging from my shoulder and proceeded to walk towards the canine. The dog stopped its barking, and shifted its black eyes to me. 
 
 “ Come here little pup. Hey come here, I ’ m not going to hurt ya, ” I said trying to coax it nearer to me. Actually, I didn ’ t know if I was going to hurt the little mutt or not yet. Reaching my hand towards my waist, I pulled off a tiny bit of my rations. I held it out my hand, with the ration laying on my open palm. The dog perked it ’ s ears, and c