https://ai-forever.github.io/ru-prompts/pretrained/


In [None]:
# Install the dependencies used by this notebook. transformers is pinned to 4.6.0.
# NOTE(review): ruprompts is not pinned — consider fixing a version for reproducibility.
!pip install transformers==4.6.0
!pip install ruprompts

In [None]:
import os
# Restrict CUDA to device 0. This is set before torch is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
from ruprompts import Prompt, PromptFormat, TensorPromptProvider
from transformers import GPT2LMHeadModel, AutoTokenizer
from transformers import pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import set_seed

import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
# Load the pretrained causal LM and its tokenizer from the same checkpoint id.
model_id = "sberbank-ai/rugpt3large_based_on_gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id)
# pad_token and eos_token are both set to the same "<pad>" string, so later cells
# use tokenizer.eos_token ("<pad>") as the end-of-answer marker.
tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<pad>", eos_token="<pad>")

Downloading:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Seed the RNGs before the prompt provider is created.
set_seed(1)
# Template with {passage} and {question} placeholders filled per example.
# NOTE(review): the <P>...</P> spans are presumably trainable prompt tokens initialised
# from the enclosed text, and {answer} sits inside such a span — confirm against the
# ruprompts PromptFormat documentation that this is the intended layout.
prompt_format = PromptFormat("<P>Passage:</P>{passage}<P>\\nQuestion:</P>{question}<P>\\nAnswer:{answer}</P>") 
prompt_provider = TensorPromptProvider()
prompt = Prompt(
    format=prompt_format,
    provider=prompt_provider,
)

# Attach the prompt to the model and tokenizer; later cells call `prompt(...)` to
# format examples and optimise `prompt_provider.parameters()`.
prompt.patch(model, tokenizer)

In [None]:
class PQADataset(Dataset):
  """Passage/question/answer rows rendered through the prompt for causal-LM training.

  Every item is the formatted prompt, followed by the answer (left out when
  ``is_test`` is true) and the tokenizer's eos token. The text is tokenized,
  shortened inside the passage when it exceeds ``max_len`` and right-padded to
  exactly ``max_len`` tokens.

  The class relies on the module-level ``prompt`` and ``tokenizer`` objects.

  Args:
    data: table whose rows are reachable through ``data.iloc[index]`` and
      indexable by column name (a pandas DataFrame in this notebook).
    prompt_fields: names of the passage column and the question column, in
      that order.
    target_field: name of the answer column (only read when ``is_test`` is false).
    truncation_field: key in the ranges returned by ``prompt`` naming the
      field that may be shortened.
    max_len: length of every returned tensor.
    is_test: when true, no answer text is appended to the input.
  """

  def __init__(self,
               data,
               prompt_fields=('text', 'question_text'),
               target_field='answer_text',
               truncation_field='passage',
               max_len=1027,
               is_test=False):
    super().__init__()
    self.data = data
    self.is_test = is_test
    self.max_len = max_len
    # An immutable default avoids sharing one mutable list between instances.
    self.prompt_fields = prompt_fields
    self.target_field = target_field
    self.truncation_field = truncation_field

  def __len__(self):
    """Number of rows in the underlying table."""
    return len(self.data)

  def truncate(self, tokenized, ranges):
    """Drop tokens from the end of the truncation field so the input fits ``max_len``.

    ``tokenized['input_ids']`` and ``tokenized['attention_mask']`` are replaced
    in place by the shortened lists.

    Args:
      tokenized: tokenizer output supporting item access and ``char_to_token``.
      ranges: mapping from field name to an object with ``start``/``stop``
        character offsets into the tokenized text.

    Returns:
      The number of tokens actually removed. This is smaller than the overflow
      when the field holds fewer tokens than the overflow, in which case the
      result is still longer than ``max_len``.
    """
    total_tokens = len(tokenized['input_ids'])
    if total_tokens <= self.max_len:
      return 0

    field_range = ranges[self.truncation_field]
    field_start = tokenized.char_to_token(field_range.start)
    field_end = tokenized.char_to_token(field_range.stop)

    exceeding_tokens = total_tokens - self.max_len

    # Never cut before the first token of the field, even if that leaves overflow.
    cut_start = max(field_end - exceeding_tokens, field_start)
    cut_end = field_end

    tokenized["input_ids"] = tokenized["input_ids"][:cut_start] + tokenized["input_ids"][cut_end:]
    tokenized["attention_mask"] = (
        tokenized["attention_mask"][:cut_start] + tokenized["attention_mask"][cut_end:]
    )

    # Report what was really removed so callers can shift token offsets correctly.
    return cut_end - cut_start

  def __getitem__(self, index):
    """Build the padded ``input_ids`` / ``attention_mask`` / ``labels`` tensors for one row.

    Labels are -100 everywhere except on the tokens from the first token after
    the formatted prompt up to and including the closing eos token.

    Raises:
      ValueError: if the example is still longer than ``max_len`` after the
        truncation field has been shortened as far as possible.
    """
    row = self.data.iloc[index]

    formatted_text, ranges = prompt(passage=row[self.prompt_fields[0]],
                                    question=row[self.prompt_fields[1]],
                                    return_ranges=True)

    if self.is_test:
      input_text = formatted_text + tokenizer.eos_token
    else:
      # The answer column is only needed (and only read) for training examples.
      input_text = formatted_text + row[self.target_field] + tokenizer.eos_token

    tokenized = tokenizer(input_text)

    removed_tokens = self.truncate(tokenized, ranges)
    token_ids = list(tokenized['input_ids'])
    attention = list(tokenized['attention_mask'])

    if len(token_ids) > self.max_len:
      raise ValueError(
          f"example {index} still has {len(token_ids)} tokens after truncating "
          f"'{self.truncation_field}', which exceeds max_len={self.max_len}")

    # char_to_token is assumed to index the untruncated encoding, so the offset
    # is shifted left by the number of tokens removed before it.
    target_seq_start = tokenized.char_to_token(len(formatted_text)) - removed_tokens

    # The labelled span ends on the eos token appended above; fall back to the
    # last token instead of running off the end if it cannot be found.
    try:
      eos_position = token_ids.index(tokenizer.eos_token_id, target_seq_start)
    except ValueError:
      eos_position = len(token_ids) - 1
    target_seq_stop = eos_position + 1

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
      pad_id = 0
    pad_len = self.max_len - len(token_ids)

    labels = [-100] * self.max_len
    labels[target_seq_start:target_seq_stop] = token_ids[target_seq_start:target_seq_stop]

    return {'input_ids': torch.as_tensor(token_ids + [pad_id] * pad_len),
            'attention_mask': torch.as_tensor(attention + [0] * pad_len),
            'labels': torch.as_tensor(labels)}



In [None]:
# Load the passage/question/answer table (columns text, question_text, answer_text).
# NOTE(review): absolute path under /content/drive — presumably a mounted Google Drive
# in Colab; the cell fails if that mount is missing.
triplets = pd.read_csv('/content/drive/MyDrive/context_QA.csv')
# triplets

In [None]:
# Hold out 20% of the rows for testing, then 25% of the remainder for validation,
# i.e. roughly a 60/20/20 train/val/test split; fixed random_state keeps it repeatable.
train, test = train_test_split(triplets, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.25, random_state=42)
# Show the three split sizes as the cell output.
len(train), len(val), len(test)

(1170, 391, 391)

In [None]:
# Wrap each split in a PQADataset capped at 512 tokens; only the test split is
# built in test mode, where no answer text is appended to the input.
train_dataset = PQADataset(train, max_len=512)
val_dataset = PQADataset(val, max_len=512)
test_dataset = PQADataset(test, is_test=True, max_len=512)

# Training and validation batches hold two examples and are reshuffled on every pass.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=2, shuffle=True)

# Test examples are served one at a time, in dataset order, without dropping any.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)

In [None]:
def compute_metrics(eval_preds):
  """Placeholder metric hook: unpacks the prediction/label pair and computes nothing.

  Args:
    eval_preds: an iterable of exactly two items, (predictions, labels).

  Returns:
    None — no metric is implemented yet.
  """
  # Unpacking fails if eval_preds does not contain exactly two items.
  preds, labels = eval_preds
  return None

In [None]:
# sample = next(iter(train_loader))
# tokenizer.decode(sample['input_ids'][0])
# triplets['combine'] = triplets.apply(lambda row: len(tokenizer.encode(prompt(passage=row['text'], question=row['question_text'])+row['answer_text']+ '<pad>')), axis=1)
# triplets['combine']

# # triplets['combine'] = triplets.apply(lambda row: '<|P|>'*4+row['text']+'<|P|>'*3+row['question_text']+'<|P|>'*2+row['answer_text'], axis=1) # 
# # triplets

In [None]:
from transformers import TrainingArguments

# Step-based schedule: log every 10 steps, evaluate and checkpoint every 100 steps,
# stop after 1000 steps, keep at most two checkpoints on disk.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/logs_prompts/second_attempt/",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    save_total_limit=2,
    # NOTE(review): load_best_model_at_end is not set, so this presumably has no effect — confirm.
    metric_for_best_model="eval_loss",
    learning_rate=0.01,  # also read back when the AdamW optimizer is built by hand below
    max_steps=1000,
    report_to="tensorboard",
    # report_to=["tensorboard", "wandb"],  # uncomment to log to WandB
    logging_dir="/content/drive/MyDrive/logs_prompts/second_attempt/logs/",
    seed=1,
)

In [None]:
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

# Only the prompt provider's parameters are handed to the optimizer; the learning
# rate is taken from training_args so both stay in sync.
optimizer = AdamW(prompt_provider.parameters(), lr=training_args.learning_rate)
# Linear warmup over the first 150 steps, then linear decay until max_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=150,
    num_training_steps=training_args.max_steps,
)

In [None]:
from transformers import Trainer, EarlyStoppingCallback
from ruprompts.callbacks import (
    FreezeTransformerUnfreezePrompt,
    ReduceCheckpoint,
    SavePretrainedPrompt,
)

# Train with the hand-built optimizer/scheduler pair instead of the Trainer defaults.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Reuse the DataLoader's collate function (no custom collate_fn was passed to it).
    data_collator=train_loader.collate_fn,
   # place_model_on_device=True,
    optimizers=(optimizer, scheduler),
    # ruprompts callbacks — presumably: freeze the LM and train only the prompt,
    # slim down saved checkpoints, and save the prompt itself alongside them.
    # TODO confirm against the ruprompts.callbacks documentation.
    callbacks=[FreezeTransformerUnfreezePrompt(), 
             #  EarlyStoppingCallback(early_stopping_patience=2), 
               ReduceCheckpoint(), 
               SavePretrainedPrompt(prompt)],
    
)

# Runs the full training loop; the TrainOutput is shown as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
100,4.9337,4.710433
200,4.168,4.163318
300,3.9259,3.73528
400,3.2944,3.590199
500,3.8784,3.37646
600,2.8844,3.378542
700,3.4887,3.209499
800,3.3558,3.077754
900,3.544,3.002672
1000,3.2673,2.957791


TrainOutput(global_step=1000, training_loss=4.116764989852905, metrics={'train_runtime': 2192.0982, 'train_samples_per_second': 0.456, 'total_flos': 0, 'epoch': 1.71, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -4444069888, 'train_mem_gpu_alloc_delta': 393728, 'train_mem_cpu_peaked_delta': 4580274176, 'train_mem_gpu_peaked_delta': 6573174272})

In [None]:
#'/content/drive/Othercomputers/My Laptop/'
# Serialise the whole model object to my_model.pt in the current working directory ...
torch.save(model,"my_model.pt")
# ... and also write it in save_pretrained format to the Drive folder.
model.save_pretrained('/content/drive/MyDrive/logs_prompts/model/')

In [None]:
# Write a second save_pretrained copy of the model into the current working directory.
model.save_pretrained('.')

In [None]:
# Reload the prompt saved at step 1000 (rebinding the global `prompt`) and patch it
# onto the model and tokenizer again.
prompt = Prompt.from_pretrained(f"/content/drive/MyDrive/logs_prompts/second_attempt/checkpoint-1000/")
prompt.patch(model, tokenizer)

# tokenized.to('cpu')
# Move the model to CPU; the generation cells below feed it CPU tensors.
model.to('cpu')



In [None]:
# Display the training split (index values are the row labels from `triplets`).
train

Unnamed: 0,text,question_text,answer_text
835,"Родился Николай в Торуне в купеческой семье, р...",В какой стране родился Николай Коперник?,Польша
955,Руджеро Леонкавалло (итал. Ruggero Leoncavallo...,Как звали итальянского композитора Леонкавалло?,Руджеро
1027,Родился в селе Курья Алтайского края. Он был с...,Где родился Михаил Калашников?,Курья
1724,Российский государственный университет физичес...,В каком году открыт Институт физической культу...,29 мая 1918 г
944,Первым из европейцев нового времени Петру увид...,В какой стране находится скальный город Петра?,Иордания
...,...,...,...
17,Происхождение Снегга в основном стало известно...,Как звали отца Северуса Снейпа?,Тоббиас Снегг (Снейп)
384,Русский колодец — крытая бревенчатая шахта с в...,Что есть у колодца?,ворот
1149,Забальзамированное тело Сталина было помещено ...,Где захоронен Сталин?,Некрополь у Кремлёвской стены
995,ЦСКА — советский и российский профессиональный...,В какой лиге играет ЦСКА?,"Лига чемпионов УЕФА, Кубок России по футболу, ..."


In [None]:

# Pick a single example (positional row 955) to sanity-check generation.
# NOTE(review): index 955 is listed in the `train` table displayed above, so this
# example appears to come from the training split — confirm that is intended.
passage = triplets.iloc[955]['text']
question = triplets.iloc[955]['question_text']
answer = triplets.iloc[955]['answer_text']


# Render passage and question through the prompt template (no answer supplied).
formatted_text = prompt.format(passage = passage, question=question)

# Same layout as PQADataset's is_test branch: prompt text followed directly by eos.
# NOTE(review): training examples place the answer between the prompt and eos, so
# generation here starts after an eos token — confirm this is the intended setup.
input_text = formatted_text + tokenizer.eos_token

# Tokenize to PyTorch tensors for model.generate.
tokenized = tokenizer(input_text, return_tensors='pt')


In [None]:
# Generate up to 15 new tokens after the input and return a structured output object.
# NOTE(review): do_sample is commented out, so top_k and temperature presumably have
# no effect on the decoding — confirm against the generate() documentation.
# NOTE(review): no eos/pad id is passed; the logged warning shows pad_token_id being set
# to eos_token_id 50256, which may differ from the tokenizer's "<pad>" eos — confirm.
res = model.generate(tokenized['input_ids'], 
                     #do_sample=True, 
                     return_dict_in_generate=True, 
                     #output_scores=True, 
                    # repetition_penalty=5.0,
                     max_length=len(tokenized['input_ids'][0])+15,
                     top_k=10, 
                    #  top_p=0.95, 
                     temperature=0.9)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Decode the generated ids, replace non-breaking spaces and newlines with plain
# spaces, and strip the '<|P|>' prompt placeholders.
decoded = tokenizer.decode(res[0][0]).replace('\xa0', ' ').replace('\n', ' ').replace('<|P|>','')

# The first '<pad>' is the eos appended to the input, so everything before it is the
# prompt text. The answer is what follows, cut at the next '<pad>' if the model emits
# one. Unlike a plain two-way unpack of split('<pad>'), this cannot raise when the
# text contains no '<pad>' or more than one.
pre, _separator, generated = decoded.partition('<pad>')
ans = generated.split('<pad>', 1)[0]
pre

'Руджеро Леонкавалло (итал. Ruggero Leoncavallo; 23 апреля 1857, Неаполь — 9 августа 1919, Монтекатини-Терме) — итальянский композитор.Как звали итальянского композитора Леонкавалло?'

In [None]:
# Generated continuation: the decoded text that follows the '<pad>' separator.
ans

'�Руджеро Леонкаваллосальдостеростеростеростеро'

In [None]:
# Reference answer for the same row, for comparison with the generated text.
answer

'Руджеро'