In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os

In [None]:
# Mount Google Drive into the Colab filesystem; the paths built from
# `base_path` below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive used for the input CSV, the saved model
# and the exported results.
# Alternative location, labelled "nievni" (presumably another user's Drive layout — confirm):
# base_path = "/content/drive/MyDrive/Colab Notebooks/Diploma/" #nievni
base_path = "/content/drive/MyDrive/Diploma/" # "waffle" (translated label; presumably identifies whose Drive layout this is — confirm)

# GPT-2 with fine-tuning

### Prepare data

In [None]:
# Load the sentence pairs; columns "0" and "1" are combined into a single
# training text in the next cell.
df_CL = pd.read_csv( base_path+"df_tranclated.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df_CL

Unnamed: 0,0,1
0,Yesterday I hung a new painting on the wall. I...,Yesterday I fixed the paintings to the wall us...
1,This morning a neighbor came to me. She asked ...,A neighbor came to me this morning asking for ...
2,Mom was cooking dinner when water spilled on t...,"While cooking dinner, my mother spilled water ..."
3,We went for a walk in the park yesterday. They...,Yesterday we walked in the park and fed ducks ...
4,A little girl found a lost wallet. She returne...,A little girl found a lost wallet. She returne...
...,...,...
1271,Igor found a forgotten wallet in the park. Ins...,"Having discovered the lost wallet, Igor decide..."
1272,"Lena decided to bake cookies, but she forgot a...","Forgetting the timer, Lena admits that the coo..."
1273,"Anton ordered pizza in the evening, and when t...",Anton has ordered a lot of pizzas and is havin...
1274,"Nastya is packing her suitcase for the trip, b...",Nastya has a problem with her suitcase and cre...


In [None]:
# One training text per row: column "0", the " <CL> " separator, then column "1".
paired_text = df_CL["0"] + " <CL> " + df_CL["1"]
df = paired_text.to_frame(name="Lyric")

In [None]:
# Hold out 2% of the rows for evaluation. The fixed seed makes the split
# reproducible, so scores from different runs are comparable.
test_set = df.sample(frac=0.02, random_state=42)
df = df.loc[~df.index.isin(test_set.index)]

#Reset the indexes (the previous row labels are kept in an "index" column)
test_set = test_set.reset_index()
df = df.reset_index()
df.head()

Unnamed: 0,index,Lyric
0,0,Yesterday I hung a new painting on the wall. I...
1,1,This morning a neighbor came to me. She asked ...
2,2,Mom was cooking dinner when water spilled on t...
3,3,We went for a walk in the park yesterday. They...
4,4,A little girl found a lost wallet. She returne...


In [None]:
# Split each held-out text at the first <CL> marker into the prompt (left) and
# the reference continuation (right). Surrounding whitespace is stripped so
# that re-appending " <CL>" later reproduces the single-space training format
# instead of a doubled space.
prompt_and_target = test_set['Lyric'].str.split("<CL>", n=1)
test_set['True_end_lyrics'] = prompt_and_target.str[1].str.strip()
test_set['Lyric'] = prompt_and_target.str[0].str.strip()
test_set.head()

Unnamed: 0,index,Lyric,True_end_lyrics
0,1063,I bought a new phone but it won't charge.,My new phone won't charge despite being conne...
1,210,Jogging at dawn is like energizing. This is ho...,"Dawn, jogging, cheerfulness."
2,1207,"Lena, with constant attempts, was finally able...",Lena learned how to make the perfect white ch...
3,331,"The street had 100 houses, and the newspaper b...",The newspaper booth is in the middle of the s...
4,307,A group of 16 people are going to visit the mu...,Seventeen people museum. Each 20 minutes tour.


### Prepare the dataset

In [None]:
class SongLyrics(Dataset):
    """GPT-2 token tensors built from a collection of texts.

    Each text is cut to its first ``max_length`` characters, optionally
    prefixed with ``<|control_code|>``, terminated with ``<|endoftext|>``,
    tokenised and stored as a 1-D ``torch.Tensor`` of token ids.

    Args:
        control_code: Short string interpolated as ``<|control_code|>`` in front
            of every text. ``None`` adds no prefix. For backward compatibility
            with calls such as ``SongLyrics(df['Lyric'], ...)``, a non-string
            iterable passed here (with ``texts`` omitted) is treated as the
            corpus itself and no prefix is added; previously its ``repr`` was
            interpolated into every training example.
        truncate: When true, keep only the first 20000 texts.
        gpt2_type: Name passed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of *characters* kept from each text.
        texts: Iterable of strings to tokenise. When omitted (and the corpus
            was not passed as ``control_code``), the notebook-level
            ``df['Lyric']`` column is used, as before.
    """

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, texts=None):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

        # Legacy call style: the corpus was passed in the control-code slot.
        corpus_in_code_slot = (
            texts is None
            and control_code is not None
            and not isinstance(control_code, str)
            and hasattr(control_code, "__iter__")
        )
        if corpus_in_code_slot:
            texts, control_code = control_code, None
        if texts is None:
            texts = df['Lyric']  # fall back to the notebook-level frame

        prefix = "" if control_code is None else f"<|{control_code}|>"

        rows = list(texts)
        if truncate:
            # Cap before tokenising so dropped rows are never encoded.
            rows = rows[:20000]

        self.lyrics = [
            torch.tensor(
                self.tokenizer.encode(f"{prefix}{row[:max_length]}<|endoftext|>")
            )
            for row in rows
        ]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of stored examples."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor of example ``item``."""
        return self.lyrics[item]

In [None]:
# NOTE(review): the first positional argument binds to the `control_code`
# parameter of SongLyrics, not to a data parameter — confirm the class
# handles a Series in that slot as intended.
dataset = SongLyrics(df['Lyric'], truncate=True, gpt2_type="gpt2")

### Prepare training

In [None]:
# Load the pretrained 'gpt2' tokenizer and language-model head; `model` is the
# object that gets fine-tuned by train() below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

KeyboardInterrupt: ignored

In [None]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the running ``packed_tensor``.

    Both tensors are indexed along dimension 1 for their length.

    Returns a ``(tensor, carry_on, remainder)`` triple:
      * nothing packed yet -> ``(new_tensor, True, None)``
      * combined length fits within ``max_seq_len`` -> the concatenation of
        ``new_tensor`` and ``packed_tensor`` minus its first column,
        ``True``, ``None``
      * otherwise -> ``(packed_tensor, False, new_tensor)``, i.e. the pack is
        full and ``new_tensor`` is handed back untouched.
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size(1) + packed_tensor.size(1)
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=True,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time (shuffled) and concatenated by
    ``pack_tensor`` until adding one more would exceed ``max_seq_len``; each
    packed tensor is one forward/backward pass with the input ids as labels.
    Gradients are accumulated (summed, not averaged) and the optimizer and
    scheduler advance once every ``batch_size`` packed tensors.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Model called as ``model(ids, labels=ids)`` whose first output is the loss.
        tokenizer: Accepted for interface compatibility; not used here.
        batch_size: Number of packed tensors accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Learning rate for AdamW.
        max_seq_len: Packing limit passed to ``pack_tensor``.
        warmup_steps: Warm-up steps of the linear schedule.
        gpt2_type: Accepted for interface compatibility; not used here.
        output_dir: Directory receiving the per-epoch checkpoints.
        output_prefix: File-name prefix of the checkpoints.
        test_mode: Accepted for interface compatibility; not used here.
        save_model_on_epoch: When true, save ``model.state_dict()`` to
            ``{output_dir}/{output_prefix}-{epoch}.pt`` after every epoch.

    Returns:
        The trained model (on the training device).
    """
    # Fall back to CPU so the function does not crash on a runtime without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    samples_per_epoch = len(train_dataloader)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The linear schedule needs a positive horizon: with -1 the learning rate
    # drops to zero as soon as warm-up ends. Packing makes the exact number of
    # optimizer steps data-dependent, so use the unpacked count as an upper bound.
    total_steps = max(1, (samples_per_epoch * epochs) // batch_size)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    packed_batches_seen = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader), total=samples_per_epoch):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, max_seq_len)
            is_last = idx == samples_per_epoch - 1

            # Keep packing while there is room, except on the final sample,
            # where whatever has been packed must be flushed.
            if carry_on and not is_last:
                continue

            ready = [input_tensor]
            if is_last and remainder is not None:
                # The final sample did not fit into the pack: train on it as well.
                ready.append(remainder)
                remainder = None

            for packed in ready:
                packed = packed.to(device)
                outputs = model(packed, labels=packed)
                loss = outputs[0]
                loss.backward()

                # Count first, then test, so a step happens after every
                # `batch_size` backward passes rather than after the first one.
                packed_batches_seen += 1
                if packed_batches_seen % batch_size == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            # Start the next pack from the sample that did not fit (if any)
            # instead of silently dropping it.
            input_tensor = remainder
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

### Actual Training

In [None]:
#Train the model on the specific data we have
# All hyper-parameters take the defaults declared in train(); `model` is
# rebound to the value train() returns.
model = train(dataset, model, tokenizer)

In [None]:
# Rebinds `model` to whatever object is stored in this checkpoint file.
# train() writes `model.state_dict()` under "<prefix>-<epoch>.pt", so this is
# presumably the epoch-16 state dict rather than a model instance — confirm.
model = torch.load('/content/wreckgar-16.pt')

In [None]:
#Save the fine-tuned weights so they can be reused later on
# The reload cell passes this file to `load_state_dict`, so always persist a
# state dict: `model` may be a module (straight after training) or an
# already-loaded state dict (after reading a per-epoch checkpoint).
weights = model.state_dict() if isinstance(model, torch.nn.Module) else model
torch.save(weights, base_path+'model.pt')

### Text generation

In [None]:
#Load the model to use it
#model = torch.load( base_path+'model.pt')

In [None]:
# Rebuild the architecture from the pretrained 'gpt2' weights, then overwrite
# them with the fine-tuned state dict saved above.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# map_location="cpu" lets a checkpoint written on a GPU runtime be restored on
# a CPU-only one; generation below runs on the CPU anyway.
model.load_state_dict(torch.load(base_path+"model.pt", map_location="cpu"))
model.eval()

RuntimeError: ignored

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=None, #maximum number of tokens to sample per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with top-p filtering.

    At every step the next-token logits are divided by ``temperature``, sorted,
    and every token beyond the smallest set whose cumulative probability
    exceeds ``top_p`` is masked out before one token is sampled.

    Args:
        model: Model called as ``model(ids)`` exposing ``.logits``.
        tokenizer: Object providing ``encode`` and ``decode``.
        prompt: Text every continuation starts from.
        entry_count: Number of independent continuations to produce.
        entry_length: Maximum number of sampled tokens per continuation.
            Defaults to ``min(500, 1.3 * number of whitespace-separated words
            in the prompt)``, rounded down.
        top_p: Cumulative-probability threshold of the filter.
        temperature: Logit divisor; values ``<= 0`` are treated as ``1.0``.

    Returns:
        A list of ``entry_count`` strings, each the decoded prompt plus the
        continuation. Entries that did not sample ``<|endoftext|>`` within
        ``entry_length`` steps get the marker appended, so every string ends with it.
    """
    if entry_length is None:
        entry_length = min(500, int(len(prompt.split()) * 1.3))
    model.eval()

    # Run on whatever device the model lives on instead of assuming the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Loop-invariant work: encode the prompt and the stop marker once.
    prompt_ids = torch.tensor(
        tokenizer.encode(prompt), dtype=torch.long, device=device
    ).unsqueeze(0)
    stop_ids = set(tokenizer.encode("<|endoftext|>"))
    scale = temperature if temperature > 0 else 1.0
    filter_value = -float("Inf")

    generated_list = []

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = prompt_ids.clone()

            for i in range(entry_length):
                # Only the logits are needed; passing labels would also
                # compute a loss that is thrown away.
                logits = model(generated).logits[:, -1, :] / scale

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Shift the mask right by one so the first token that crosses
                # the threshold is kept; the best token is never removed.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in stop_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated.squeeze(0).tolist())
            if not entry_finished:
                # Budget exhausted without sampling the stop marker: append it
                # so every returned entry ends the same way.
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

In [None]:
# Turn each held-out text into a prompt ending in " <CL>", the separator used
# in the training texts. Trailing whitespace is removed first so the separator
# is preceded by exactly one space, and prompts that already end with the
# marker are left alone, which makes re-running this cell harmless.
prompts = test_set['Lyric'].str.rstrip()
already_marked = prompts.str.endswith("<CL>")
test_set['Lyric'] = prompts.where(already_marked, prompts + " <CL>")

In [None]:
#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data,temperature=0.5):
    """Generate one continuation for every prompt in ``test_data['Lyric']``.

    Uses the notebook-level ``model`` and ``tokenizer``. Prompts are read by
    iterating the column, so the frame does not need a 0..n-1 index.

    Returns a list with one single-element list (the output of ``generate``
    with ``entry_count=1``) per row.
    """
    cpu_model = model.to('cpu')  # move once, not once per row
    return [
        generate(cpu_model, tokenizer, prompt, entry_count=1, temperature=temperature)
        for prompt in test_data['Lyric']
    ]

In [None]:
# Single-example check on a training row: keep the text before the <CL>
# marker, re-append the marker, and sample one continuation on the CPU.
generate(model.to('cpu'), tokenizer, df['Lyric'][15].split("<CL>")[0]+"<CL>", entry_count=1,temperature=0.5)

NameError: ignored

In [None]:
# Generate one continuation per held-out prompt (one progress bar per row).
generated_lyrics = text_generation(test_set,temperature=0.6)

100%|██████████| 1/1 [00:19<00:00, 19.33s/it]
100%|██████████| 1/1 [00:04<00:00,  4.86s/it]
100%|██████████| 1/1 [00:04<00:00,  4.82s/it]
100%|██████████| 1/1 [00:07<00:00,  7.60s/it]
100%|██████████| 1/1 [00:09<00:00,  9.55s/it]
100%|██████████| 1/1 [00:07<00:00,  7.07s/it]
100%|██████████| 1/1 [00:07<00:00,  7.65s/it]
100%|██████████| 1/1 [00:06<00:00,  6.18s/it]
100%|██████████| 1/1 [00:16<00:00, 16.46s/it]
100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
100%|██████████| 1/1 [00:02<00:00,  2.63s/it]
100%|██████████| 1/1 [00:05<00:00,  5.92s/it]
100%|██████████| 1/1 [00:07<00:00,  7.27s/it]
100%|██████████| 1/1 [00:04<00:00,  4.07s/it]
100%|██████████| 1/1 [00:09<00:00,  9.86s/it]
100%|██████████| 1/1 [00:07<00:00,  7.98s/it]
100%|██████████| 1/1 [00:01<00:00,  1.79s/it]
100%|██████████| 1/1 [00:12<00:00, 12.16s/it]
100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
100%|██████████| 1/1 [00:05<00:00,  5.03s/it]
100%|██████████| 1/1 [00:06<00:00,  6.77s/it]
100%|██████████| 1/1 [00:05<00:00,

In [None]:
# Display the raw results: one single-element list per held-out prompt.
generated_lyrics

[['Elena noticed that the light bulb in the entrance had burned out. She put in a new one and suggested that the neighbors buy light bulbs together.  <CL> "I\'m not sure what\'s going on. I\'m not sure what\'s going on. I\'m not sure what\'s going on. I\'m not sure what\'s going on<|endoftext|>'],
 ['Roma began to study on the simulator after work. He energized and improved his health.  <CL>\n\nThe machine is now able to record the data and the results of the tests.\n\n<|endoftext|>'],
 ['Vova found 10 forgotten letters from his grandfather, 7 of which were written 50 years ago.  <CL>\n\n"I had a letter from my grandfather, who was a great man and a very generous man,<|endoftext|>'],
 ['When the refrigerator froze, Vasya called the master. This one arrived and brought terrific joy to the house.  <CL> The master told Vasya that he had been asked to make a copy of the book and that he would give it to<|endoftext|>'],
 ['I bought a new hammock for relaxing in the country, but did not take

In [None]:
# Inspect the prompt stored under index label 24 of the held-out set.
test_set["Lyric"][24]

In [None]:
# Take an independent copy: a `[:]` slice makes the column assignments below
# emit SettingWithCopyWarning.
small_test_set = test_set.copy()

In [None]:
#Loop to keep only generated text and add it as a new column in the dataframe
# For each result take its only entry and keep the segment that follows the
# first <CL> marker (up to the next marker, if the model produced another one).
my_generations = [entry[0].split("<CL>")[1] for entry in generated_lyrics]

small_test_set['Generated_lyrics'] = my_generations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_test_set['Generated_lyrics'] = my_generations


In [None]:
#Remove the end-of-text marker from every generation
# (text after the last full stop is kept: no sentence trimming is done here)
final = [
    text.replace("<|endoftext|>", "")
    for text in small_test_set['Generated_lyrics']
]

small_test_set['Generated_lyrics'] = final
small_test_set.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_test_set['Generated_lyrics'] = final


Unnamed: 0,index,Lyric,True_end_lyrics,Generated_lyrics
0,127,Elena noticed that the light bulb in the entra...,"Elena, light bulb, neighbors.","""I'm not sure what's going on. I'm not sure w..."
1,89,Roma began to study on the simulator after wor...,"Roma goes to the gym, full of energy and in g...",\n\nThe machine is now able to record the data...
2,454,Vova found 10 forgotten letters from his grand...,Vova discovered 10 letters from his grandfath...,"\n\n""I had a letter from my grandfather, who w..."
3,731,"When the refrigerator froze, Vasya called the ...","Vasya's refrigerator froze, so he called a re...",The master told Vasya that he had been asked ...
4,780,I bought a new hammock for relaxing in the cou...,I bought a new hammock for relaxing in the co...,"The hammock is now fully functional, but I am..."


In [None]:
# Persist prompts, references and generations next to the input data; the
# frame's index is written under the column label "index".
small_test_set.to_csv(base_path+"tests_geners.csv",index_label="index")

In [None]:
# Spot check: the generated continuation for row 6 ...
small_test_set['Generated_lyrics'][6]

' I got a call from my boss and he told me to go to the hospital. I was so excited to see him and he was so kind'

In [None]:
# ... and the reference continuation for the same row.
small_test_set['True_end_lyrics'][6]

' Stumbled, smashed the phone, had a bad day.'

### Analyze performance

In [None]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# sentence_bleu expects a list of tokenised references and a tokenised
# hypothesis. Passing raw strings makes it score characters, with every
# character of the reference treated as a separate reference.
# Smoothing keeps short sentences without higher-order n-gram matches from
# collapsing to zero.
smoothing = SmoothingFunction().method1

scores = [
    sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothing)
    for reference, candidate in zip(
        small_test_set['True_end_lyrics'], small_test_set['Generated_lyrics']
    )
]

statistics.mean(scores)

KeyboardInterrupt: ignored

In [None]:
# Install the `rouge` package imported by the scoring cell below.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 441, in run
    conflicts = self._determine_conflicts(to

In [None]:
#Rouge score
from rouge import Rouge
rouge=Rouge()

# Three averaged comparisons, printed in this order:
#   1. generated continuation vs. reference continuation
#   2. reference continuation vs. prompt
#   3. generated continuation vs. prompt
# NOTE(review): get_scores presumably takes hypotheses first and references
# second — confirm against the `rouge` package documentation.
print(rouge.get_scores(small_test_set['Generated_lyrics'], small_test_set['True_end_lyrics'], avg=True))
print(rouge.get_scores(small_test_set['True_end_lyrics'],small_test_set['Lyric'], avg=True))
print(rouge.get_scores(small_test_set['Generated_lyrics'], small_test_set['Lyric'], avg=True))

{'rouge-1': {'r': 0.11929391374995216, 'p': 0.1280296092796093, 'f': 0.12020169140808458}, 'rouge-2': {'r': 0.002747252747252747, 'p': 0.0038461538461538464, 'f': 0.003205128018162404}, 'rouge-l': {'r': 0.10368036063639903, 'p': 0.11554487179487179, 'f': 0.10684897427552661}}
{'rouge-1': {'r': 0.4436241951858377, 'p': 0.5136731435725038, 'f': 0.4571651563769159}, 'rouge-2': {'r': 0.2536129080559302, 'p': 0.2638343769811407, 'f': 0.25384367141580794}, 'rouge-l': {'r': 0.4212357424904797, 'p': 0.4805784070041365, 'f': 0.4329629968806386}}
{'rouge-1': {'r': 0.14045480373636923, 'p': 0.17844169719169722, 'f': 0.1547750653026056}, 'rouge-2': {'r': 0.01905982905982906, 'p': 0.038003663003663, 'f': 0.025272868664998418}, 'rouge-l': {'r': 0.12246567642061652, 'p': 0.15818070818070817, 'f': 0.135897470010926}}


In [None]:
# Sanity check: scoring a column against itself.
rouge.get_scores(small_test_set['Lyric'],small_test_set['Lyric'], avg=True)

{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.9999999949999997},
 'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.9999999949999997},
 'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.9999999949999997}}

# GPT-2 without any fine-tuning

In [None]:
import transformers
import torch

In [None]:
# Rebind `tokenizer` and `model` to the stock pretrained 'gpt2' weights: from
# this cell on, `model` no longer refers to the fine-tuned model.
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
## Making a function that will generate text for us ##
def gen_text(prompt_text, tokenizer, model, n_seqs=1, max_length=374):
    """Sample ``n_seqs`` continuations of ``prompt_text`` with ``model.generate``.

    Args:
        prompt_text: Text to continue.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and ``eos_token_id``.
        model: Model providing ``generate``.
        n_seqs: Number of sequences to generate.
        max_length: Maximum number of tokens generated on top of the prompt.

    Returns:
        A list of ``n_seqs`` strings, each ``prompt_text`` followed by the
        decoded continuation.
    """
  # The return tensors are of type "pt" since we are using PyTorch, not tensorflow
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    # encoded_prompt has a leading batch dimension, so its token count is the
    # last dimension; len() would return the batch size (1).
    prompt_token_count = encoded_prompt.shape[-1]
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        # The model has to generate something, so the prompt length is added
        # to max_length.
        max_length=max_length + prompt_token_count,
        temperature=1.0,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.2, # To ensure that we dont get repeated phrases
        do_sample=True,
        num_return_sequences=n_seqs,
        # Set explicitly so generate() does not have to guess a padding token.
        pad_token_id=tokenizer.eos_token_id,
    )
    ## Getting the output ##
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_() # the _ indicates that the operation will be done in-place

    # The decoded prompt length is the same for every sequence: compute it once.
    prompt_char_count = len(
        tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)
    )
    generated_sequences = []
    for generated_sequence in output_sequences:
        text = tokenizer.decode(generated_sequence.tolist())
        # Keep the caller's exact prompt and append only the newly decoded part.
        generated_sequences.append(prompt_text + text[prompt_char_count:])
    return generated_sequences

In [None]:
#Generate sequences
# Single-example check with the first training text (prompt, <CL> marker and
# target together) as the prompt.
gen_text(df['Lyric'][0],tokenizer,model)

In [None]:
#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
    """Run ``gen_text`` on every prompt in ``test_data['Lyric']``.

    Uses the notebook-level ``tokenizer`` and ``model``. Prompts are read by
    iterating the column, so the frame does not need a 0..n-1 index.
    This definition replaces the earlier ``text_generation`` of the
    fine-tuning section from this cell on.

    Returns a list with one list of generated strings per row.
    """
    return [gen_text(prompt, tokenizer, model) for prompt in test_data['Lyric']]

generated_lyrics = text_generation(test_set)

In [None]:
#Loop to keep only generated text and add it as a new column in the dataframe
my_generations=[]

# gen_text returns each sequence as the prompt followed by the continuation,
# so the generated part is what follows the prompt prefix. Matching a
# whitespace-normalised tail of the prompt fails whenever the prompt contains
# repeated spaces, which leaves the whole prompt inside the "generation".
for prompt, sequences in zip(test_set['Lyric'], generated_lyrics):
  continuations = [
      sequence[len(prompt):] if sequence.startswith(prompt) else sequence
      for sequence in sequences
  ]
  my_generations.append(' '.join(continuations))

test_set['Generated_lyrics'] = my_generations

In [None]:
#Finish the sentences when there is a point, remove after that
final=[]

# Keep everything up to and including the last full stop. Slicing by position
# removes only the trailing fragment; str.replace would delete every earlier
# occurrence of that fragment too. A text without any full stop becomes empty.
for text in test_set['Generated_lyrics']:
  final.append(text[:text.rfind('.') + 1])

test_set['Generated_lyrics'] = final
test_set.head()

In [None]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# sentence_bleu expects a list of tokenised references and a tokenised
# hypothesis. Passing raw strings makes it score characters, with every
# character of the reference treated as a separate reference.
# Smoothing keeps short sentences without higher-order n-gram matches from
# collapsing to zero.
smoothing = SmoothingFunction().method1

scores = [
    sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothing)
    for reference, candidate in zip(
        test_set['True_end_lyrics'], test_set['Generated_lyrics']
    )
]

statistics.mean(scores)

In [None]:
# Install the `rouge` package imported by the scoring cell below (the same
# install also appears in the fine-tuning section).
!pip install rouge

In [None]:
#Rouge score
from rouge import Rouge
rouge=Rouge()

# Averaged scores of the baseline generations against the reference
# continuations; ignore_empty=True is passed because the trimming step can
# leave empty generations (presumably such pairs are skipped — confirm
# against the `rouge` package documentation).
rouge.get_scores(test_set['Generated_lyrics'], test_set['True_end_lyrics'], avg=True, ignore_empty=True)