In [None]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 9.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 52.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transforme

In [None]:
import json
import os
import numpy as np
import pandas as pd
import re
import torch
import datasets

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import T5TokenizerFast, Seq2SeqTrainingArguments
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,T5ForConditionalGeneration
from transformers import pipeline, Seq2SeqTrainer


# Create Dataset

In [None]:
# Detect whether the notebook runs on Google Colab. Only a missing package means
# "not on Colab"; any other error raised during the import is allowed to surface.
IN_COLAB: bool = False
try:
  from google.colab import drive, files
  IN_COLAB = True
except ImportError:
  pass

# Location of the keyword/lyrics JSON file.
path: str
if IN_COLAB:
  # On Colab the data lives on the user's Google Drive, which has to be mounted first.
  drive.mount('/content/drive')
  path = 'drive/MyDrive/RapMachine/KeywordLyrics.json'
else:
  # Outside Colab, read the location from the environment and fall back to a file in
  # the working directory, so `path` is always a usable string rather than None.
  path = os.environ.get('KEYWORD_LYRICS_PATH', 'KeywordLyrics.json')



Mounted at /content/drive


In [None]:
# Make sure the working directory for training data exists (no error if it already does).
directory: str = 'training_data/'
os.makedirs(directory, exist_ok=True)

# Fixed seed so the train/test split is identical on every Restart & Run All.
SPLIT_SEED: int = 42

# Load the keyword/lyrics pairs and hold out 15% of the rows for evaluation.
df = pd.read_json(path)
df_train, df_test = train_test_split(df, test_size=0.15, random_state=SPLIT_SEED)

def create_dataset(df):
  """Convert a two-column DataFrame into a `Dataset` with 'keywords' and 'lyrics' columns.

  The first column of `df` becomes 'keywords' and the second becomes 'lyrics'.
  Columns are taken by position, independent of how they are labelled.

  Args:
    df: DataFrame whose first two columns hold the keywords and the lyrics.

  Returns:
    A `Dataset` built from the two columns.
  """
  # Whole-column extraction replaces the per-row iterrows() loop and the
  # integer-key Series lookups (row[0] / row[1]) it relied on.
  data: dict = {
    'keywords': df.iloc[:, 0].tolist(),
    'lyrics': df.iloc[:, 1].tolist(),
  }
  return Dataset.from_dict(data)

# Turn each DataFrame split into a Dataset, then bundle both under 'train' / 'test'.
data_train, data_test = map(create_dataset, (df_train, df_test))

dataset = DatasetDict({
  'train': data_train,
  'test': data_test,
})

In [None]:
# Display the DatasetDict to check the column names and the row count of each split.
dataset

DatasetDict({
    train: Dataset({
        features: ['keywords', 'lyrics'],
        num_rows: 3711
    })
    test: Dataset({
        features: ['keywords', 'lyrics'],
        num_rows: 655
    })
})

In [None]:
def format_src(elem):
  """Add the model input text for one example.

  Stores 'CONTEXT: ' followed by the example's 'keywords' value under the
  'src_texts' key. The example is modified in place and returned.
  """
  source_text = 'CONTEXT: {}'.format(elem["keywords"])
  elem['src_texts'] = source_text
  return elem

def format_tgt(elem):
  """Add the training target text for one example.

  Every whitespace character in the example's 'lyrics' (newlines, tabs,
  carriage returns, ...) is replaced by a single space, and the result is stored
  under 'tgt_texts' with a 'RAP-LYRICS: ' prefix. Runs of whitespace are not
  collapsed. The example is modified in place and returned.
  """
  # r"\s" already matches '\n', so a separate newline substitution is unnecessary.
  lyrics = re.sub(r"\s", " ", elem["lyrics"])
  elem['tgt_texts'] = f'RAP-LYRICS: {lyrics}'
  return elem

In [None]:
# Add the 'src_texts' column first, then the 'tgt_texts' column, to every split.
dataset = dataset.map(format_src).map(format_tgt)

  0%|          | 0/3711 [00:00<?, ?ex/s]

  0%|          | 0/655 [00:00<?, ?ex/s]

  0%|          | 0/3711 [00:00<?, ?ex/s]

  0%|          | 0/655 [00:00<?, ?ex/s]

## Tokenize

In [None]:
# Load the fast T5 tokenizer that belongs to the 't5-small' checkpoint.
tokenizer = T5TokenizerFast.from_pretrained('t5-small')

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [None]:
def get_max_length(tokenizer, train_dataset, column, percentile):
  """Return a sequence-length cutoff for one text column.

  The column is tokenized without padding, the given percentile of the
  resulting token counts is taken, truncated to an int, and incremented by one.

  Args:
    tokenizer: callable invoked as tokenizer(texts, padding=False, return_length=True).
    train_dataset: dataset providing `.map(...)`; the mapped result is indexed by 'length'.
    column: name of the text column to measure.
    percentile: percentile (0-100) of the token counts to use.
  """
  measured = train_dataset.map(
    lambda texts: tokenizer(texts, padding=False, return_length=True),
    input_columns=column,
    batched=True,
  )
  cutoff = np.percentile(measured['length'], percentile)
  return 1 + int(cutoff)

In [None]:
# Length limits used later for truncation and padding, measured on the training split only:
# the 80th percentile of source token counts and the 90th percentile of target token counts
# (get_max_length adds one to each).
max_length = get_max_length(tokenizer, dataset['train'], 'src_texts', 80)
max_target_length = get_max_length(tokenizer, dataset['train'], 'tgt_texts', 90)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (854 > 512). Running this sequence through the model will result in indexing errors


In [None]:
def tokenize(batch):
  """Tokenize a batch of examples into model inputs and labels.

  Sources ('src_texts') are truncated/padded to `max_length` and targets
  ('tgt_texts') to `max_target_length`. The target token ids become the
  'labels' entry of the returned encoding, with padding positions replaced by
  -100 so the loss ignores them.

  Args:
    batch: mapping with 'src_texts' and 'tgt_texts' lists of strings.

  Returns:
    The source encoding (input_ids, attention_mask) extended with 'labels'.
  """
  inputs = tokenizer(
    batch['src_texts'],
    truncation=True,
    max_length=max_length,
    padding='max_length'
  )
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(
      batch['tgt_texts'],
      truncation=True,
      max_length=max_target_length,
      padding='max_length'
    )

  # Targets are padded to a fixed length; without masking, the model would be
  # trained (and evaluated) on predicting pad tokens. -100 is the label value
  # the loss skips.
  pad_id = tokenizer.pad_token_id
  inputs["labels"] = [
    [token_id if token_id != pad_id else -100 for token_id in label_ids]
    for label_ids in targets["input_ids"]
  ]
  return inputs

In [None]:
# Tokenize every split in batches; this adds the columns returned by tokenize().
dataset = dataset.map(tokenize, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Return only the three listed columns, formatted for torch, when examples are read;
# the raw text columns remain stored in the dataset.
dataset.set_format('torch', columns=['input_ids', 'labels', 'attention_mask'])

In [None]:
# Display the DatasetDict again to confirm the columns added by formatting and tokenization.
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'keywords', 'labels', 'lyrics', 'src_texts', 'tgt_texts'],
        num_rows: 3711
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'keywords', 'labels', 'lyrics', 'src_texts', 'tgt_texts'],
        num_rows: 655
    })
})

# Training

In [None]:
# Training hyperparameters, collected in one place so they are easy to tune.

# Both branches of the former GPU/CPU conditional were 1, so this is a plain constant.
NUM_EPOCHS = 1

TRAIN_BATCH_SIZE = 4   # per-device batch size during training
EVAL_BATCH_SIZE = 4    # per-device batch size during evaluation
WARMUP_STEPS = 200     # learning-rate warmup steps
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 100    # log the training loss every N optimizer steps
LEARNING_RATE = 5e-05

In [None]:
# Load the pretrained 't5-small' checkpoint as the model to fine-tune.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='./results',
    logging_dir='./logs/',
    # Optimisation schedule.
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    weight_decay=WEIGHT_DECAY,
    # Batching: gradients are accumulated over 4 batches before each optimizer step.
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=4,
    # Step-based evaluation and logging.
    evaluation_strategy="steps",
    logging_steps=LOGGING_STEPS,
    predict_with_generate=True,
)

# Trainer wiring: the model, its tokenizer, and the train / held-out splits.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=dataset['test'],
    train_dataset=dataset['train'],
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: src_texts, lyrics, tgt_texts, keywords.
***** Running training *****
  Num examples = 3711
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 232


Step,Training Loss,Validation Loss
100,8.1839,3.290048
200,3.3142,2.802628


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: src_texts, lyrics, tgt_texts, keywords.
***** Running Evaluation *****
  Num examples = 655
  Batch size = 4
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: src_texts, lyrics, tgt_texts, keywords.
***** Running Evaluation *****
  Num examples = 655
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=232, training_loss=5.371755698631549, metrics={'train_runtime': 365.4546, 'train_samples_per_second': 10.154, 'train_steps_per_second': 0.635, 'total_flos': 63762641879040.0, 'train_loss': 5.371755698631549, 'epoch': 1.0})

In [None]:
# Write the fine-tuned model (and the tokenizer files) to the trainer's output directory.
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin
tokenizer config file saved in ./results/tokenizer_config.json
Special tokens file saved in ./results/special_tokens_map.json


In [None]:
# Build a generation pipeline from the fine-tuned checkpoint saved in ./results.
# - T5 is an encoder-decoder model, so the task is 'text2text-generation';
#   'text-generation' is for decoder-only (causal) language models.
# - `config` has to be a model configuration, not a dict of generation options;
#   generation options are passed as keyword arguments instead.
# - max_length is set explicitly so it is not smaller than min_length.
# - top_k only takes effect when sampling is enabled; it is kept from the
#   original settings (spelled `top_k`, not 'top-k').
rapper = pipeline(
    'text2text-generation',
    model='./results',
    tokenizer="t5-small",
    num_beams=5,
    min_length=400,
    max_length=512,
    top_k=50)

AttributeError: ignored

In [None]:
keywords: str = 'chicago, compton, straight'
# Use exactly the source format the model was trained on ('CONTEXT: <keywords>').
# The 'RAP-LYRICS:' marker belongs to the target side and is produced by the model.
# Named `prompt` so the builtin `input` is not shadowed.
prompt: str = f"CONTEXT: {keywords}"

# Put the input on whatever device the model is on, instead of assuming a GPU.
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
# Beam search with an explicit max_length, so that min_length does not exceed
# the model's default maximum generation length.
greedy_output = model.generate(
    input_ids,
    min_length=500,
    max_length=512,
    num_beams=5,
    top_k=50,
    early_stopping=True)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------



In [None]:
# Encode a free-form prompt and place it on the model's device (works on CPU and GPU).
input_ids = tokenizer.encode('I enjoy walking with my cute dog. It was a nice dazy', return_tensors='pt').to(model.device)

# Generate between 100 and 200 output tokens. max_length is set explicitly so that
# min_length does not exceed the model's default maximum generation length.
greedy_output = model.generate(input_ids, min_length=100, max_length=200, top_k=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
......... 
