In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, random_split
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#load dataset
dataset = load_dataset("lamini/taylor_swift",split="train")

In [3]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


BertLMHeadModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [4]:
# get the data
df = pd.DataFrame(dataset)

In [36]:
#tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['question'], examples['answer'], truncation=True, padding="max_length")
tokenized_data = df.apply(lambda x: tokenize_function(x), axis=1)
train_dataset1, val_dataset2 = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = train_dataset1.apply(lambda x: tokenize_function(x), axis=1)
val_dataset = val_dataset2.apply(lambda x: tokenize_function(x), axis=1)
test_dict = val_dataset2.to_dict(orient='records')


In [37]:
from transformers import AdamW
import accelerate

#training parameters
# Hyperparameters and Training Configuration
learning_rate = 1.0e-5
num_epochs = 3
batch_size = 20
# Define the optimizer

outputdir = f"swift_{num_epochs}"
training_args = TrainingArguments(learning_rate= learning_rate, per_device_train_batch_size= 20, num_train_epochs=num_epochs, weight_decay=0.01, warmup_ratio=0.1, optim="adafactor",eval_steps=120, save_steps=120, gradient_accumulation_steps=4, output_dir= outputdir)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [38]:
trainer = Trainer(model= model, args=training_args, train_dataset= train_dataset, eval_dataset=val_dataset)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [39]:
for epoch in range(num_epochs):
    total_loss = 0
    model.train()

    """for batch in train_loader:
      
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        total_loss += loss.item()

    print(f"Epoch: {epoch+1}/{num_epochs}, Average Loss: {total_loss / len(train_dataset)}")"""


In [40]:
# Evaluation
model.eval()
correct = 0
total = 0

"""with torch.no_grad():
   for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        total += labels.size(0)
        correct += (predictions == labels).sum().item()

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")"""

'with torch.no_grad():\n   for batch in val_loader:\n        input_ids = batch["input_ids"].to(device)\n        attention_mask = batch["attention_mask"].to(device)\n        labels = batch["label"].to(device)\n\n        outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n        predictions = torch.argmax(outputs.logits, dim=1)\n\n        total += labels.size(0)\n        correct += (predictions == labels).sum().item()\n\naccuracy = correct / total\nprint(f"Validation Accuracy: {accuracy:.2f}")'

In [41]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
#max_input_tokens=1000, max_output_tokens=100, you can change this depending on if you want the model to generate more or less
  # Tokenize text coming in
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate based on the tokens
  device = model.device
  #generate based on tokens!
  generated_tokens_with_prompt = model.generate(
  #put model on gpu or cpu
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [43]:
test_text = test_dict[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dict[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, model, tokenizer))

Question input (test): What is the controversy surrounding Taylor Swift's Live from Paris concert and how does it affect her millennial fan base who are subscribed to her Reputation album?
Correct answer from Lamini docs: Taylor Swift's Live from Paris Concert was a live streamed event that took place on December 14, 2018. The controversy surrounding the concert was that it was only available to fans who had purchased her Reputation album. This caused a lot of backlash from fans who felt that they were being unfairly excluded from the event. Many fans felt that the concert should have been made available to all of her fans, regardless of whether or not they had purchased the album. This controversy has had a significant impact on Taylor Swift's millennial fan base who are primarily subscribed to her Reputation album. Many fans felt that this was a way for Taylor Swift to force them to purchase her album in order to see the concert. This
Model's answer: 
................................