In [1]:
from PIL import Image
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig, AutoConfig
from split import split_dataset
from dataset import SongsDataset
from torch.utils.data import DataLoader
from training import train_model
from evaluation import evaluate_model
import torch



In [2]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
ENCODER_NAME = "google/vit-base-patch16-224-in21k"
DECODER_NAME = "gpt2"

In [4]:
# Splitting the data
all_songs_path = 'data/songs/all'
# split_dataset(all_songs_path)

In [5]:
# Loading image_processor and tokenizer to be used in datasets
# image_processor = ViTImageProcessor.from_pretrained(ENCODER_NAME)
image_processor = ViTImageProcessor.from_pretrained(ENCODER_NAME)
tokenizer = AutoTokenizer.from_pretrained(DECODER_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
from torchvision import transforms

In [7]:
# Datasets
train_dataset = SongsDataset('data/songs/train', image_processor, tokenizer, by_line=False, n_variations=10)
print('train size', len(train_dataset))
validation_dataset = SongsDataset('data/songs/validation', image_processor, tokenizer, by_line=False, n_variations=1)
print('val size', len(validation_dataset))
test_dataset = SongsDataset('data/songs/test', image_processor, tokenizer, by_line=False, n_variations=1)
print('test size', len(test_dataset))

train size 1990
val size 22
test size 25


In [8]:
# Dataloaders
BATCH_SIZE = 32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)

In [9]:
import importlib
import training
import evaluation
from training import train_model
from evaluation import evaluate_model
importlib.reload(training)
importlib.reload(evaluation)

<module 'evaluation' from 'C:\\Users\\Itay\\Documents\\GitHub\\nlp-final-project\\evaluation.py'>

In [None]:
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(ENCODER_NAME, DECODER_NAME)
# update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size


train_history, val_history = train_model(model, train_dataloader, validation_dataloader, 
                                         num_epochs=5, learning_rate=1e-4, device=DEVICE)


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.5.crossattention.q_attn.bias', 'h.6.crossattention.c_attn.weight', 'h.6.crossattention.q_attn.weight', 'h.0.crossattention.c_attn.weight', 'h.9.crossattention.q_attn.weight', 'h.1.crossattention.c_proj.weight', 'h.5.crossattention.q_attn.weight', 'h.9.crossattention.q_attn.bias', 'h.0.crossattention.c_attn.bias', 'h.4.crossattention.c_attn.weight', 'h.8.crossattention.c_attn.bias', 'h.3.crossattention.c_proj.bias', 'h.0.crossattention.q_attn.weight', 'h.10.crossattention.q_attn.bias', 'h.6.ln_cross_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.weight', 'h.8.ln_cross_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.9.crossattention.c_attn.weight', 'h.4.ln_cross_attn.weight', 'h.10.crossattention.q_attn.weight', 'h.4.crossattention.c_attn.bias', 'h.2.crossattention.q_attn.weight', 'h.4.ln_cross_attn.bias', 'h.8.crossattention.c_proj.weight', 'h.10.

In [11]:
# model.save_pretrained("best_so_far.chk", from_pt=True) 

In [None]:
# beam search parameters
model.config.max_length = 32
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

true_lyrics, predicted_lyrics = evaluate_model(model, test_dataloader, device=DEVICE, 
                                               max_new_tokens=32, 
                                               num_beams=4, 
                                               no_repeat_ngram_size=3)  

### Evaluating results

In [20]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [21]:
predicted_sentiments = [k['label'] for k in sentiment_pipeline(predicted_lyrics)]
true_sentiments = [k['label'] for k in sentiment_pipeline(true_lyrics)]

In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(true_sentiments, predicted_sentiments)

0.71