In [None]:
import os
import pandas as pd
import numpy as np

from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity

from pathlib import Path

from torch.utils.data import Dataset, DataLoader
import math
from tqdm import tqdm

In [116]:
model_path = Path.cwd() / "models" / "gpt2_retrained_good_split.pt"
print(model_path)

/home/isachansson/DML-project/notebooks/models/gpt2_retrained_good_split.pt


In [117]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model_name = "distilgpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Get genereated songs

In [165]:
prompts = [
    "Genre: Heavy Metal,\n\n",
    "Genre: Indie,\n\n",
    "Genre: Pop,\n\n"
]
genres = ['Heavy Metal', 'Indie', 'Pop']
genreated_songs = {
    'Heavy Metal' : '',
    'Pop' : '',
    'Indie' : ''
}
for prompt, genre in zip(prompts, genres):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            do_sample=True,
            max_new_tokens=250,
            min_new_tokens=200,
            top_p=0.92,
            top_k=50,
            temperature=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    genreated_songs[genre] = text

genreated_songs

{'Heavy Metal': 'Genre: Heavy Metal,\n\n\n"I am one with the dream of you."\nHe\'s a demon and no fool.\n\nI\'m searching for someone to blame.\nThat\'s all I really want.\n\nI feel something cold inside.\nI never know what it is that we do not understand.\nWe are alone in this cage.\nIn this cage, we will stand tall against each others...\n\nAnd now I have my dreams fulfilled -\nThrough dreams we shall survive!\nTo see their face as they come alive again.\nMy life has been chosen by god.\n\nI don\'t think about love nor any other good things\nIt\'s just like when I was born.\n\nI can\'t even remember how much time ago.\nNo more tears or hate so easily.\nYou\'re always there on my mind at the end of the day.\n\nI\'m searching for someone to blame.\nThat\'s all I really want.\n\nI feel something cold inside.\nI never know what it is that we do not understand.\nWe are alone in this cage.\nIn this cage, we will stand tall against each others...',
 'Pop': 'Genre: Pop,\n\n\nOh yeah!\n(yeah!

### Geta  random song from the data one for each genre

In [None]:
data_path = Path(os.getcwd()).parent / 'data'
df = pd.read_csv(f'{data_path}/lyrics_filtered_768tokens.csv')

print(df['genre'].value_counts())

genre
Heavy Metal    13378
Pop            13128
Indie          12909
Name: count, dtype: int64


In [None]:

sampled_real_songs = {}

for genre in ['Pop', 'Indie', 'Heavy Metal']:
    subset = df[df['genre'] == genre]
    samples = subset.sample(30, random_state=6)
    sampled_real_songs[genre] = samples['lyrics'].tolist()
    
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

genres = ['Pop', 'Indie', 'Heavy Metal']
semantic_sim_matrix = np.zeros((len(genres), len(genres)))

for i, gen_g in enumerate(genres):

    v_gen = torch.tensor(semantic_model.encode(genreated_songs[gen_g], convert_to_numpy=True))
    v_gen = v_gen.unsqueeze(0)  
    
    for j, real_g in enumerate(genres):
        sims = []
        for real_song in sampled_real_songs[real_g]:
            v_real = torch.tensor(semantic_model.encode(real_song, convert_to_numpy=True))
            v_real = v_real.unsqueeze(0)
            sim = cosine_similarity(v_gen, v_real, dim=1).item()
            sims.append(sim)
        semantic_sim_matrix[i, j] = np.mean(sims)


df_sem = pd.DataFrame(semantic_sim_matrix, index=[f"Gen {g}" for g in genres], columns=genres)
print(df_sem.round(3))


                   Pop  Indie  Heavy Metal
Gen Pop          0.445  0.343        0.259
Gen Indie        0.352  0.310        0.285
Gen Heavy Metal  0.395  0.381        0.401


In [166]:
for value in genreated_songs.values():
    print('*' * 50)
    print(value)

**************************************************
Genre: Heavy Metal,


"I am one with the dream of you."
He's a demon and no fool.

I'm searching for someone to blame.
That's all I really want.

I feel something cold inside.
I never know what it is that we do not understand.
We are alone in this cage.
In this cage, we will stand tall against each others...

And now I have my dreams fulfilled -
Through dreams we shall survive!
To see their face as they come alive again.
My life has been chosen by god.

I don't think about love nor any other good things
It's just like when I was born.

I can't even remember how much time ago.
No more tears or hate so easily.
You're always there on my mind at the end of the day.

I'm searching for someone to blame.
That's all I really want.

I feel something cold inside.
I never know what it is that we do not understand.
We are alone in this cage.
In this cage, we will stand tall against each others...
**************************************************


## Do the same for the genre model

In [180]:
base_model = GPT2LMHeadModel.from_pretrained(model_name)
base_model.to(device)
base_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [181]:
prompts = [
    "genre: Heavy Metal,\n\n",
    "genre: Indie,\n\n",
    "genre: Pop,\n\n"
]
genres = ['Heavy Metal', 'Indie', 'Pop']
baseline_genreated_songs = {
    'Heavy Metal' : '',
    'Pop' : '',
    'Indie' : ''
}
for prompt, genre in zip(prompts, genres):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        outputs = base_model.generate(
            input_ids,
            do_sample=True,
            max_new_tokens=250,
            min_new_tokens=200,
            top_p=0.92,
            top_k=50,
            temperature=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    baseline_genreated_songs[genre] = text

baseline_genreated_songs

{'Heavy Metal': 'genre: Heavy Metal,\n\n\nThe next game in the series will be an RPG with a lot of possibilities for how to create new characters and story that is quite unique. As players become more familiar with what they are doing from previous games (like Persona 4) some things may change - such as bringing up character names or having various types of items on screen at once if you have already completed one quest while being level 50 (it\'s not clear yet but this idea probably won\'t make it through until I see my own prototype).I hope anyone interested can keep checking out other projects around the world including "Majima Hachi" because there are so many different ideas floating around these topics throughout The Legend of Zelda World.Thanks!You\'re logged off Login | Sign Up Log In-By :\nA group of developers gathered their creativity into something special called Final Fantasy XIV, which has been releasing since May 2015. It consists primarily of three elements:Character cre

In [183]:
for value in baseline_genreated_songs.values():
    print('*' * 50)
    print(value)

**************************************************
genre: Heavy Metal,


The next game in the series will be an RPG with a lot of possibilities for how to create new characters and story that is quite unique. As players become more familiar with what they are doing from previous games (like Persona 4) some things may change - such as bringing up character names or having various types of items on screen at once if you have already completed one quest while being level 50 (it's not clear yet but this idea probably won't make it through until I see my own prototype).I hope anyone interested can keep checking out other projects around the world including "Majima Hachi" because there are so many different ideas floating around these topics throughout The Legend of Zelda World.Thanks!You're logged off Login | Sign Up Log In-By :
A group of developers gathered their creativity into something special called Final Fantasy XIV, which has been releasing since May 2015. It consists primarily of t

In [None]:

sampled_real_songs = {}

for genre in ['Pop', 'Indie', 'Heavy Metal']:
    subset = df[df['genre'] == genre]
    samples = subset.sample(50, random_state=6)
    sampled_real_songs[genre] = samples['lyrics'].tolist()
    

genres = ['Pop', 'Indie', 'Heavy Metal']
semantic_sim_matrix = np.zeros((len(genres), len(genres)))

for i, gen_g in enumerate(genres):

    v_gen = torch.tensor(semantic_model.encode(baseline_genreated_songs[gen_g], convert_to_numpy=True))
    v_gen = v_gen.unsqueeze(0)  
    
    for j, real_g in enumerate(genres):
        sims = []
        for real_song in sampled_real_songs[real_g]:
            v_real = torch.tensor(semantic_model.encode(real_song, convert_to_numpy=True))
            v_real = v_real.unsqueeze(0)
            sim = cosine_similarity(v_gen, v_real, dim=1).item()
            sims.append(sim)
        semantic_sim_matrix[i, j] = np.mean(sims)


df_sem = pd.DataFrame(semantic_sim_matrix, index=[f"Gen {g}" for g in genres], columns=genres)
print(df_sem.round(3))


                   Pop  Indie  Heavy Metal
Gen Pop          0.198  0.179        0.170
Gen Indie        0.236  0.222        0.202
Gen Heavy Metal  0.225  0.181        0.192


## Calculate perplexity score by 

Calculate the perplexity by calulating the loss over the test data

In [None]:
# Same dataset as in genre_model
class LyricsDataset(Dataset):

    def __init__(self, formatted_texts, tokenizer, max_length=768):

        self.input_ids = []
        self.attn_masks = []
        
        print(f"Pre-tokenizing {len(formatted_texts)} songs...")
        for text in tqdm(formatted_texts):
            encodings = tokenizer.encode_plus(
                text,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt'
            )
            self.input_ids.append(encodings['input_ids'].squeeze(0))
            self.attn_masks.append(encodings['attention_mask'].squeeze(0))
        
        print("Tokenization complete")
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_masks[idx],
            'labels': self.input_ids[idx]
        }


In [None]:
def validation_loop(model, val_df, tokenizer, max_length=768, batch_size=8, device="cuda" if torch.cuda.is_available() else "cpu"):

    val_dataset = LyricsDataset(val_df['formatted_text'].tolist(), tokenizer, max_length=max_length)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    model.eval()
    model.to(device)
    
    batch_losses = []

    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc="Val")
        for batch in progress_bar:
            try:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                
                batch_loss = outputs.loss.item()
                batch_losses.append(batch_loss)

                progress_bar.set_postfix({"Batch Loss": f"{batch_loss:.4f}"})
                
            except RuntimeError as e:
                print(f"Error: {e}")
                raise
    
    avg_loss = np.mean(batch_losses)
    perplexity = np.exp(avg_loss)
    
    print(f"Validation completed.Average Loss: {avg_loss:.4f}, Ppl: {perplexity:.4f}")
    
    return perplexity

    

In [None]:
# read in and the data and calculate perplexity

test_df = pd.read_csv('./data/test_split.csv')
print(len(test_df))
test_ppl = validation_loop(model, test_df, tokenizer)
print(f"Test PPL: {test_ppl}")

val_df = pd.read_csv('./data/val_split.csv')
print(len(val_df))
val_ppl = validation_loop(model, val_df, tokenizer)
print(f"Validation PPL: {val_ppl}")


train_df = pd.read_csv('./data2/train_split.csv')
print(len(train_df))
train_ppl = validation_loop(model, train_df, tokenizer)
print(f"Training PPL: {train_ppl}")

test_df.head(5)


3942
Pre-tokenizing 3942 songs...


  0%|          | 0/3942 [00:00<?, ?it/s]

100%|██████████| 3942/3942 [00:08<00:00, 490.76it/s]


Tokenization complete


Validation (Perplexity): 100%|██████████| 493/493 [01:29<00:00,  5.50it/s, Batch Loss=0.7849]


Validation completed. Average Loss: 0.9534, Perplexity: 2.5946
Test PPL: 2.5946287127550067
3941
Pre-tokenizing 3941 songs...


100%|██████████| 3941/3941 [00:07<00:00, 506.82it/s]


Tokenization complete


Validation (Perplexity): 100%|██████████| 493/493 [01:29<00:00,  5.50it/s, Batch Loss=1.0446]


Validation completed. Average Loss: 0.9707, Perplexity: 2.6398
Validation PPL: 2.6397894304066747
31532
Pre-tokenizing 31532 songs...


100%|██████████| 31532/31532 [00:59<00:00, 530.86it/s]


Tokenization complete


Validation (Perplexity): 100%|██████████| 3942/3942 [11:56<00:00,  5.50it/s, Batch Loss=1.0181]


Validation completed. Average Loss: 0.9306, Perplexity: 2.5360
Training PPL: 2.535964428234951


Unnamed: 0,genre,lyrics,token_count,formatted_text
0,Pop,With music by our side\nTo break the color lin...,204,Genre: Pop\n\nWith music by our side\nTo break...
1,Pop,In your bed or in your car\nOn the earth or up...,456,Genre: Pop\n\nIn your bed or in your car\nOn t...
2,Pop,(Verse)\nYou ain't gotta say too much cause I ...,347,Genre: Pop\n\n(Verse)\nYou ain't gotta say too...
3,Pop,Someone's drinking all alone\nSomeone's left t...,147,Genre: Pop\n\nSomeone's drinking all alone\nSo...
4,Heavy Metal,"hey you, I need a friend, I hope you feel the ...",280,"Genre: Heavy Metal\n\nhey you, I need a friend..."
