In [118]:
import os
import pandas as pd
import numpy as np

from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel


from pathlib import Path

In [119]:
# Fine-tuned checkpoint file, resolved relative to the current working directory.
model_path = Path.cwd().joinpath("models", "finetuned_distilgpt2_with_tokens.pt")
print(model_path)

/home/isachansson/DML-project/notebooks/models/finetuned_distilgpt2_with_tokens.pt


In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model_name = "distilgpt2"

# Reuse the end-of-sequence token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Build the base architecture, then overwrite its weights with the checkpoint.
# `weights_only=True` limits unpickling to tensors and plain containers, which
# is all a state dict needs and avoids executing arbitrary pickled code.
model = GPT2LMHeadModel.from_pretrained(model_name)
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # switch to evaluation mode; as the last expression it also displays the module tree

cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Get generated songs

In [120]:
# One instruction-style prompt per genre; `prompts` and `genres` are paired by position.
prompts = [
    "[Heavy Metal] Generate Heavy Metal lyrics",
    "[Indie] Generate Indie lyrics",
    "[POP] Generate Pop lyrics",
]
genres = ['Heavy Metal', 'Indie', 'Pop']
assert len(prompts) == len(genres), "each prompt needs exactly one genre label"

# Pre-seeded so the key order (and therefore the display order) stays fixed.
genreated_songs = {
    'Heavy Metal' : '',
    'Pop' : '',
    'Indie' : ''
}

# NOTE: sampling is enabled and no seed is set in this cell, so the generated
# text differs from run to run.
for prompt, genre in zip(prompts, genres):
    # Calling the tokenizer (instead of `encode`) also yields the attention
    # mask. It is passed explicitly because the pad id equals the EOS id here,
    # so `generate` has no reliable way to infer it from the input ids.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoded["input_ids"]
    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            do_sample=True,
            max_new_tokens=150,
            top_p=0.9,
            top_k=40,
            temperature=0.8,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
        )
    # The decoded text still begins with the prompt itself.
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    genreated_songs[genre] = text

genreated_songs

{'Heavy Metal': '[Heavy Metal] Generate Heavy Metal lyrics.\n\n\n[Solo: P. Wagner, R. Schmidt]\n\n[Solo: M. Weikal/Lyrics: E. Holopainen - S. Hanneman & D. van Goghling]',
 'Pop': "[POP] Generate Pop lyrics.\n\n\n\nWhen you're with me I'm always on the same page\nThere's a message that we all can see through\nSo take care of it, and be aware\nOf your life when there's no one else around\nYou know how to run away from what hurts\nWe'll make love tonight for our children\nBut don't let them go again (oh, yeah)\nI won 'cause they will!\n\nOhh ohh ohh ohh ohh ohh ohhh, ohh ohh, ohh ohh ohh ohh, ohh ohh, ohh ohh...\nAnd now my heart is beating like a drum, cause nothing but pain starts to break\nCause everything",
 'Indie': '[Indie] Generate Indie lyrics for the magazine, "Nashville"\n\nWell I\'m gonna make some noise tonight - uh oh\nCause you can\'t hear my voice when I speak my mind.\nIt\'s such a good time and everybody around is here watching us talk.\n\nI hope that it don´t feel so ba

### Get a random song from the data, one for each genre

In [115]:
# The dataset directory is a sibling of the current working directory.
# `Path.cwd()` matches how `model_path` is resolved earlier in the notebook.
data_path = Path.cwd().parent / 'data'

# Join with `/` so pathlib builds the path instead of string formatting.
df = pd.read_csv(data_path / 'lyrics_filtered_768tokens.csv')

# Class-balance check: number of rows per genre.
print(df['genre'].value_counts())


genre
Heavy Metal    13378
Pop            13128
Indie          12909
Name: count, dtype: int64


In [127]:
# Bare genre tags as prompts; `prompts` and `genres` are paired by position.
# NOTE(review): "[POP]" has no trailing space, unlike the other two prompts -
# confirm which form matches the format used during fine-tuning.
prompts = ["[Heavy Metal] ", "[Indie] ", "[POP]"]
genres = ['Heavy Metal', 'Indie', 'Pop']

# Pre-seeded so the key order stays fixed regardless of generation order.
genreated_songs = {
    'Heavy Metal' : '',
    'Pop' : '',
    'Indie' : ''
}

# NOTE: sampling is enabled and no seed is set in this cell, so the generated
# text (and every similarity score derived from it) varies between runs.
for prompt, genre in zip(prompts, genres):
    # The tokenizer call returns the attention mask alongside the ids. It is
    # passed explicitly because the pad id equals the EOS id here, so
    # `generate` has no reliable way to infer it.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoded["input_ids"]

    # A single inference_mode context is enough; nesting it twice added nothing.
    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            do_sample=True,
            max_new_tokens=150,
            top_p=0.9,
            top_k=40,
            temperature=1.0,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
        )
    # The decoded text still begins with the prompt tag.
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    genreated_songs[genre] = text
# Fixed-seed draw of 50 real lyrics per genre, stored as plain lists of strings.
sampled_real_songs = {
    genre: df[df['genre'] == genre].sample(n=50, random_state=6)['lyrics'].tolist()
    for genre in ['Pop', 'Indie', 'Heavy Metal']
}
    
from torch.nn.functional import cosine_similarity
def get_hidden_representation(text: str, model, tokenizer, device="cpu"):
    """Embed `text` as the attention-masked mean of the model's final hidden states.

    Args:
        text: Text to embed. It is tokenized with truncation at 768 tokens.
        model: Callable that accepts the tokenizer output as keyword arguments
            together with `output_hidden_states=True` and returns an object
            exposing `hidden_states`.
        tokenizer: Callable whose result supports `.to(device)`, `**` unpacking
            and lookup of the `"attention_mask"` entry.
        device: Device the encoded inputs are moved to before the forward pass.

    Returns:
        The last entry of `hidden_states`, averaged over axis 1 using only
        positions where the attention mask is non-zero (assumes hidden states
        are laid out as (batch, sequence, features) - TODO confirm).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=768,
        padding=True,
    )
    encoded = encoded.to(device)

    # No gradients are needed for a pure feature-extraction pass.
    with torch.no_grad():
        model_out = model(**encoded, output_hidden_states=True)
        final_layer = model_out.hidden_states[-1]

        # Trailing singleton axis lets the mask broadcast across features.
        keep = encoded["attention_mask"].unsqueeze(-1)
        masked_total = (final_layer * keep).sum(1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        kept_count = keep.sum(1).clamp(min=1e-9)
        pooled = masked_total / kept_count
    return pooled

genres = ['Pop', 'Indie', 'Heavy Metal']

# Embed every sampled real song exactly once. These vectors do not depend on
# which generated song they are compared against, so computing them inside the
# generated-genre loop would repeat each forward pass once per genre.
real_vectors = {
    real_g: [
        get_hidden_representation(real_song, model, tokenizer, device)
        for real_song in sampled_real_songs[real_g]
    ]
    for real_g in genres
}

# Rows: genre of the generated song. Columns: genre of the real songs.
similarity_matrix = np.zeros((len(genres), len(genres)))

for i, gen_g in enumerate(genres):
    v_gen = get_hidden_representation(genreated_songs[gen_g], model, tokenizer, device)

    for j, real_g in enumerate(genres):
        # Mean cosine similarity between the generated song and each real song.
        sims = [
            cosine_similarity(v_gen, v_real, dim=1).item()
            for v_real in real_vectors[real_g]
        ]
        similarity_matrix[i, j] = np.mean(sims)

df_sim = pd.DataFrame(similarity_matrix, index=[f"Gen {g}" for g in genres], columns=genres)

print(df_sim.round(3))

                   Pop  Indie  Heavy Metal
Gen Pop          0.642  0.340       -0.053
Gen Indie        0.574  0.269       -0.129
Gen Heavy Metal  0.210  0.384        0.611


In [128]:
# Bare genre tags as prompts; `prompts` and `genres` are paired by position.
# NOTE(review): "[POP]" has no trailing space, unlike the other two prompts -
# confirm which form matches the format used during fine-tuning.
prompts = ["[Heavy Metal] ", "[Indie] ", "[POP]"]
genres = ['Heavy Metal', 'Indie', 'Pop']

# Pre-seeded so the key order stays fixed regardless of generation order.
genreated_songs = {
    'Heavy Metal' : '',
    'Pop' : '',
    'Indie' : ''
}

# NOTE: sampling is enabled and no seed is set in this cell, so the generated
# text (and every similarity score derived from it) varies between runs.
for prompt, genre in zip(prompts, genres):
    # The tokenizer call returns the attention mask alongside the ids. It is
    # passed explicitly because the pad id equals the EOS id here, so
    # `generate` has no reliable way to infer it.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoded["input_ids"]

    # A single inference_mode context is enough; nesting it twice added nothing.
    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            do_sample=True,
            max_new_tokens=150,
            top_p=0.9,
            top_k=40,
            temperature=1.0,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
        )
    # The decoded text still begins with the prompt tag.
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    genreated_songs[genre] = text
# Collect 50 real lyrics per genre; the fixed seed makes the draw repeatable.
sampled_real_songs = {}

for genre in ('Pop', 'Indie', 'Heavy Metal'):
    genre_rows = df.loc[df['genre'] == genre]
    drawn = genre_rows.sample(n=50, random_state=6)
    sampled_real_songs[genre] = list(drawn['lyrics'])
    
# --- Semantic comparison version ---
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
from torch.nn.functional import cosine_similarity

# Load pretrained semantic embedding model
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

genres = ['Pop', 'Indie', 'Heavy Metal']

# Encode each genre's real songs once, as a single batch. `encode` on a list
# returns one row per song, so every genre ends up with an (n_songs, dim)
# matrix that is reused for all generated songs instead of re-encoding each
# real song once per generated genre.
real_embeddings = {
    real_g: torch.tensor(
        semantic_model.encode(sampled_real_songs[real_g], convert_to_numpy=True)
    )
    for real_g in genres
}

# Rows: genre of the generated song. Columns: genre of the real songs.
semantic_sim_matrix = np.zeros((len(genres), len(genres)))

for i, gen_g in enumerate(genres):

    v_gen = torch.tensor(semantic_model.encode(genreated_songs[gen_g], convert_to_numpy=True))
    v_gen = v_gen.unsqueeze(0)  # (1, dim) so it broadcasts against (n_songs, dim)

    for j, real_g in enumerate(genres):
        # One cosine similarity per real song, averaged in float64 via numpy.
        sims = cosine_similarity(v_gen, real_embeddings[real_g], dim=1).tolist()
        semantic_sim_matrix[i, j] = np.mean(sims)


df_sem = pd.DataFrame(semantic_sim_matrix, index=[f"Gen {g}" for g in genres], columns=genres)
print("\n💬 Semantic similarity matrix (SentenceTransformer):")
print(df_sem.round(3))



💬 Semantic similarity matrix (SentenceTransformer):
                   Pop  Indie  Heavy Metal
Gen Pop          0.356  0.294        0.255
Gen Indie        0.307  0.264        0.250
Gen Heavy Metal  0.284  0.306        0.371


In [31]:
# Show the column names of the loaded dataset as a plain Python list.
print(list(df.columns))

['genre', 'lyrics', 'token_count', 'formatted_text']


In [129]:
# Print each generated song in full, in the dictionary's key order.
for song_genre in genreated_songs:
    print(genreated_songs[song_genre])

[Heavy Metal] Â

In the dark I see the future and I think about it)
I am the one that can save you, to save me

And if this is our end, would we just die in fear?
If this is our end, should we have the chance to stop the madness?
What a wonderful world!

No more killing, no more killing, no more living hell, yeah

I am the one that will save your life
From your fear of dying, from your fears of dying

So let me give you my heart
My love, my life
I will not die again

And what's happening now, do you believe it?
The world is changing, but don't
[POP]
I got a long list of reasons, why)
I've been getting older (yeah), my heart's getting too fast
I've been feeling down and down again (yeah), I'm getting younger
(oh, oh, oh, oh)
I don't want you (no, no)
(I don't want you)
I don't want you (no, no)


(No-one could ever understand)
Why did I have to do it like this?
It's hard to see the future in me, yeah...
And there's so many things that must change (to be different)
So much more than the 