# Lyrics summarization

## Load data

In [1]:
# mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/MyDrive/PythonWorkspace/Labs/DA_2023'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/PythonWorkspace/Labs/DA_2023


In [2]:
# read data - https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres
import pandas as pd

df = pd.read_csv('lyrics-data.csv')
df.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


## Preprocess data

In [3]:
# filter rows
df.dropna(inplace=True)
df = df.drop_duplicates(subset=['SName'], keep='first')
df = df.drop_duplicates(subset=['Lyric'], keep='first')
lyrics = df.drop(df[df['language'] != 'en'].index)[['SName', 'Lyric']]
lyrics.head()

Unnamed: 0,SName,Lyric
69,Careless Whisper,I feel so unsure\nAs I take your hand and lead...
86,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school..."
88,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c..."
111,Easy,"Know it sounds funny\nBut, I just can't stand ..."
140,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...


In [4]:
# get stats
lyrics = lyrics.iloc[:10000]
lyrics.describe()

Unnamed: 0,SName,Lyric
count,10000,10000
unique,10000,10000
top,Careless Whisper,I feel so unsure\nAs I take your hand and lead...
freq,1,1


In [5]:
import re
import spacy

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    text = ' '.join([c.lower() for c in str(text).split()])
    numbers_patterns = re.compile('[0-9]+[\w]*')
    text = re.sub(numbers_patterns, '', text)
    punctiation_pattern = re.compile('[!-_@#$%^''&*()?`<>;\.,:"]')
    text = re.sub(punctiation_pattern, '', text)
    
    text = ' '.join([w.lemma_ for w in nlp(text) if not w.is_stop if len(w.lemma_) > 2])

    return text

# clean lyrics
lyrics['clean_lyric'] = lyrics['Lyric'].apply(clean_text)
lyrics.head()

Unnamed: 0,SName,Lyric,clean_lyric
69,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,feel unsure hand lead dance floor music die ey...
86,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",not let fool try school get mind hell think ri...
88,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",baby let cruise away not confuse way clear wan...
111,Easy,"Know it sounds funny\nBut, I just can't stand ...",know sound funny not stand pain girl leave tom...
140,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,get look hope lad face beam smile get boast pu...


## Feature extraction

In [6]:
# monkey patch because some library above sets encoding to ansi
import locale
def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [7]:
!pip3 install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import multiprocessing
import gensim
from gensim.test.utils import datapath
from sentence_transformers import SentenceTransformer

In [9]:
# BoW model
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=768, stop_words='english')
bow = bow_vectorizer.fit_transform(lyrics['clean_lyric'])
bow.shape

(10000, 768)

In [10]:
#tfidf model
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=768, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(lyrics['clean_lyric'])
tfidf.shape

(10000, 768)

Hyperparams papers:

1.   https://arxiv.org/abs/1803.09820
2.   http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf


In [11]:
# define hyperparameters to test
vector_sizes = [300, 500, 768]
windows = [3, 5]
results = pd.DataFrame(columns=['vector_size', 'window', 'corr', 'p-value', 'significance', 's p-value', 'unknown %'])

# test each combination of hyperparameters
tokenized_lyrics = lyrics['clean_lyric'].apply(lambda x: x.split())
for vector_size in vector_sizes:
    for window in windows:
        model_w2v = gensim.models.Word2Vec(
            sentences=tokenized_lyrics,
            min_count=5,
            vector_size=vector_size,
            workers=multiprocessing.cpu_count(),
            sg=1,
            window=window,
            negative=10,
            seed=42
        )
        model_w2v.train(tokenized_lyrics, total_examples=len(lyrics['clean_lyric']), epochs=20)
        (corr, pvalue), (significance, spvalue), unknown_perc = model_w2v.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
        results = results.append({'vector_size': vector_size, 'window': window, 'corr': corr, 'p-value': pvalue, 'significance': significance, 's p-value': spvalue, 'unknown %': unknown_perc}, ignore_index=True)
results

  results = results.append({'vector_size': vector_size, 'window': window, 'corr': corr, 'p-value': pvalue, 'significance': significance, 's p-value': spvalue, 'unknown %': unknown_perc}, ignore_index=True)
  results = results.append({'vector_size': vector_size, 'window': window, 'corr': corr, 'p-value': pvalue, 'significance': significance, 's p-value': spvalue, 'unknown %': unknown_perc}, ignore_index=True)
  results = results.append({'vector_size': vector_size, 'window': window, 'corr': corr, 'p-value': pvalue, 'significance': significance, 's p-value': spvalue, 'unknown %': unknown_perc}, ignore_index=True)
  results = results.append({'vector_size': vector_size, 'window': window, 'corr': corr, 'p-value': pvalue, 'significance': significance, 's p-value': spvalue, 'unknown %': unknown_perc}, ignore_index=True)
  results = results.append({'vector_size': vector_size, 'window': window, 'corr': corr, 'p-value': pvalue, 'significance': significance, 's p-value': spvalue, 'unknown %': unkn

Unnamed: 0,vector_size,window,corr,p-value,significance,s p-value,unknown %
0,300.0,3.0,0.451299,3.417083e-13,0.50772,8.396099000000001e-17,33.427762
1,300.0,5.0,0.437488,2.092005e-12,0.458171,1.345003e-13,33.427762
2,500.0,3.0,0.385652,9.432086e-10,0.442799,1.052372e-12,33.427762
3,500.0,5.0,0.414405,3.620711e-11,0.469495,2.762309e-14,33.427762
4,768.0,3.0,0.372889,3.635783e-09,0.435244,2.786903e-12,33.427762
5,768.0,5.0,0.39008,5.825398e-10,0.45254,2.891985e-13,33.427762


In [12]:
model_w2v.wv.most_similar(positive="pain")

[('novacane', 0.337821364402771),
 ('thunderyea', 0.30574867129325867),
 ('migrane', 0.3046170771121979),
 ('migraine', 0.30312874913215637),
 ('mendin', 0.2981855273246765),
 ('regain', 0.28983160853385925),
 ('slain', 0.2872713804244995),
 ('magnify', 0.28586795926094055),
 ('underyea', 0.28165575861930847),
 ('errythe', 0.2814032733440399)]

In [13]:
import numpy as np

# lyric vector as mean of word vectors
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0

    for word in tokens:
        try:
            vec += model_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    
    return vec

In [14]:
wordvec = np.zeros((len(tokenized_lyrics), 768)) 
for i in range(len(tokenized_lyrics)):
    wordvec[i, :] = word_vector(tokenized_lyrics.iloc[i], 768)
wordvec.shape

(10000, 768)

In [15]:
# bert model
bert_model = SentenceTransformer('all-mpnet-base-v2')
bert = bert_model.encode(list(lyrics['clean_lyric']))
bert.shape

(10000, 768)

In [16]:
def find_similar(prompt_vec, embeddings, lyrics, method='cosine', top_n=5, debug=False):
    scores = []
    for ref in embeddings:
        if method == 'cosine': 
            scores.append((1 - np.dot(ref, prompt_vec)/(np.linalg.norm(ref)*np.linalg.norm(prompt_vec))))
        if method == 'norm': 
            scores.append(np.linalg.norm(ref - prompt_vec))
    scores, idxs = np.array(scores), np.argsort(scores)

    top_lyrics = [lyrics[j] for j in idxs[:top_n]]
    top_prob = 1 - scores[idxs[:top_n]]
    if debug:
        for i in range(top_n):
            print(f"{i + 1}. Prob: {round(top_prob[i], 3)}, Lyrics: {top_lyrics[i][:100]}")

    return top_lyrics, top_prob

# analyze the results using the nearest neighbor search
lprompt = "happy sunny day today"
print("Prompt:", lprompt)
for model in ['bow', 'tfidf', 'word2vec', 'bert']:
    print("\nModel:", model)
    prompt = clean_text(lprompt)

    if model == 'bow':
        prompt_vec = bow_vectorizer.transform([prompt,]).toarray()[0]
        embeddings = bow.toarray()
    elif model == 'tfidf':
        prompt_vec = tfidf_vectorizer.transform([prompt,]).toarray()[0]
        embeddings = tfidf.toarray()
    elif model == 'word2vec':
        prompt_vec = word_vector(prompt.split(), 768)[0]
        embeddings = wordvec
    elif model == 'bert':
        prompt_vec = bert_model.encode([prompt,])[0]
        embeddings = bert
    else:
        raise ValueError("Invalid model type. Must be 'bow', 'tfidf', 'word2vec', or 'bert'.")
    
    find_similar(prompt_vec, embeddings, list(lyrics['clean_lyric']), method='cosine', top_n=3, debug=True)

Prompt: happy sunny day today

Model: bow


  scores.append((1 - np.dot(ref, prompt_vec)/(np.linalg.norm(ref)*np.linalg.norm(prompt_vec))))
  scores.append((1 - np.dot(ref, prompt_vec)/(np.linalg.norm(ref)*np.linalg.norm(prompt_vec))))


1. Prob: 0.715, Lyrics: happy day happy day happy day happy day jesus wash jesus wash jesus wash jesus wash jesus wash jesus
2. Prob: 0.583, Lyrics: lisa birthday god bless day gift little sister proud today lisa birthday happy birthday lisa lisa bi
3. Prob: 0.559, Lyrics: happy day happy day happy day happy day jesus wash jesus wash jesus wash jesus wash jesus wash jesus

Model: tfidf
1. Prob: 0.634, Lyrics: lisa birthday god bless day gift little sister proud today lisa birthday happy birthday lisa lisa bi
2. Prob: 0.626, Lyrics: happy day happy day happy day happy day jesus wash jesus wash jesus wash jesus wash jesus wash jesus
3. Prob: 0.624, Lyrics: take ride place get care shut drive mean mean conversation contaminate air planet today need populat

Model: word2vec
1. Prob: 0.728, Lyrics: think day ghetto life fade away cuz lose not find way look forward daywelcome sunny day block not st
2. Prob: 0.66, Lyrics: jackson happy happy kiss lip thousand time time count call stand darkes

## Text summarization

In [17]:
from transformers import pipeline

classifier = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [18]:
# preprocess
def preprocess(text):
    ctext = clean_text(text)
    sentences, csentences = [], []
    for s in text.split('\n'):
        csent = clean_text(s)
        if csent not in csentences and len(csent) > 2:
            sentences.append(s)
            csentences.append(csent)
    return text, sentences, ctext, csentences

# summarize by selecting top n similar sentences to the text
def summarize(text, model='distilbart', n_sentences=5):
    true_text, true_text_sentences, text, text_sentences = preprocess(text)
    
    if model == 'bow':
        text_vec = bow_vectorizer.transform([text,]).toarray()[0]
        text_sentences_vec = bow_vectorizer.transform(text_sentences).toarray()
    elif model == 'tfidf':
        text_vec = tfidf_vectorizer.transform([text,]).toarray()[0]
        text_sentences_vec = tfidf_vectorizer.transform(text_sentences).toarray()
    elif model == 'word2vec':
        text_vec = word_vector(text.split(), 768)[0]
        text_sentences_vec = np.array([word_vector(s.split(), 768)[0] for s in text_sentences])
    elif model == 'bert':
        text_vec = bert_model.encode([true_text,])[0]
        text_sentences_vec = bert_model.encode(true_text_sentences)
    elif model == 'distilbart':
        return classifier(true_text[:1024])[0]['summary_text'].strip()
    else:
        raise ValueError("Invalid model type. Must be 'bow', 'tfidf', 'word2vec', 'bert', or 'distilbart'.")
    
    top_sentences, _ = find_similar(text_vec, text_sentences_vec, true_text_sentences, method='cosine', top_n=n_sentences)
    summary = '. '.join([s for s in top_sentences])

    return summary

lyric_text = list(lyrics['Lyric'])[0]
print(f"Lyrics:\n{lyric_text}")
for model in ['bow', 'tfidf', 'word2vec', 'bert', 'distilbart']:
    print("\nModel:", model)
    summary = summarize(lyric_text, model=model, n_sentences=5)
    print(f"Summary:\n{summary}")

Lyrics:
I feel so unsure
As I take your hand and lead you to the dance floor
As the music dies, something in your eyes
Calls to mind a silver screen
And all those sad goodbyes

I'm never gonna dance again
Guilty feet have got no rhythm
Though it's easy to pretend
I know you're not a fool

Should've known better than to cheat a friend
And waste the chance that I've been given
So I'm never gonna dance again
The way I danced with you

Time can never mend
The careless whispers of a good friend
To the heart and mind
Ignorance is kind
There's no comfort in the truth
Pain is all you'll find

I'm never gonna dance again
Guilty feet have got no rhythm
Though it's easy to pretend
I know you're not a fool

Should've known better than to cheat a friend
And waste this chance that I've been given
So I'm never gonna dance again
The way I danced with you

Never without your love

Tonight the music seems so loud
I wish that we could lose this crowd
Maybe it's better this way
We'd hurt each other with t

In [19]:
!pip3 install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
from collections import Counter
from random import sample
from pprint import pprint

In [22]:
# calculate metrics
def calc_duplicate_n_grams_rate(documents):
    all_ngrams_count = Counter()
    duplicate_ngrams_count = Counter()

    for doc in documents:
        words = doc.split(" ")
        for n in range(1, 5):
            ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
            unique_ngrams = set(ngrams)
            all_ngrams_count[n] += len(ngrams)
            duplicate_ngrams_count[n] += len(ngrams) - len(unique_ngrams)

    return {n: duplicate_ngrams_count[n]/all_ngrams_count[n] if all_ngrams_count[n] else 0.0
            for n in range(1, 5)}

def calc_metrics(model, refs, hyps, metric="all"):
    metrics = dict()
    metrics["model"] = model
    metrics["ref_example"] = refs[-1][:100]
    metrics["hyp_example"] = hyps[-1]
    many_refs = [[r] if r is not list else r for r in refs]

    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(many_refs, hyps)
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = {m: v['f'] for m, v in rouge.get_scores(y_pred, y_true, avg=True).items()}
        metrics.update(scores)
    if metric in ("duplicate_ngrams", "all"):
        metrics["duplicate_ngrams"] = dict()
        metrics["duplicate_ngrams"].update(calc_duplicate_n_grams_rate(hyps))
    metrics["count"] = len(hyps)

    return metrics

y_true = sample(list(lyrics['Lyric']), 100)
results = []
for model in ['bow', 'tfidf', 'word2vec', 'bert', 'distilbart']:
    y_pred = [summarize(lyric_text, model=model, n_sentences=5) for lyric_text in y_true]
    metrics = calc_metrics(model, y_true, y_pred)
    results.append(metrics)

df_results = pd.DataFrame(results)
df_results

  scores.append((1 - np.dot(ref, prompt_vec)/(np.linalg.norm(ref)*np.linalg.norm(prompt_vec))))
  scores.append((1 - np.dot(ref, prompt_vec)/(np.linalg.norm(ref)*np.linalg.norm(prompt_vec))))
Your max_length is set to 142, but you input_length is only 45. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 142, but you input_length is only 113. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Unnamed: 0,model,ref_example,hyp_example,bleu,rouge-1,rouge-2,rouge-l,duplicate_ngrams,count
0,bow,I believe that in time my heart will heal agai...,Cos I know one day I'm gonna feel again. I kno...,1.3e-05,0.2655,0.195536,0.2655,"{1: 0.30387984981226535, 2: 0.1566110397946084...",100
1,tfidf,I believe that in time my heart will heal agai...,Cos I know one day I'm gonna feel again. I kno...,1.1e-05,0.264766,0.195819,0.264766,"{1: 0.3050202839756592, 2: 0.154786680541103, ...",100
2,word2vec,I believe that in time my heart will heal agai...,Cos I know one day I'm gonna feel again. I kno...,4.9e-05,0.296019,0.218774,0.296019,"{1: 0.2715023579609252, 2: 0.12566046404778314...",100
3,bert,I believe that in time my heart will heal agai...,And I believe that in time my heart will feel ...,2.2e-05,0.289515,0.207228,0.289515,"{1: 0.24518993586581153, 2: 0.1107738998482549...",100
4,distilbart,I believe that in time my heart will heal agai...,I believe that in time my heart will heal agai...,0.000212,0.282873,0.179372,0.265442,"{1: 0.20144294664894627, 2: 0.0327075672537255...",100
