# Vectorization

## Elmo Embedder

In [1]:
from allennlp.commands.elmo import ElmoEmbedder
import spacy
import numpy as np

elmo = ElmoEmbedder()
nlp = spacy.load('en_core_web_sm')

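# Earlier variant, kept for reference: it collapsed all documents into a
# single mean vector instead of returning one averaged vector per document.
# def average_documents(document_list):
#     individual_averages = []
#     for doc_text in document_list:
#         doc = nlp(doc_text)
#         tensor = elmo.embed_sentence([token.text for token in doc])
#         individual_averages.append(np.mean(tensor[2], axis=0))
#     return np.mean(np.array(individual_averages), axis=0)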

def average_documents(document_list):
    """Return one averaged ELMo vector per document.

    Every text is tokenized with the module-level spaCy pipeline ``nlp``;
    the token strings are fed to ``elmo.embed_sentence`` and entry 2 of the
    result is averaged along axis 0.

    Args:
        document_list: iterable of raw document strings.

    Returns:
        A list holding one averaged array per input document, in input order.
    """
    def _mean_vector(text):
        words = [tok.text for tok in nlp(text)]
        embedding = elmo.embed_sentence(words)
        # NOTE(review): index 2 is presumably the top ELMo layer and axis 0
        # the token axis -- confirm against the AllenNLP ElmoEmbedder docs.
        return np.mean(embedding[2], axis=0)

    return [_mean_vector(text) for text in document_list]

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## Vectorize the Team Press Documents

In [2]:
import pickle

# Restore the previously pickled inputs: `names` and `text_lists`.
with open('names.pkl', 'rb') as names_file:
    names = pickle.load(names_file)

with open('text_lists.pkl', 'rb') as texts_file:
    text_lists = pickle.load(texts_file)

In [24]:
# Clean the lists: within every inner list keep only the texts whose length
# is at least 50; the outer structure (one inner list per input list) is kept.
text_lists_clean = [
    [text for text in text_list if len(text) >= 50]
    for text_list in text_lists
]

In [26]:
from tqdm import tqdm

# Embed every cleaned text list; each entry of press_vectors is the list of
# per-document averages returned by average_documents.
press_vectors = []
for cleaned_texts in tqdm(text_lists_clean):
    document_vectors = average_documents(cleaned_texts)
    press_vectors.append(document_vectors)
    # The full list is re-dumped after every iteration -- presumably a
    # checkpoint so a long run can be resumed/inspected if it is interrupted.
    with open('press_vectors.pkl', 'wb') as checkpoint:
        pickle.dump(press_vectors, checkpoint)

100%|██████████| 68/68 [1:49:14<00:00, 76.80s/it]   


In [27]:
import pickle

# Reload the per-document vectors written by the embedding loop.
with open('press_vectors.pkl', 'rb') as vectors_file:
    press_vectors = pickle.load(vectors_file)

In [28]:
# Reduce each entry of press_vectors (a list of document vectors) to a single
# vector by averaging over the documents (axis 0).
press_averages = [
    np.mean(np.array(document_vectors), axis=0)
    for document_vectors in press_vectors
]
press_averages

[array([ 0.17897075, -0.13472624, -0.28860024, ..., -0.25570223,
         0.19174615,  0.38612315], dtype=float32),
 array([ 0.16407768, -0.15236291, -0.45347825, ..., -0.20872222,
         0.05152787,  0.48828205], dtype=float32),
 array([ 0.03351332,  0.0100759 , -0.33813605, ..., -0.16992773,
         0.08054047,  0.30680418], dtype=float32),
 array([ 0.16759877, -0.04871371, -0.2617827 , ..., -0.19115308,
         0.27005473,  0.42724633], dtype=float32),
 array([ 0.10757665,  0.12121792, -0.43731433, ..., -0.14854185,
        -0.1561414 ,  0.33163977], dtype=float32),
 array([ 0.08144642, -0.16864747, -0.3840909 , ..., -0.30881295,
         0.05589694,  0.5249117 ], dtype=float32),
 array([-0.02217656, -0.1427221 , -0.45624396, ..., -0.27478608,
         0.01590006,  0.49095   ], dtype=float32),
 array([ 0.05544323,  0.02304536, -0.52420545, ..., -0.32586262,
         0.19809489,  0.29474694], dtype=float32),
 array([-0.01539924, -0.1306188 , -0.3882396 , ..., -0.291185  ,
       

In [29]:
# Sanity check: there should be one averaged vector per entry of press_vectors.
len(press_averages)

68

In [38]:
# Persist the averaged press vectors so the ranking step can reload them.
with open('press_averages.pkl', 'wb') as averages_file:
    pickle.dump(press_averages, averages_file)

## Vectorize the Songs

In [30]:
# (title, lyrics) pairs. The title string is reused verbatim as the CSV file
# name when the rankings are written, so edit titles with care.
songs = [
    ("We Are the Champions - Queen", "I've paid my dues, Time after time. I've done my sentence, But committed no crime. And bad mistakes, I've made a few. I've had my share of sand kicked in my face But I've come through. We are the champions, my friends. And we'll keep on fighting 'til the end. We are the champions. We are the champions. No time for losers because we are the champions of the world. I've taken my bows, And my curtain calls. You brought me fame and fortune and everything that goes with it; I thank you all. But it's been no bed of roses. No pleasure cruise. I consider it a challenge before the whole human race.  And I ain't gonna lose. We are the champions, my friends. And we'll keep on fighting 'til the end. We are the champions. We are the champions. No time for losers 'Cause we are the champions of the world. We are the champions, my friends. And we'll keep on fighting 'til the end. We are the champions. We are the champions. No time for losers 'Cause we are the champions. "),
    ("Eye of the Tiger - Survivor", "Risin' up, back on the street Did my time, took my chances Went the distance, now I'm back on my feet Just a man and his will to survive So many times, it happens too fast You trade your passion for glory Don't lose your grip on the dreams of the past You must fight just to keep them alive It's the eye of the tiger, it's the thrill of the fight Risin' up to the challenge of our rival And the last known survivor stalks his prey in the night And he's watchin' us all with the eye of the tiger Face to face, out in the heat Hangin' tough, stayin' hungry They stack the odds 'till we take to the street For the kill with the skill to survive It's the eye of the tiger, it's the dream of the fight Risin' up to the challenge of our rival And the last known survivor stalks his prey in the night And he's watchin' us all with the eye of the tiger Risin' up, straight to the top Had the guts, got the glory Went the distance, now I'm not gonna stop Just a man and his will to survive It's the eye of the tiger, it's the dream of the fight Risin' up to the challenge of our rival And the last known survivor stalks his prey in the night And he's watchin' us all with the eye of the tiger The eye of the tiger The eye of the tiger The eye of the tiger The eye of the tiger"),
    ("Radar Love - Golden Earing", "I've been drivin' all night, my hand's wet on the wheel There's a voice in my head that drives my heel It's my baby callin', says I need you here And it's a half past four and I'm shiftin' gear When she is lonely and the longing gets too much She sends a cable comin' in from above Don't need no phone at all We've got a thing that's called radar love We've got a wave in the air, radar love The radio is playing some forgotten song Brenda Lee's comin' on strong The road has got me hypnotized And I'm speedin' into a new sunrise When I get lonely and I'm sure I've had enough She sends her comfort comin' in from above We don't need no letter at all We've got a thing that's called radar love We've got a light in the sky, radar love No more speed, I'm almost there Gotta keep cool now, gotta take care Last car to pass, here I go And the line of cars drove down real slow And the radio played that forgotten song Brenda Lee's comin' on strong And the newsman sang his same song Oh one more radar lover gone When I get lonely and I'm sure I've had enough She sends her comfort comin' in from above We don't need no letter at all We've got a thing that's called radar love We've got a light in the sky We've got a thing that's called radar love We've got a thing that's called radar love")
]

In [33]:
# Each song's lyrics are embedded as a single document, so the outer mean is
# taken over a one-element list of document vectors.
song_averages = [
    np.mean(np.array(average_documents([song_lyrics])), axis=0)
    for _title, song_lyrics in songs
]

In [34]:
# Inspect the dimensionality of a single song vector.
song_averages[0].shape

(1024,)

In [35]:
import pickle

# Persist the song vectors so the ranking step can reload them.
with open('song_averages.pkl', 'wb') as averages_file:
    pickle.dump(song_averages, averages_file)

## Ranking

In [40]:
import pickle

# Reload everything the ranking needs from disk: the song vectors, the
# press vectors and the pickled `names`.
with open('song_averages.pkl', 'rb') as song_file:
    song_averages = pickle.load(song_file)

with open('press_averages.pkl', 'rb') as press_file:
    press_averages = pickle.load(press_file)

with open('names.pkl', 'rb') as names_file:
    names = pickle.load(names_file)

In [44]:
from pathlib import Path

from scipy.spatial.distance import cosine
import pandas as pd

# Rank every team against every song and write one CSV per song.
#
# scipy's `cosine` returns the cosine *distance* (1 - similarity). It is
# converted to a similarity here so that 'Score' means "higher is better" and
# the descending sort puts the best-matching team first; sorting the raw
# distance in descending order would list the least similar team first.
rankings_dir = Path('Rankings')
# DataFrame.to_csv does not create missing parent directories.
rankings_dir.mkdir(parents=True, exist_ok=True)

for (song_name, _lyrics), song_average in zip(songs, song_averages):
    data = [
        (name, 1.0 - cosine(song_average, press_vec))
        for name, press_vec in zip(names, press_averages)
    ]
    ranking = pd.DataFrame(data, columns=['Team', 'Score'])
    ranking = ranking.sort_values('Score', ascending=False)
    # The song title doubles as the file name.
    ranking.to_csv(rankings_dir / f'{song_name}.csv', index=False)
    

## Scratch