In [1]:
import numpy as np
import pandas as pd

In [2]:
def clean_text(text, stemm=True):
    # split into words
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)

    # convert to lower case
    tokens = [w.lower() for w in tokens]

    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]

    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    if not stemm:
        return sorted(words)
    
    # stemming of words
    from nltk.stem.porter import PorterStemmer
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    
    return sorted(stemmed)

In [3]:
def generate_vocab(texts):
    words = set()
    for text in texts:
        words.update(text)         
    vocab = sorted(list(words))
    return vocab

In [4]:
def generate_bow(text, vocab):
    bag = np.zeros(len(vocab), dtype='int')
    words, c = np.unique(text, return_counts=True)
    for i, w in enumerate(words):
        for j, word in enumerate(vocab):
            if word == w: 
                bag[j] += c[i]
    return bag

In [5]:
def genereate_tbd(texts):
    vocab = generate_vocab(texts)
    A = np.zeros((len(vocab),0))

    for i, text in enumerate(texts):
        A = np.insert(A, i, np.transpose(generate_bow(text, vocab)), axis=1)
    
    return vocab, A

In [6]:
data = pd.read_csv('songdata.csv')
data.head()

Unnamed: 0,artist,song,text
0,ABBA,All I Ask,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [7]:
texts = data['text'].values[:1000]

In [8]:
cleaned_texts = np.array([clean_text(text, stemm=True) for text in texts])

In [9]:
vocab, A = genereate_tbd(cleaned_texts)

In [10]:
np.savetxt('vocab.txt', vocab, delimiter=',', fmt='%s')
np.savetxt('A.txt', A, delimiter=',', fmt='%d')

In [11]:
vocab = np.genfromtxt('vocab.txt',dtype='str').tolist()
A = np.genfromtxt('A.txt', delimiter=',', dtype='int')

In [12]:
def IDF(bag):
    N = bag.shape[1]
    nw = bag.shape[0] - np.count_nonzero(bag == 0, axis=1)
    idf = np.multiply(bag, np.reshape(np.log(N / nw), (len(vocab),1)))
    return np.abs(idf)

In [13]:
def normalize_matrix(A):
    normalized = A / np.apply_along_axis(np.linalg.norm, 0, A)
    return normalized

def normalize_vector(v):
    normalized = v / np.linalg.norm(v)
    return normalized

In [14]:
k = 5
search = "You are the Dancing Queen, young and sweet, only seventeen"

cleaned_search = clean_text(search, stemm=True)
print("cleaned text\n", cleaned_search)

q = np.reshape(generate_bow(cleaned_search, vocab), (1, len(vocab)))

cleaned text
 ['danc', 'queen', 'seventeen', 'sweet', 'young']


In [15]:
idf_A = IDF(A)

In [16]:
corr = normalize_vector(q).dot(normalize_matrix(A))
idf_corr = normalize_vector(q).dot(normalize_matrix(idf_A))

In [17]:
display(data.iloc[np.flip(np.argsort(corr)[0][-k:])])

print("IDF")

display(data.iloc[np.flip(np.argsort(idf_corr)[0][-k:])])

Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
527,Alice In Chains,Queen Of The Rodeo,My daddy was a trucker \nLeft home when I was...
888,Avril Lavigne,17,He was working at the record shop \nI would k...
53,ABBA,Love Isn't Easy,"Do you remember the first time, and all of you..."


IDF


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
527,Alice In Chains,Queen Of The Rodeo,My daddy was a trucker \nLeft home when I was...
888,Avril Lavigne,17,He was working at the record shop \nI would k...
53,ABBA,Love Isn't Easy,"Do you remember the first time, and all of you..."


In [18]:
def low_rank_approx(A=None, r=1):
    SVD = np.linalg.svd(A, full_matrices=False)
    u, s, v = SVD
    Ar = np.zeros((len(u), len(v)))
    for i in range(r):
        Ar += s[i] * np.outer(u.T[i], v[i])
    return Ar

In [19]:
for r in range(10, 101, 10):
    Ak = low_rank_approx(A, r)
    idf_Ak = IDF(Ak)

    kcorr = normalize_vector(q).dot(normalize_matrix(Ak))
    idf_kcorr = normalize_vector(q).dot(normalize_matrix(idf_Ak))

    print("r = {}".format(r))
    
    display(data.iloc[np.flip(np.argsort(kcorr)[0][-k:])])
    
    print("IDF")

    display(data.iloc[np.flip(np.argsort(idf_kcorr)[0][-k:])])

r = 10


Unnamed: 0,artist,song,text
124,Ace Of Base,Who Am I,I was born in the shadows \nUnder the Iberian...
890,Avril Lavigne,Anything But Ordinary,Sometimes I get so weird \nI even freak mysel...
327,Air Supply,Someone,When you first found love \nWas it all that y...
609,Alphaville,Forever Young,"Let's start in style, let's dance for a while,..."
355,Aiza Seguerra,Open Arms,"Intro: \n \nLying beside you, here in the da..."


IDF


Unnamed: 0,artist,song,text
513,Alice Cooper,Poison,"Your cruel device \nYour blood, like ice \nO..."
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
818,Arlo Guthrie,Percy's Song,"Bad news, bad news, come to me where I sleep ..."
287,Air Supply,I Want You,"If it's love that you want, there will always ..."
899,Avril Lavigne,Fall To Pieces,I looked away \nThen I looked back at you \n...


r = 20


Unnamed: 0,artist,song,text
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
736,Andrea Bocelli,Sogno,SOGNO \n \nVa ti aspettero' \nII fiore nel ...
411,Alabama,If I Could Just See You Now,Your love for me is gone \nBut my love for yo...
355,Aiza Seguerra,Open Arms,"Intro: \n \nLying beside you, here in the da..."


IDF


Unnamed: 0,artist,song,text
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
736,Andrea Bocelli,Sogno,SOGNO \n \nVa ti aspettero' \nII fiore nel ...
411,Alabama,If I Could Just See You Now,Your love for me is gone \nBut my love for yo...
355,Aiza Seguerra,Open Arms,"Intro: \n \nLying beside you, here in the da..."


r = 30


Unnamed: 0,artist,song,text
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


IDF


Unnamed: 0,artist,song,text
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


r = 40


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


IDF


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
712,Amy Grant,Father,"Father, blessed Father it's morning again \nI..."


r = 50


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...


IDF


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


r = 60


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...


IDF


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
349,Aiza Seguerra,I See You Lord,I'm so blessed my Lord \nI can see you \nIn ...
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


r = 70


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


IDF


Unnamed: 0,artist,song,text
753,Andy Williams,My Sweet Lord,"My sweet Lord \nHmm, my Lord \nHmm, my Lord ..."
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
732,Amy Grant,Sweet Will Of God,My stubborn will at last hath yielded \nI wou...


r = 80


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
10,ABBA,Dance,"Oh, my love it makes me sad. \nWhy did things..."
256,Air Supply,Dancing With The Mountain,Hot sun as warm as the kissing of the wind \n...


IDF


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
10,ABBA,Dance,"Oh, my love it makes me sad. \nWhy did things..."
256,Air Supply,Dancing With The Mountain,Hot sun as warm as the kissing of the wind \n...


r = 90


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
10,ABBA,Dance,"Oh, my love it makes me sad. \nWhy did things..."
457,Alan Parsons Project,The Fall Of The House Of Usher - I Prelude,"[instrumental] \n \n(I) Prelude \n \n""Shad..."


IDF


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
10,ABBA,Dance,"Oh, my love it makes me sad. \nWhy did things..."
457,Alan Parsons Project,The Fall Of The House Of Usher - I Prelude,"[instrumental] \n \n(I) Prelude \n \n""Shad..."


r = 100


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
10,ABBA,Dance,"Oh, my love it makes me sad. \nWhy did things..."
457,Alan Parsons Project,The Fall Of The House Of Usher - I Prelude,"[instrumental] \n \n(I) Prelude \n \n""Shad..."


IDF


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
96,ABBA,The Piper,They came from the hills \nAnd they came from...
10,ABBA,Dance,"Oh, my love it makes me sad. \nWhy did things..."
457,Alan Parsons Project,The Fall Of The House Of Usher - I Prelude,"[instrumental] \n \n(I) Prelude \n \n""Shad..."


In [20]:
def find(vocab, A, search, k=5, stemm=True, idf=False, svd=False, r=80):
    cleaned_search = clean_text(search, stemm=stemm)
    
    q = np.reshape(generate_bow(cleaned_search, vocab), (1, len(vocab)))
    
    if not svd:
        if not idf:
            corr = normalize_vector(q).dot(normalize_matrix(A))
            return np.flip(np.argsort(corr)[0][-k:])
        else:
            idf_A = IDF(A)
            idf_corr = normalize_vector(q).dot(normalize_matrix(idf_A))
            return np.flip(np.argsort(idf_corr)[0][-k:])
    else:
        Ak = low_rank_approx(A, r)
        if not idf:
            kcorr = normalize_vector(q).dot(normalize_matrix(Ak))
            return np.flip(np.argsort(kcorr)[0][-k:])
        else:
            idf_Ak = IDF(Ak)
            idf_kcorr = normalize_vector(q).dot(normalize_matrix(idf_Ak))
            return np.flip(np.argsort(idf_kcorr)[0][-k:])     

In [23]:
search = "You are the Dancing Queen, young and sweet, only seventeen"
findings = find(vocab, A, search, 5)
print(findings)
display(data.iloc[findings])

[ 74  11 527 888  53]


Unnamed: 0,artist,song,text
74,ABBA,Reina Danzante,You can dance \nYou can jive \nHaving the ti...
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o..."
527,Alice In Chains,Queen Of The Rodeo,My daddy was a trucker \nLeft home when I was...
888,Avril Lavigne,17,He was working at the record shop \nI would k...
53,ABBA,Love Isn't Easy,"Do you remember the first time, and all of you..."
