In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
import spacy
nlp = spacy.load("en_core_web_md")

#### - import artist_1 and artist_2 csv

In [None]:
artist_1 = pd.read_csv('artist_1.csv', index_col=0)
artist_2 = pd.read_csv('artist_2.csv', index_col=0)

In [None]:
artist = pd.concat([artist_1, artist_2], axis=0)
artist

Unnamed: 0,song_title,l_link,lyrics,artist_name
0,Alligator,https://www.lyrics.com//lyric/36182049/Of+Mons...,Hey Hey I see color raining down Feral feelin...,Of Monsters and Men
2,Ahay,https://www.lyrics.com//lyric/36406257/Of+Mons...,I know that I'm wrong And now we're both sitti...,Of Monsters and Men
3,Róróró,https://www.lyrics.com//lyric/36406262/Of+Mons...,I am up with the sunrise I'm tired and I think...,Of Monsters and Men
4,Waiting for the Snow,https://www.lyrics.com//lyric/36406261/Of+Mons...,I'm waiting for the snow I'm waiting for visio...,Of Monsters and Men
5,"Vulture, Vulture",https://www.lyrics.com//lyric/36406260/Of+Mons...,"He lives in the darkness, he's calling my name...",Of Monsters and Men
...,...,...,...,...
157,where we are [Acoustic],https://www.lyrics.com//lyric-lf/7526166/The+L...,Where we are (where we are) I don't know where...,The Lumineers
163,Democracy,https://www.lyrics.com//lyric/36521451/The+Lum...,It's coming through a hole in the air From tho...,The Lumineers
167,Old Lady,https://www.lyrics.com//lyric/36521460/The+Lum...,An old lady crosses the street And as she wave...,The Lumineers
168,Soundtrack Song,https://www.lyrics.com//lyric/36521450/The+Lum...,"Loneliness, oh won't you let me be Let me be a...",The Lumineers


#### - Lemmatization

In [172]:
# define lemmatizer to extract words from the lyric
def lemmatizer(text):
    
    lemma_list = []
    nlp = spacy.load("en_core_web_md")
    tokens = nlp(text)
    
    for token in tokens:
        if token.is_stop is False and token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and token.has_vector is True and token.pos_ != 'CCONJ' and token.like_num is False:
            lemma_list.append(token.lemma_)
    return lemma_list

In [None]:
# list all lyrics text
songs = artist['lyrics'].values
songs

In [135]:
# test lemmatizer with a song
song2 = songs[2]

In [174]:
lemmatizer(songs[2])

['sunrise',
 'tired',
 'think',
 'want',
 'home',
 'let',
 'start',
 'hear',
 'call',
 'cry',
 'run',
 'house',
 'catch',
 'think',
 'oh',
 'shame',
 'know',
 'open',
 'arm',
 'hold',
 'oh',
 'shame',
 'row',
 'edge',
 'fall',
 'blame',
 'know',
 'open',
 'arm',
 'close',
 'oh',
 'shame',
 'row',
 'edge',
 'fall',
 'center',
 'place',
 'field',
 'poppy',
 'dream',
 'away',
 'know',
 'get',
 'run',
 'hear',
 'roaring',
 'pulse',
 'sound',
 'run',
 'house',
 'follow',
 'dare',
 'oh',
 'shame',
 'know',
 'open',
 'arm',
 'hold',
 'oh',
 'shame',
 'row',
 'edge',
 'fall',
 'blame',
 'know',
 'open',
 'arm',
 'close',
 'oh',
 'shame',
 'row',
 'edge',
 'fall',
 'ready',
 'ready',
 'want',
 'want',
 'true',
 'ready',
 'oh',
 'shame',
 'need',
 'need',
 'true',
 'ready',
 'oh',
 'shame',
 'want',
 'want',
 'true',
 'ready',
 'oh',
 'shame',
 'need',
 'need',
 'true',
 'ready']

#### - Bag of words (bow)

In [55]:
# Use countvectorizer to analyze words and count all words for all songs in the artist
bow = CountVectorizer(tokenizer=lemmatizer, analyzer='word')

In [56]:
X_cv = artist['lyrics']

In [57]:
bow.fit(X_cv)

CountVectorizer(tokenizer=<function lemmatizer at 0x7fdbd17e0b80>)

In [58]:
doc_term_mat = bow.transform(X_cv)
doc_term_mat = doc_term_mat.todense()

In [59]:
doc_term_mat = pd.DataFrame(
    doc_term_mat, 
    columns = bow.get_feature_names_out())

In [60]:
doc_term_mat

Unnamed: 0,','cause,'em,-,.,[,],abyss,account,act,...,year,yell,yellow,yes,yesterday,yo,yon,york,young,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
117,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [259]:
# calculate frequency of words listed in the artist df
doc_term_mat.sum().sort_values().tail(10)

feel     91
run      94
hey      97
ah      118
like    120
come    120
ooh     130
know    162
love    222
oh      313
dtype: int64