In [1]:
import requests
import re
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split

# Requesting the table of content page

First we need to download the page containing all links for songs for an artist

In [2]:
urls = ['https://www.lyrics.com/artist/James-Vincent-McMorrow/1093507','https://www.lyrics.com/artist/Mumford-%26-Sons/1570809']

In [3]:
def get_artist_name(url):
    artist_name =  url.split('/')[4].replace("-", "_")
    artist_name = artist_name.replace('%26', '&')
    return artist_name

get_artist_name('https://www.lyrics.com/artist/Mumford-%26-Sons/1570809')

'Mumford_&_Sons'

In [4]:
def download_artist_page(url):

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    r=requests.get(url, headers=headers)
    text = r.text
    soup = BeautifulSoup(text, 'html')
    artist_name = get_artist_name(url)
    #artist = soup.h1.text.replace(" ", "_")
    #artist_name = str(artist)
    #artist_name = artist_name.replace('%26', '&')
    #print(artist_name)

    with open (str(artist_name) + '_lyrics_page', 'w') as f:
        f.write(text)
    
    """with open ('james_vincent_mcorrow_lyrics_page', 'r') as f:
        my_string = f.read()
    print(my_string)"""
    
    #print(f)
    return soup
    
#download_artist_page('https://www.lyrics.com/artist/Mumford-%26-Sons/1570809')

Then we explore the page with beautiful soup to find all links

In [5]:
def get_links(soup):
    page_links = soup.find_all('a')
    song_links = []
    song_names = []
    for link in page_links: 
        if '/lyric/' in link['href']:
            split_name = link.text.split("[")[0].strip()
            if split_name not in song_names:
                song_links.append(link['href'])
                song_names.append(split_name)
            
    print("Song links found")
    print("___________")
    print("")
    return song_links

#get_links(download_artist_page(url))

We then prepare a function to extract the lyrics out of the lyrics page and clean up any style or tag

In [6]:
def get_lyrics(url):
    soup_lyrics = BeautifulSoup(requests.get(url).text, 'html')
    lyrics = soup_lyrics.find_all(id='lyric-body-text')

    for tag in lyrics:
        for minitag in tag:
            if minitag in tag.find_all('a'):
                del minitag['href']
                del minitag['style']
                minitag.extract()
    return lyrics[0].get_text().replace("\n", " ")

We can then use that function inside a container function that can iterate through all the links returned by our get_links() function and use get_lyrics to write the lyrics into a file

In [7]:
def write_lyrics(song_links):
    print('Starting lyrics downloads')
    for counter, link in enumerate(song_links[:6]):
        url = 'https://www.lyrics.com' + link
        response = requests.get(url)
        #print(response)
        
        artist = link.split('/')[3].replace("+", "_")
        artist = artist.replace('%26', '&')
        # I somehow can't decode the special characters in the song titles, 
        #so I'll just replace some of the common ones
        
        song_title = link.split('/')[4].replace("+", " ")
        song_title = song_title.replace("%27", "'")
        song_title = song_title.replace('%26', "&")
        song_title = song_title.replace('%2C', ',')
        song_title = song_title.replace('%C3%A9', 'é')
        song_title = song_title.replace('%5B', '[')
        song_title = song_title.replace('%5D', ']')
        song_title = song_title.replace('%21', '!')
        song_title = song_title.replace('%28', '(')
        song_title = song_title.replace('%29', ')')
        
        # now we write all the lyrics into an artist txt file! We use the append mode 
        #to add lyrics so the last lyrics don't get overwritten
        #return artist
        
        with open(f'lyrics/{artist}.txt', 'a') as f:
            f.write("\n" + get_lyrics(url))
        print(counter, song_title, response.status_code)
        #return artist
        

#links_list = get_links(soup)
#write_lyrics(links_list)

From there we can create a function to clean up the lyrics documents we just made to prepare them for the bag of words

In [8]:
def clean_up_lyrics(artist):
    with open ('lyrics/' + artist +'.txt', 'r') as f:
        my_lyrics = f.read()
    my_lyrics = my_lyrics.split('\n')
    my_lyrics = [s.lower() for s in my_lyrics]

    clean_lyrics = []

    tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()

    for doc in my_lyrics:
        #print(doc)
        tokens = tokenizer.tokenize(text=doc)
        clean_doc = " ".join(lemmatizer.lemmatize(token) for token in tokens)
        clean_lyrics.append(clean_doc)
    labels_one_artist = [artist] * len(clean_lyrics)
    global labels
    labels.extend(labels_one_artist)
    global corpus
    corpus.extend(clean_lyrics)    

#print(corpus)

That's it! Now we create our execution function so we can launch everything from one cell

In [9]:
labels = []
corpus = []

def run_all(url):
    
    artists = []
    artist = get_artist_name(url)
    print('Now looking at ' + artist)
    
    soup = download_artist_page(url)
    links_list = get_links(soup)
    write_lyrics(links_list)
    artists.append(artist)
    clean_up_lyrics(artist)
    print()
    print("_____________")
    print()
        
for url in urls:
    run_all(url)

print('Finished!')

Now looking at James_Vincent_McMorrow
Song links found
___________

Starting lyrics downloads
0 I Lie Awake Every Night 200
1 Alone Together 200
2 Evil 200
3 One Thousand Times 200
4 The Future 200
5 True Care 200

_____________

Now looking at Mumford_&_Sons
Song links found
___________

Starting lyrics downloads
0 Beloved 200
1 Blind Leading the Blind 200
2 Devil in Your Eye 200
3 The Boxer 200
4 42 200
5 Guiding Light 200

_____________

Finished!


For debugging and clarity I'm putting all the corpus and the labels into a dataframe

In [36]:
data_tuples = list(zip(corpus,labels))
df = pd.DataFrame(data_tuples, columns=['corpus','labels'])
#df.head(20)
#df.tail(20)
#df.isna().sum()

I want to calculate the accuracy of my predictions, so I'm going to train_test_split to create a validation dataset

In [12]:
X = df['corpus']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

We can now calculate the accuracy

In [29]:
def calculate_score():
    
    stopwords_ = stopwords.words('english')

    steps = [('tf-idf', TfidfVectorizer(stop_words=stopwords_)),
              ('LR', LogisticRegression(class_weight='balanced'))
            ]

    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    pipeline.predict(X_test)
    #pipeline.predict_proba(X_test)
    return pipeline.score(X_test, y_test)
    
    #return pipeline.predict([text])[-1]

calculate_score()

0.7837837837837838

And finally predict the artist for new texts!

In [None]:
from nltk.corpus import stopwords

def predict_text(text):
    
    stopwords_ = stopwords.words('english')

    steps = [('tf-idf', TfidfVectorizer(stop_words=stopwords_)),
              ('LR', LogisticRegression(class_weight='balanced'))
            ]

    pipeline = Pipeline(steps)
    pipeline.fit(corpus, labels)
    pipeline.predict([text])
    pipeline.predict_proba([text])
    
    return pipeline.predict([text])[-1]

In [None]:
#string = 'your love is gold'
string = input("Enter your string:")

In [None]:
print('Mmmmmh I think this text is from')
predict_text(string)