In [1]:
import requests
import re
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split

# Requesting the table of content page

First we need to download the page containing all links for songs for an artist

In [2]:
urls = ['https://www.lyrics.com/artist/James-Vincent-McMorrow/1093507',
        'https://www.lyrics.com/artist/Mumford-%26-Sons/1570809',
        'https://www.lyrics.com/artist/Foo-Fighters',
        'https://www.lyrics.com/artist/Jimi-Hendrix',
       ]

In [3]:
def get_artist_name(url):
    artist_name =  url.split('/')[4].replace("-", "_")
    artist_name = artist_name.replace('%26', '&')
    return artist_name

get_artist_name('https://www.lyrics.com/artist/Mumford-%26-Sons/1570809')

'Mumford_&_Sons'

In [4]:
def download_artist_page(url):

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    r=requests.get(url, headers=headers)
    text = r.text
    soup = BeautifulSoup(text, 'html')
    artist_name = get_artist_name(url)
    #artist = soup.h1.text.replace(" ", "_")
    #artist_name = str(artist)
    #artist_name = artist_name.replace('%26', '&')
    #print(artist_name)

    with open (str(artist_name) + '_lyrics_page', 'w') as f:
        f.write(text)
    
    """with open ('james_vincent_mcorrow_lyrics_page', 'r') as f:
        my_string = f.read()
    print(my_string)"""
    
    #print(f)
    return soup
    
#download_artist_page('https://www.lyrics.com/artist/Mumford-%26-Sons/1570809')

Then we explore the page with beautiful soup to find all links

In [5]:
def get_links(soup):
    page_links = soup.find_all('a')
    song_links = []
    song_names = []
    for link in page_links: 
        if '/lyric/' in link['href']:
            split_name = link.text.split("[")[0].strip()
            if split_name not in song_names:
                song_links.append(link['href'])
                song_names.append(split_name)
            
    print("Song links found")
    print("___________")
    print("")
    return song_links

#get_links(download_artist_page(url))

# Getting song lyrics

Now that we have a list of links, we can create a function to crawl all these links and clean up any style or tag so that only the lyrics remain.

In [6]:
def get_lyrics(url):
    soup_lyrics = BeautifulSoup(requests.get(url).text, 'html')
    lyrics = soup_lyrics.find_all(id='lyric-body-text')
    
    # there are a lot of styles and links in the body which we want to get rid of, 
    # we'll do it with the loop below

    for tag in lyrics:
        for minitag in tag:
            if minitag in tag.find_all('a'):
                del minitag['href']
                del minitag['style']
                minitag.extract()
    return lyrics[0].get_text().replace("\n", " ")

We can then use that function inside a container function that can iterate through all the links returned by our get_links() function and use get_lyrics to write the lyrics into a file

In [7]:
def write_lyrics(song_links):
    print('Starting lyrics downloads')
    for counter, link in enumerate(song_links):
        url = 'https://www.lyrics.com' + link
        response = requests.get(url)
        
        # I want to be able to grab the name of the artist in this function so 
        # I'll get it from the URL
        
        artist = link.split('/')[3].replace("+", "_")
        artist = artist.replace('%26', '&')
        
        # I somehow can't decode the special characters in the song titles, 
        # so I'll just replace some of the common ones
        
        song_title = link.split('/')[4].replace("+", " ")
        song_title = song_title.replace("%27", "'")
        song_title = song_title.replace('%26', "&")
        song_title = song_title.replace('%2C', ',')
        song_title = song_title.replace('%C3%A9', 'é')
        song_title = song_title.replace('%5B', '[')
        song_title = song_title.replace('%5D', ']')
        song_title = song_title.replace('%21', '!')
        song_title = song_title.replace('%28', '(')
        song_title = song_title.replace('%29', ')')
        
        # now we write all the lyrics into an artist txt file! We use the append mode 
        # to add lyrics so the last lyrics don't get overwritten
        
        with open(f'lyrics/{artist}.txt', 'a') as f:
            f.write("\n" + get_lyrics(url))
        print(counter, song_title, response.status_code)
        #return artist
        
#links_list = get_links(soup)
#write_lyrics(links_list)

From there we can create a function to clean up the .txt documents we just made to prepare them for the bag of words

In [8]:
def clean_up_lyrics(artist):
    with open ('lyrics/' + artist +'.txt', 'r') as f:
        my_lyrics = f.read()
    my_lyrics = my_lyrics.split('\n')
    my_lyrics = [s.lower() for s in my_lyrics]

    clean_lyrics = []

    tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()

    for doc in my_lyrics:
        #print(doc)
        tokens = tokenizer.tokenize(text=doc)
        clean_doc = " ".join(lemmatizer.lemmatize(token) for token in tokens)
        clean_lyrics.append(clean_doc)
    labels_one_artist = [artist] * len(clean_lyrics)
    global labels
    labels.extend(labels_one_artist)
    global corpus
    corpus.extend(clean_lyrics)    

#print(corpus)

That's it! Now we create our execution function so we can launch everything from one cell

In [9]:
labels = []
corpus = []

def run_all(url):
    
    artists = []
    artist = get_artist_name(url)
    print('Now looking at ' + artist)
    
    soup = download_artist_page(url)
    links_list = get_links(soup)
    write_lyrics(links_list)
    artists.append(artist)
    clean_up_lyrics(artist)
    print()
    print("_____________")
    print()
        
for url in urls:
    run_all(url)

print('Finished!')

Now looking at James_Vincent_McMorrow
Song links found
___________

Starting lyrics downloads
0 I Lie Awake Every Night 200
1 Alone Together 200
2 Evil 200
3 One Thousand Times 200
4 The Future 200
5 True Care 200
6 Thank You 200
7 Constellations 200
8 Holding On 200
9 Bears 200
10 Pink Salt Lake 200
11 Bend Your Knees 200
12 Change of Heart 200
13 Don't Wait Forever 200
14 I'm in Love 200
15 Rising Water 200
16 Last Story 200
17 Get Low 200
18 Killer Whales [Austin, March 2015] 200
19 Seek Another [Austin, March 2015] 200
20 Surreal [Barcelona, February 2015] 200
21 Lost Angles 200
22 Glacier 200
23 Higher Love 200
24 Ghosts 200
25 Cavalier 200
26 The Lakes 200
27 Red Dust 200
28 Gold 200
29 All Points 200
30 Looking Out 200
31 Repeating 200
32 Post Tropical 200
33 Outside, Digging 200
34 Red Dust (Acoustic) 200
35 Hear the Noise That Moves So Soft and Low 200
36 From the Woods!! 200
37 We Don't Eat 200
38 Wicked Game [Live At Killkenny Arts Festival, Ireland-2011] 200
39 Shells of Si

19 Hey Joe [Live] 200
20 Star Spangled Banner [Live] 200
21 Disguises 200
22 Message to Love 200
23 Tax Free 200
24 Hear My Train A-Comin' 200
25 I Don't Live Today 200
26 Foxey Lady [Afternoon Show] [%2A][Version] 200
27 Earth Blues 200
28 Somewhere 200
29 Baby Let Me Move You 200
30 Let Me Move You 200
31 Isabella 200
32 Izabella 200
33 Easy Blues 200
34 Crash Landing 200
35 Inside Out 200
36 Hey Gypsy Boy 200
37 Mojo Man 200
38 Villanova Junction Blues 200
39 Villanova Junction 200
40 Can You See Me 200
41 Stone Free 200
42 Shake [Remastered Mono] 200
43 Satisfaction (I Can't Get No) [Remastered Mono] 200
44 Go to the Mirror 200
45 White Cliffs of Dover 200
46 Sunshine Of Your Love 200
47 Tomorrow Never Knows 200
48 Hear My Train A Comin' 200
49 Killing Floor 200
50 Manic Depression 200
51 Little Wing 200
52 Spanish Castle Magic 200
53 Are You Experienced [%2A] 200
54 Voodoo Child (Slight Return) [%2A] 200
55 Catfish Blues 200
56 Little Miss Lover 200
57 Love Or Confusion 200
58 Wai

For debugging and clarity I'm putting all the corpus and the labels into a dataframe

In [10]:
data_tuples = list(zip(corpus,labels))
df = pd.DataFrame(data_tuples, columns=['corpus','labels'])

# Despite isna().sum() returning 0s, we have some empty rows.

# This could be cleaned up during the extraction but I was lazy 
# so I'll just delete those rows from the training dataframe

df = df[df.corpus != '']

#df.head()
#df.tail()

# Making predictions

I want to calculate the accuracy of my predictions, so I'm going to train_test_split to create a validation dataset

In [11]:
X = df['corpus']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8769,), (2923,), (8769,), (2923,))

In [12]:
y_train.value_counts()

Jimi_Hendrix              3164
Foo_Fighters              2962
Mumford_&_Sons            1456
James_Vincent_McMorrow    1187
Name: labels, dtype: int64

In [None]:
"""X_copy = X
y_copy = y"""

In [13]:
"""ros = RandomOverSampler(random_state=10, sampling_strategy={1: 20000})
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
X_test_ros, y_test_ros = ros.fit_resample(X_test, y_test)
X_copy_ros, y_copy_ros = ros.fit_resample(X_copy, y_copy)"""

In [None]:
"""y_train_ros.value_counts()"""

We're going to need the steps of fitting the model for both calculating scores and making predictions, so I'll put my pipeline and pipeline fitting into a function where I can easily adjust the training data and the model chosen

In [14]:
# this cell will store the data to pass as argument

chosen_X = X_train
chosen_y = y_train
chosen_model = LogisticRegression(class_weight='balanced')

In [15]:
# and this contains our actual function

def fit_model(X, y, model):
    
    stopwords_ = stopwords.words('english')

    steps = [('tf-idf', TfidfVectorizer(stop_words=stopwords_)),
              ('clf', model)
            ]

    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    
    return pipeline

pipe = fit_model(chosen_X, chosen_y, chosen_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Now we can use this pipe to calculate the score

In [16]:
def calculate_score(pipeline):
    pipeline.predict(X_test)
    #pipeline.predict_proba(X_test)
    return pipeline.score(X_test, y_test)

calculate_score(pipe)


0.633937735203558

And finally predict the artist for new texts!

In [17]:
def predict_text(pipeline, text):
    pipeline.predict([text])
    pipeline.predict_proba([text])
    
    return pipeline.predict([text])[-1].replace("_", " ")

And here we are. We ask the user for an input and spit back the prediction based on that string and the chosen model

In [19]:
string = input("Enter your string:")
print()
print('I think this text is from...')
predict_text(pipe, string)

Enter your string:don't be a pretender

I think this text is from...


'Foo Fighters'