In [140]:
import requests
import re
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score,KFold,ShuffleSplit
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Requesting the table of content page

First we need to download the page containing all links for songs for an artist

In [2]:
urls = ['https://www.lyrics.com/artist/James-Vincent-McMorrow/1093507',
        'https://www.lyrics.com/artist/Mumford-%26-Sons/1570809',
       'https://www.lyrics.com/artist/Jimi-Hendrix/85934',
       'https://www.lyrics.com/artist/Foo-Fighters/144725']

In [3]:
def get_artist_name(url):
    artist_name =  url.split('/')[4].replace("-", "_")
    artist_name = artist_name.replace('%26', '&')
    return artist_name

#get_artist_name('https://www.lyrics.com/artist/Mumford-%26-Sons/1570809')

In [4]:
def download_artist_page(url):

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    r=requests.get(url, headers=headers)
    text = r.text
    soup = BeautifulSoup(text, 'html')
    artist_name = get_artist_name(url)
    #artist = soup.h1.text.replace(" ", "_")
    #artist_name = str(artist)
    #artist_name = artist_name.replace('%26', '&')
    #print(artist_name)

    with open (str(artist_name) + '_lyrics_page', 'w') as f:
        f.write(text)
    
    """with open ('james_vincent_mcorrow_lyrics_page', 'r') as f:
        my_string = f.read()
    print(my_string)"""
    
    #print(f)
    return soup
    
#download_artist_page('https://www.lyrics.com/artist/Mumford-%26-Sons/1570809')

Then we explore the page with beautiful soup to find all links

In [5]:
def get_links(soup):
    page_links = soup.find_all('a')
    song_links = []
    song_names = []
    for link in page_links: 
        if '/lyric/' in link['href']:
            split_name = link.text.split("[")[0].strip()
            if split_name not in song_names:
                song_links.append(link['href'])
                song_names.append(split_name)
            
    print("Song links found")
    print("___________")
    print("")
    return song_links

#get_links(download_artist_page(url))

We then prepare a function to extract the lyrics out of the lyrics page and clean up any style or tag

In [6]:
def get_lyrics(url):
    soup_lyrics = BeautifulSoup(requests.get(url).text, 'html')
    lyrics = soup_lyrics.find_all(id='lyric-body-text')

    for tag in lyrics:
        for minitag in tag:
            if minitag in tag.find_all('a'):
                del minitag['href']
                del minitag['style']
                minitag.extract()
    return lyrics[0].get_text().replace("\n", " ")

In [17]:
def artist_folder_check(artist):
    if not os.path.exists('lyrics/' + artist):
        print('Creating folder for ' + artist)
        os.makedirs('lyrics/' + artist)
        print('Folder created for ' + artist + '!')
        print()
    else: 
        print(artist + ' folder already exists')
    
artist_folder_check('James_Vincent_McMorrow')

James_Vincent_McMorrow folder already exists


We can then use that function inside a container function that can iterate through all the links returned by our get_links() function and use get_lyrics to write the lyrics into a file

In [38]:
def write_lyrics(song_links):
    print('Starting lyrics downloads')
    
    
    for counter, link in enumerate(song_links):
        url = 'https://www.lyrics.com' + link
        #response = requests.get(url)
        #print(response)
        
        artist = link.split('/')[3].replace("+", "_")
        artist = artist.replace('%26', '&')
        
        # I somehow can't decode the special characters in the song titles, 
        #so I'll just replace some of the common ones
        
        song_title = link.split('/')[4].replace("+", " ")
        song_title = song_title.replace("%27", "'")
        song_title = song_title.replace('%26', "&")
        song_title = song_title.replace('%2C', ',')
        song_title = song_title.replace('%C3%A9', 'é')
        song_title = song_title.replace('%5B', '[')
        song_title = song_title.replace('%5D', ']')
        song_title = song_title.replace('%21', '!')
        song_title = song_title.replace('%28', '(')
        song_title = song_title.replace('%29', ')')
        
        # now we write all the lyrics into an artist txt file! We use the append mode 
        #to add lyrics so the last lyrics don't get overwritten
        #return artist
        

        if not os.path.exists('lyrics/' + artist +'/' + song_title + '.txt'):
            print('Writing: ' + song_title)
                        
            with open(f'lyrics/{artist}/{song_title}.txt', 'w') as f:
                f.write(get_lyrics(url))
            #print(counter, song_title, response.status_code)
        else:
            print('Already exists: ' + song_title)
            
        

#links_list = get_links(soup)
#write_lyrics(links_list)

From there we can create a function to clean up the lyrics documents we just made to prepare them for the bag of words

In [39]:
def clean_up_lyrics(artist):
    
    print('Cleaning up lyrics for ' + artist + '\n')
    
    for fn in os.listdir('lyrics/' + artist +'/'):
        my_lyrics = open('lyrics/' + artist +'/' + fn).read()
        #print('\n' + (fn).upper() + '\n')
        #print(my_lyrics)
        my_lyrics = my_lyrics.split('\n')
        my_lyrics = [s.lower() for s in my_lyrics]

        clean_lyrics = []

        tokenizer = TreebankWordTokenizer()
        lemmatizer = WordNetLemmatizer()

        for doc in my_lyrics:
            #print(doc)
            tokens = tokenizer.tokenize(text=doc)
            clean_doc = " ".join(lemmatizer.lemmatize(token) for token in tokens)
            clean_lyrics.append(clean_doc)
        labels_one_artist = [artist] * len(clean_lyrics)
        global labels
        labels.extend(labels_one_artist)
        global corpus
        corpus.extend(clean_lyrics)  
        print(fn + ' lyrics added to corpus!')

#clean_up_lyrics('James_Vincent_McMorrow')
#print(corpus)

In [40]:
#corpus

That's it! Now we create our execution function so we can launch everything from one cell

In [41]:
labels = []
corpus = []

def run_all(url):
    
    artists = []
    artist = get_artist_name(url)
    print('Now looking at ' + artist)
    
    soup = download_artist_page(url)
    links_list = get_links(soup)
    artist_folder_check(artist)
    print()
    write_lyrics(links_list)
    print()
    clean_up_lyrics(artist)
    artists.append(artist)
    print()
    print()
    print("_____________")
    print()
        
for url in urls:
    run_all(url)

print('Finished!')

Now looking at James_Vincent_McMorrow
Song links found
___________

James_Vincent_McMorrow folder already exists

Starting lyrics downloads
Already exists: I Lie Awake Every Night
Already exists: Alone Together
Already exists: Evil
Already exists: One Thousand Times
Already exists: The Future
Already exists: True Care
Already exists: Thank You
Already exists: Constellations
Already exists: Holding On
Already exists: Bears
Already exists: Pink Salt Lake
Already exists: Bend Your Knees
Already exists: Change of Heart
Already exists: Don't Wait Forever
Already exists: I'm in Love
Already exists: Rising Water
Already exists: Last Story
Already exists: Get Low
Already exists: Killer Whales [Austin, March 2015]
Already exists: Seek Another [Austin, March 2015]
Already exists: Surreal [Barcelona, February 2015]
Already exists: Lost Angles
Already exists: Glacier
Already exists: Higher Love
Already exists: Ghosts
Already exists: Cavalier
Already exists: The Lakes
Already exists: Red Dust
Alrea

To Darkness-Kripa.txt lyrics added to corpus!
The Wild.txt lyrics added to corpus!
Lover of the Light.txt lyrics added to corpus!
Broken Crown.txt lyrics added to corpus!
Days-This Time Tomorrow.txt lyrics added to corpus!
Cold Arms.txt lyrics added to corpus!
I Will Wait.txt lyrics added to corpus!
Hold On to What You Believe [%2A].txt lyrics added to corpus!
Guiding Light.txt lyrics added to corpus!
Where Are You Now%3F.txt lyrics added to corpus!
Snake Eyes.txt lyrics added to corpus!
Whispers in the Dark.txt lyrics added to corpus!
Hot Gates.txt lyrics added to corpus!
42.txt lyrics added to corpus!
Babel.txt lyrics added to corpus!
Tompkins Square Park.txt lyrics added to corpus!
Woman.txt lyrics added to corpus!
Devil's Spoke-Sneh Ko Marg.txt lyrics added to corpus!
Ngamila.txt lyrics added to corpus!
I Was Young When I Left Home.txt lyrics added to corpus!
After the Storm.txt lyrics added to corpus!
Si Tu Veux.txt lyrics added to corpus!
Rose of Sharon.txt lyrics added to corpus

Come On.txt lyrics added to corpus!
Rock Me.txt lyrics added to corpus!
Rock Me Baby.txt lyrics added to corpus!
Angel.txt lyrics added to corpus!
Flashing [Instrumental].txt lyrics added to corpus!
Midnight.txt lyrics added to corpus!
Lullaby for the Summer.txt lyrics added to corpus!
The Star Spangled Banner and Purple Haze.txt lyrics added to corpus!
Sgt Pepper's Lonely Hearts Club Band.txt lyrics added to corpus!
Mr. Bad Luck.txt lyrics added to corpus!
God Save the Queen.txt lyrics added to corpus!
Fire [Live].txt lyrics added to corpus!
Sgt. Pepper's Lonely Heart's Club Band.txt lyrics added to corpus!
Land of 1000 Dances.txt lyrics added to corpus!
%2420 Fine.txt lyrics added to corpus!
Voodoo Child (Slight Return) [%2A].txt lyrics added to corpus!
Tomorrow Never Knows - Featuring Jim Morrison.txt lyrics added to corpus!
House Burning Down.txt lyrics added to corpus!
Crying Blue Rain.txt lyrics added to corpus!
Come On (Let The Good Times Roll).txt lyrics added to corpus!
Wait U

Tomorrow.txt lyrics added to corpus!
White Cliffs of Dover.txt lyrics added to corpus!
Little Miss Lover.txt lyrics added to corpus!
([I Can%E2%80%99t Get No) Satisfaction.txt lyrics added to corpus!
Little Miss Strange.txt lyrics added to corpus!
Foxey Lady [Afternoon Show] [%2A][Version].txt lyrics added to corpus!
Lawdy Miss Clawdy.txt lyrics added to corpus!
Catfish Blues.txt lyrics added to corpus!
Hear My Train a Comin' (Get My Heart Back Together).txt lyrics added to corpus!
Hound Dog Blues.txt lyrics added to corpus!
Come On, Pt. 1 [Alternate Take] [Alternate Take][%23].txt lyrics added to corpus!
Inside Out.txt lyrics added to corpus!
Let Me Go.txt lyrics added to corpus!
Freedom and You.txt lyrics added to corpus!
Born Under a Bad Sign.txt lyrics added to corpus!
Exp.txt lyrics added to corpus!
Purple Haze [Live].txt lyrics added to corpus!
Disguises.txt lyrics added to corpus!
The Things That I Used to Do [%23].txt lyrics added to corpus!
3rd Stone From the Sun.txt lyrics ad

Home.txt lyrics added to corpus!
These Days.txt lyrics added to corpus!
Requiem.txt lyrics added to corpus!
Erase-Replace.txt lyrics added to corpus!
Another Round.txt lyrics added to corpus!
Live-in Skin.txt lyrics added to corpus!
Word Forward [%23].txt lyrics added to corpus!
Resolve [DVD].txt lyrics added to corpus!
T-Shirt.txt lyrics added to corpus!
Arrows.txt lyrics added to corpus!
Congregation.txt lyrics added to corpus!
My Hero.txt lyrics added to corpus!
Let It Die.txt lyrics added to corpus!
FFL.txt lyrics added to corpus!
Tired Of You.txt lyrics added to corpus!
Baker Street.txt lyrics added to corpus!
How I Miss You.txt lyrics added to corpus!
Statues.txt lyrics added to corpus!
More Than A Woman.txt lyrics added to corpus!
You Should Be Dancing.txt lyrics added to corpus!
Free Me.txt lyrics added to corpus!
X-Static.txt lyrics added to corpus!
Up in Arms.txt lyrics added to corpus!
Band On the Run.txt lyrics added to corpus!
World.txt lyrics added to corpus!
Walking Afte

For debugging and clarity I'm putting all the corpus and the labels into a dataframe

In [85]:
data_tuples = list(zip(corpus,labels))
df = pd.DataFrame(data_tuples, columns=['corpus','labels'])

# Despite isna().sum() returning 0s, we have some empty rows.

# This could be cleaned up during the extraction but I was lazy 
# so I'll just delete those rows from the training dataframe

df = df[df.corpus != '']

df.shape

(11287, 2)

# Making predictions


I want to calculate the accuracy of my predictions, so I'm going to train_test_split to create a validation dataset



In [154]:
X = df['corpus']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=200)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8465,), (2822,), (8465,), (2822,))

For debugging and clarity I'm putting all the corpus and the labels into a dataframe

In [115]:
#I want to calculate the accuracy of my predictions, so I'm going to train_test_split to create a validation dataset

X = df['corpus']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10000)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8465,), (2822,), (8465,), (2822,))

In [128]:
# this cell will store the data to pass as argument

chosen_X = X_train
chosen_y = y_train
LR_balanced  = LogisticRegression(max_iter=1000)
LR = LogisticRegression(class_weight='balanced', max_iter=1000)
NB = MultinomialNB()

models = [LR, LR_balanced, NB]

In [160]:
# and this contains our actual function

def fit_model(X, y, model):
    
    stopwords_ = stopwords.words('english')

    steps = [('tf-idf', TfidfVectorizer(stop_words=stopwords_)),
              ('clf', model)
            ]

    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    
    return pipeline

#pipe = fit_model(chosen_X, chosen_y, chosen_model)

In [161]:
def calculate_score(pipeline, X, y):
    y_pred = pipeline.predict(X_test)
    f1 = f1_score(y, y_pred, average='weighted')
    #pipeline.predict_proba(X_test)
    accuracy = pipeline.score(X, y)
    return f1, accuracy

#calculate_score(pipe, X_test, y_test)

We can now calculate the accuracy for all our models

In [162]:
for model in models:
    f1, accuracy = calculate_score(fitted_pipeline, X_test, y_test)
    fitted_pipeline = fit_model(X_train, y_train, model)
    print()
    print(model)
    print("f1 score:", f1)
    print("accuracy:", accuracy)
    print()


LogisticRegression(class_weight='balanced', max_iter=1000)
f1 score: 0.5661531903784556
accuracy: 0.6017009213323884


LogisticRegression(max_iter=1000)
f1 score: 0.6133565010175521
accuracy: 0.6052445074415308


MultinomialNB()
f1 score: 0.6120218463509086
accuracy: 0.6296952515946137



And finally predict the artist for new texts!

In [163]:
def predict_text(pipeline, text):
    pipeline.predict([text])
    pipeline.predict_proba([text])
    
    return pipeline.predict([text])[-1].replace("_", " ")


In [167]:
string = input("Enter your string: ")
print()
print('I think this text is from...')
predict_text(pipe, string)

Enter your string:who shot the sheriff?

I think this text is from...


'Jimi Hendrix'