In [1]:
import glob
import pickle
import re
import time
from os import makedirs
from os.path import exists

import joblib
import numpy as np
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz, process
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from imblearn.over_sampling import RandomOverSampler
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm



## Step 1 : 
- Go to the page listing all songs of your favourite artist on lyrics.com.
- Copy the URL
- Use the requests module to download that page
- Examine the HTML code and look for links to songs
- Extract all links using Regular Expressions or BeautifulSoup
- Use the requests module to download each song page containing lyrics (skipping redundant song names)
- Save the pages as text files in the artist folder, one song page per text file

In [3]:
# function : get song urls (no de-duplication)

def get_song_urls(artist, artist_url):
    ''' Collect the URL of every song link found on an artist page.

        input : artist str (not used for the lookup; kept so that all
                            get_song_urls* functions share one signature)
                artist_url str, URL of the artist page
        return : song_urls list of absolute song URLs, duplicates included;
                 empty list when the page has no "tdata-ext" song table
        raises : requests.HTTPError when the artist page answers with an error status '''

    # a timeout keeps a stalled connection from hanging the notebook forever
    request_artist_url = requests.get(artist_url, timeout=30)
    # fail loudly on 4xx/5xx instead of parsing an error page
    request_artist_url.raise_for_status()
    prefix = 'https://www.lyrics.com'
    lyric_soup = BeautifulSoup(request_artist_url.text, 'html.parser')
    song_table = lyric_soup.find('div', class_="tdata-ext")
    if song_table is None:
        return []
    song_urls = []
    for album in song_table.find_all('tbody'):
        # href=True skips anchors that carry no link target
        song_urls += [prefix + link['href'] for link in album.find_all('a', href=True)]
    return song_urls


In [4]:
# function : get song urls, dropping redundant ones with an exact song name match

def get_song_urls_dr(artist, artist_url):
    ''' Collect song URLs from an artist page, keeping one URL per distinct title.

        input : artist str (not used for the lookup; kept so that all
                            get_song_urls* functions share one signature)
                artist_url str, URL of the artist page
        return : song_urls list of absolute song URLs, one per distinct link text
                 (the first occurrence wins); empty list when the page has no
                 "tdata-ext" song table
        raises : requests.HTTPError when the artist page answers with an error status '''

    # a timeout keeps a stalled connection from hanging the notebook forever
    request_artist_url = requests.get(artist_url, timeout=30)
    request_artist_url.raise_for_status()
    prefix = 'https://www.lyrics.com'
    lyric_soup = BeautifulSoup(request_artist_url.text, 'html.parser')
    song_table = lyric_soup.find('div', class_="tdata-ext")
    if song_table is None:
        return []
    song_urls = []
    seen_titles = set()
    for album in song_table.find_all('tbody'):
        # parse the links once per album instead of once per loop iteration
        for link in album.find_all('a', href=True):
            title = link.text
            if title not in seen_titles:
                seen_titles.add(title)
                # append only this song's URL (not every link of the album)
                song_urls.append(prefix + link['href'])
    return song_urls


In [19]:
# function : get song urls, dropping redundant ones with a fuzzywuzzy title match

def get_song_urls_drf(artist, artist_url, threshold=90):
    ''' Collect song URLs from an artist page, dropping near-duplicate titles.

        A link is kept only if its (lower-cased) title scores below `threshold`
        with fuzz.ratio against every title kept so far. Links with an empty
        title or an exact repeat of a kept title are skipped silently; links
        rejected by the fuzzy comparison are printed.

        input : artist str (not used for the lookup; kept so that all
                            get_song_urls* functions share one signature)
                artist_url str, URL of the artist page
                threshold int, fuzz.ratio score (0-100) from which two titles
                               count as the same song; default 90
        return : song_urls list of absolute song URLs; empty list when the page
                 has no "tdata-ext" song table
        raises : requests.HTTPError when the artist page answers with an error status '''

    # a timeout keeps a stalled connection from hanging the notebook forever
    request_artist_url = requests.get(artist_url, timeout=30)
    request_artist_url.raise_for_status()
    prefix = 'https://www.lyrics.com'
    lyric_soup = BeautifulSoup(request_artist_url.text, 'html.parser')
    song_table = lyric_soup.find('div', class_="tdata-ext")
    if song_table is None:
        return []
    song_urls = []
    song_titles = []
    for album in song_table.find_all('tbody'):
        # parse the links once per album instead of twice per loop iteration
        for link in album.find_all('a', href=True):
            title = link.text
            # first layer filter to save time: empty titles and exact repeats
            if title == '' or title in song_titles:
                continue
            title_lower = title.lower()
            best_match = 0
            best_match_title = ''
            for given_title in song_titles:
                score = fuzz.ratio(title_lower, given_title.lower())
                if score > best_match:
                    best_match = score
                    best_match_title = given_title
            if best_match < threshold:
                song_titles.append(title)
                song_urls.append(prefix + link['href'])
            else:
                print(f'fuzzy title match : {title} vs {best_match_title}') # for testing removed titles
    return song_urls

#a = get_song_urls_drf('simon_garfunkel', 'https://www.lyrics.com/artist/Simon-%26-Garfunkel/5431') 

In [18]:
# function : save song pages to txt files

def save_song_page(artist, song_urls, path, delay=0.5):
    ''' Download every song page and store its HTML as a text file.

        input : artist str, used as folder name and file name prefix
                song_urls list containing song urls of an artist
                path str, parent folder of the artist folder
                delay float, seconds to wait before each request; default 0.5
        do : write {path}/{artist}/{artist}_song_{i}.txt for the i-th url,
             creating the artist folder if needed '''

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    makedirs(f'{path}/{artist}', exist_ok=True)
    for i, song_url in enumerate(tqdm(song_urls)):
        time.sleep(delay)  # pause between requests
        one_song_request = requests.get(song_url, timeout=30)
        # the with-block closes the file even if the write fails
        with open(f'{path}/{artist}/{artist}_song_{i}.txt', 'w') as song_file:
            song_file.write(one_song_request.text)


In [19]:
# function : strip song page and save lyrics to txt file

def save_lyric_files(artist, path):
    ''' Extract the lyric text from every saved song page of an artist.

        input : artist str, folder name and file name prefix
                path str, parent folder of the artist folder
        do : for each {path}/{artist}/{artist}_song_*.txt (in sorted file name
             order) that contains a <pre> element, write the text of the first
             <pre> to {path}/{artist}/{artist}_lyric_{i}.txt, where i is the
             position of the page in that sorted list; pages without <pre>
             are skipped '''

    artist_dir = f'{path}/{artist}'
    song_files = sorted(glob.glob(f'{artist_dir}/{artist}_song_*.txt'))
    for i, song_file in enumerate(tqdm(song_files)):
        with open(song_file, 'r') as page_file:
            text = page_file.read()
        # text is the saved page content, equivalent to requests' response.text
        lyric_soup = BeautifulSoup(text, 'html.parser')
        lyric_body = lyric_soup.find('pre')
        if lyric_body is not None:
            # write next to the song pages (not relative to the working directory);
            # mode 'w' so that re-running does not append the lyric a second time
            with open(f'{artist_dir}/{artist}_lyric_{i}.txt', 'w') as lyric_file:
                lyric_file.write(lyric_body.text)


In [None]:
# Scrape, store and strip the lyrics for both artists, one artist at a time.

artists = ['simon_garfunkel', 'queen']
artist_urls = ['https://www.lyrics.com/artist/Simon-%26-Garfunkel/5431', 'https://www.lyrics.com/artist/Queen/5205']
path = '/Users/jinglin/Documents/spiced_projects/unsupervised-lemon-student-code/week_04/project'

for artist, artist_url in zip(artists, artist_urls):
    # 1. song urls with near-duplicate titles removed; report how many remain
    song_urls = get_song_urls_drf(artist, artist_url)
    print(len(song_urls))

    # 2. download each song page into the artist folder
    save_song_page(artist, song_urls, path)

    # 3. extract the lyric text from the downloaded pages
    save_lyric_files(artist, path)

# simon_garfunkel : 1875 urls without dropping, 283 after dropping exact title matches, 204 after dropping fuzzywuzzy matches
# queen : over 3800 urls before dropping, 578 after
# for some of these 204 and 578 pages the html lyric body (pre) is None and the page is skipped -> in the end 738 lyrics for the two artists together

## step 2: 
- Create corpus and labels from lyrics txt files of both artists
    - X : corpus, vector, each value = string one song
    - y : labels, vector, each value = one artist name
- Train-test split first! Otherwise information leaks from the test data into training
- Deal with class imbalance on the training data only
    - (this step does not belong to the pipeline; do not apply it to the test data)
- Feature Engineer Pipeline 
    - Cleaning: re
    - vectorization : 
        - CountVectorizer, TfidfVectorizer, or doc2vec vectorizer
        - can filter with English stopwords, maximal document frequency, minimal document frequency (appearance over all songs)
- Machine Learning: train, CV, hyperparameter tuning cycle
- Test check
- Predict an external x 

In [3]:
# function create corpus and labels

def get_corpus_labels(artists, path):
    '''Read all lyric files of the given artists into parallel lists.

       input : artists str-list, artist folder names / file name prefixes
               path str, path to the artist folders containing lyric txt files
       return : corpus list, each entry a str with the lyrics of one song
                labels list, each entry the artist name (str) of that song
                labels_n list, each entry the position (int) of that artist in `artists`
       Files are read per artist in sorted file name order; an artist without
       matching {artist}_lyric_*.txt files contributes nothing.'''
    corpus = []
    labels = []
    labels_n = []
    for i_artist, artist in enumerate(artists):
        path_filenames = f'{path}/{artist}/{artist}_lyric_*.txt'
        for song_file in sorted(glob.glob(path_filenames)):
            # the with-block closes each file right after reading
            with open(song_file, 'r') as lyric_file:
                corpus.append(lyric_file.read())
            labels.append(artist) # string labels
            labels_n.append(i_artist) # 0,1 labels
    return corpus, labels, labels_n


In [4]:
# Build the corpus and both label lists from the lyric files on disk.
artists = ['simon_garfunkel', 'queen']
path = '/Users/jinglin/Documents/spiced_projects/unsupervised-lemon-student-code/week_04/project'
corpus, labels, labels_n = get_corpus_labels(artists, path)
# sanity check: sizes, container types and the first label in both encodings
len(corpus), len(labels), type(corpus), type(labels), labels[0], labels_n[0]

(738, 738, list, list, 'simon_garfunkel', 0)

In [5]:
# One-column frames: the lyric text as feature, the artist name as target.
X = pd.DataFrame({'lyric' : corpus})
y = pd.DataFrame({'artist' : labels})

# Hold out 25% of the songs for testing; fixed seed for a repeatable split.
split = train_test_split(X, y, test_size=0.25, random_state=0)
Xtrain, Xtest, ytrain, ytest = split
print(*(part.shape for part in split))
# describe() shows the majority class and its count in each target split
for target in (ytrain, ytest):
    print(target.describe())

# class imbalance 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(553, 1) (185, 1) (553, 1) (185, 1)
       artist
count     553
unique      2
top     queen
freq      402
       artist
count     185
unique      2
top     queen
freq      142


In [6]:
# Reduce the class imbalance of the training split by random oversampling.
# The training split holds 151 simon_garfunkel and 402 queen songs;
# simon_garfunkel is raised to 250 samples.
oversample_targets = {'simon_garfunkel' : 250}
ros = RandomOverSampler(sampling_strategy=oversample_targets, random_state=0)
X_ros, y_ros = ros.fit_resample(Xtrain, ytrain)
print(X_ros.shape, y_ros.shape)
print(y_ros.describe())

(652, 1) (652, 1)
       artist
count     652
unique      2
top     queen
freq      402


In [7]:
# cleaning : TODO add e.g. stemming
def lyric_cleaning(X):
    ''' Lower-case lyric text and replace newlines with spaces.

        input : X str, list of str, pandas Series of str (or any other iterable of str)
        return : a cleaned str when X is a str, otherwise a list of cleaned str

        Every input kind gets the same cleaning, so a lyric is treated identically
        whether it arrives alone, in a list or in a Series. '''
    if isinstance(X, str): # e.g. "hello darkness my old friend"
        return re.sub(r'\n', ' ', X.lower())
    # e.g. Xtrain['lyric'].tolist() or Xtrain['lyric']
    return [re.sub(r'\n', ' ', x.lower()) for x in X]

In [8]:
# Two-stage pipeline: feature engineering (cleaning + tf-idf), then the classifier.
cleaner = FunctionTransformer(lyric_cleaning)
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.8,
    min_df=0.01,
    ngram_range=(1,2),
)
fe_pipe = make_pipeline(cleaner, tfidf)

classifier = LogisticRegression(random_state=0)
fe_ml_pipe = Pipeline([('fe', fe_pipe), ('logreg', classifier)])


In [9]:
# Fit on the training lyrics, then report accuracy on the train and test splits.
# NOTE(review): fitting uses the original Xtrain/ytrain; the oversampled
# X_ros/y_ros built earlier are not used here - confirm this is intended.
train_lyrics = Xtrain['lyric'].tolist()
test_lyrics = Xtest['lyric'].tolist()
fe_ml_pipe.fit(train_lyrics, ytrain['artist'].tolist())
train_score = fe_ml_pipe.score(train_lyrics, np.ravel(ytrain))
test_score = fe_ml_pipe.score(test_lyrics, np.ravel(ytest))
print(train_score, test_score)

0.9240506329113924 0.9135135135135135


In [28]:
# Persist the fitted pipeline with joblib and load it back,
# so it can be reused later without training again.
mod_file = 'lyric.mod'
joblib.dump(fe_ml_pipe, mod_file)
model = joblib.load(mod_file)

In [15]:
# Same joblib round trip, this time using a .pkl file name.
pkl_file = 'lyric.pkl'
joblib.dump(fe_ml_pipe, pkl_file)
model_pp = joblib.load(pkl_file)

In [16]:
# Round trip through the standard-library pickle module.
# The with-blocks close the file handles, so the dump is fully flushed
# to disk before the file is read back.
with open('lyric.sav', 'wb') as model_file:
    pickle.dump(fe_ml_pipe, model_file)
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
with open('lyric.sav', 'rb') as model_file:
    model_p = pickle.load(model_file)

In [17]:
# Try external examples: each lyric snippet is paired with the reloaded model
# that should classify it (all three were loaded from the same fitted pipeline).
examples = [
    (model_p, "hello darkness my old friend "),
    (model_pp, "hello darkness my old friend I've come to talk to you again"),
    (model, "And here's to you, "),
    (model, "And here's to you, Mrs. Robinson "),
    (model, "And here's to you, Mrs. Robinson Jesus loves you more than you will know Whoa, whoa, whoa"),
    (model, "And here's to you, Mrs. Robinson Jesus loves you more than you will know Whoa, whoa, whoaGod bless you, please, Mrs. Robinson Heaven holds a place for those who pray Hey, hey, hey Hey, hey, hey"),
]
for fitted_model, userinput in examples:
    print(fitted_model.predict([userinput]))


['queen']
['queen']
['queen']
['simon_garfunkel']
['queen']
['simon_garfunkel']
