## Business Goal:

Build a text classification model on song lyrics. The task is to predict the artist from a piece of text. To train such a model, you first need to collect your own lyrics dataset:

1. Download an HTML page with links to songs.

2. Extract hyperlinks of song pages.

3. Download and extract the song lyrics.

4. Vectorize the text using the Bag Of Words method.

5. Train a classification model that predicts the artist from a piece of text.

6. Refactor the code into functions.

7. Write a simple command-line interface for the program.

In [None]:
import os
import requests 
from requests.exceptions import HTTPError, ConnectionError
import re
from pathlib import PurePosixPath
from urllib.parse import unquote, urlparse
import logging

import pickle

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): presumably meant to hide the InsecureRequestWarning caused by
# the verify=False requests further down -- confirm, since this also hides
# all other warnings.
warnings.simplefilter("ignore")

# Module-level logger used by the scraping helpers.
_log = logging.getLogger(__name__)

# Directory the downloaded song pages are written to; abspath('') resolves
# to the current working directory.
CURR_DIR = os.path.abspath('')

# lyrics.com artist pages that list the songs to scrape.
url_queen = 'https://www.lyrics.com/artist/Queen'
url_pixies = 'https://www.lyrics.com/artist/Pixies/5149'

#### request_url function.

A custom function is used to request the pages, instead of calling the Requests module directly, so that exceptions commonly raised when trying to connect to servers can be caught. The Requests library defines many more exceptions; only the basic ones are handled here.

In [None]:
def request_url(url):
    """Download ``url`` and return the response body as text.

    A copy of the body is also written to ``lyrics_site.html`` in the current
    working directory. A failure to write that copy is logged but does not
    prevent the text from being returned.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        HTTPError: If the server answers with a 4xx/5xx status code.
        ConnectionError: If the connection to the server fails.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # It is kept to preserve the existing behaviour, but should be dropped
        # if the site's certificate validates in your environment.
        response = requests.get(url, verify=False)
        # requests only raises HTTPError when asked to check the status code.
        response.raise_for_status()
    except HTTPError as http_err:
        _log.error('HTTP error occurred: %s', http_err)
        raise
    except ConnectionError as con_err:
        _log.error('ConnectionError occurred: %s', con_err)
        raise

    page_text = response.text
    try:
        with open('lyrics_site.html', "w", encoding='utf8') as file:
            file.write(page_text)
    except IOError as e:
        # Saving the local copy is best-effort; the caller still gets the text.
        _log.error("I/O error(%s): %s", e.errno, e.strerror)

    return page_text

In [None]:
def find_lyrics_url(url):
    """Collect the song-page links found on an artist's lyrics page.

    The page at ``url`` is fetched with ``request_url`` and every
    ``href="/lyric..."`` attribute in it is turned into an absolute
    ``https://www.lyrics.com/lyric...`` URL.

    Args:
        url: Address of the artist page that lists the songs.

    Returns:
        A list of absolute song URLs, in the order they appear on the page
        (duplicates are kept).
    """
    resp_text = request_url(url)
    # The group captures everything after "/lyric" up to the closing quote.
    url_pattern = r'href="/lyric(.+?)"'
    return ['https://www.lyrics.com/lyric' + href
            for href in re.findall(url_pattern, resp_text)]

In [None]:
def extract_songs_html(url, max_songs=100):
    """Download the song pages linked from an artist page and save them to disk.

    The song links are obtained with ``find_lyrics_url``. Each page is written
    as raw HTML to ``<CURR_DIR>/<last URL segment>.txt``. Songs whose URLs end
    in the same segment overwrite each other.

    Args:
        url: Address of the artist page that lists the songs.
        max_songs: Maximum number of song pages to download, taken from the
            start of the link list. ``None`` downloads every link.

    Returns:
        A list with the path of the file written for each downloaded song.
    """
    url_list = find_lyrics_url(url)
    # Make sure the target directory exists once, before any file is written.
    os.makedirs(CURR_DIR, exist_ok=True)

    paths_list = []
    for song_url in url_list[:max_songs]:
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept to preserve the existing behaviour.
        song = requests.get(song_url, verify=False).text
        # Use the last path segment of the URL as the file name.
        url_filename = song_url.rstrip('/').rsplit('/', 1)[-1]
        path = os.path.join(CURR_DIR, '{0}.txt'.format(url_filename))
        with open(path, 'w', encoding='utf8') as file:
            file.write(song)
        paths_list.append(path)
    return paths_list

In [None]:
def extract_lyrics_from_html(url, paths=None):
    """Build a DataFrame of lyrics and artist names from saved song pages.

    Each file is parsed with BeautifulSoup. The artist is read from the
    element with class ``lyric-artist`` and the lyrics from the element with
    class ``lyric-body``. In the lyrics, every run of non-word characters is
    replaced by a single space and the text is lower-cased.

    Args:
        url: Address of the artist page. It is only used when ``paths`` is
            not given, in which case the song pages are downloaded with
            ``extract_songs_html(url)``.
        paths: Optional list of paths to already-downloaded song pages. Pass
            it to avoid downloading the pages again.

    Returns:
        A ``pandas.DataFrame`` with the columns ``lyrics`` and ``artist``, one
        row per song page in which both elements were found.
    """
    if paths is None:
        paths = extract_songs_html(url)

    data_lyrics = []
    for path in paths:
        if not path.endswith('txt'):
            continue
        with open(path, encoding="utf8", errors='ignore') as file:
            # NOTE(review): no parser is named, so bs4 picks whichever one is
            # installed; pass features=... if reproducible parsing matters.
            soup = BeautifulSoup(file.read())
        artist_tag = soup.find(attrs={'class': 'lyric-artist'})
        lyric_tag = soup.find(attrs={'class': 'lyric-body'})
        if artist_tag is None or lyric_tag is None:
            # Not a lyrics page (e.g. an error page); skip it instead of crashing.
            _log.warning('No artist/lyrics found in %s; skipping.', path)
            continue
        artist = artist_tag.text.replace('\nBuy This Song\n\n', ' ')
        lyrics = re.sub(r'[^\w]+', ' ', lyric_tag.text).lower()
        data_lyrics.append((lyrics, artist))

    return pd.DataFrame(data_lyrics, columns=['lyrics', 'artist'])

In [None]:
# Download the Queen song pages and keep the paths of the saved files.
paths_queen = extract_songs_html(url_queen)
# Build the lyrics/artist DataFrame.
# NOTE(review): extract_lyrics_from_html(url) calls extract_songs_html(url)
# itself, so the song pages are downloaded a second time here.
data_queen = extract_lyrics_from_html(url_queen)
# Show the DataFrame as the cell output.
data_queen

##### Save the dataframe with pickle, for future use.

In [None]:
# Persist the Queen DataFrame; the context manager makes sure the file is
# flushed and closed even if pickling fails.
with open('data_queen.pkl', 'wb') as pkl_file:
    pickle.dump(data_queen, pkl_file)

In [None]:
# Download the Pixies song pages and keep the paths of the saved files.
paths_pixies = extract_songs_html(url_pixies)
# Build the lyrics/artist DataFrame.
# NOTE(review): extract_lyrics_from_html(url) calls extract_songs_html(url)
# itself, so the song pages are downloaded a second time here.
data_pixies = extract_lyrics_from_html(url_pixies)
# Show the DataFrame as the cell output.
data_pixies

In [None]:
# Persist the Pixies DataFrame; the context manager makes sure the file is
# flushed and closed even if pickling fails.
with open('data_pixies.pkl', 'wb') as pkl_file:
    pickle.dump(data_pixies, pkl_file)