# Text classification: Predicting the correct artist (out of 2) from song lyrics using ML

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import spacy

## 1. Business goal
Build a text classification model to predict the artist from a piece of text.

---

## 2. Get data

### Get links to all individual lyrics from a website for artist 1 (Maroon 5) and artist 2 (Coldplay)

In [2]:
def get_song_links(artist_name, number):  
    """
    Given some artist (string) and number (int) of pages with lyrics on metrolyrics.com, 
    collect links for all the artist's songs and return them.
    """
    links = []
    artist = artist_name.replace(' ', '-').lower()
    for i in range(1,number+1):
        url = 'https://www.metrolyrics.com/' + artist + '-alpage-' + str(i) + '.html'
        response = requests.get(url)
        if response.status_code != 200:
            print('Sorry, try again.')
        soup = BeautifulSoup(markup=response.text)
        for element in soup.find_all('a', attrs = {'class' : 'title'}):
            links.append(element.get('href'))
    return links

In [3]:
maroon = get_song_links('Maroon 5', 3)

In [4]:
coldplay = get_song_links('Coldplay', 4)

In [5]:
#maroon

In [6]:
#coldplay

### Get rid of duplicate URLS (by turning list into data frame, extract titles and delete duplicate rows)

In [7]:
def remove_duplicate_links (links, num):
    """
    Given a list of links from a website ending on ".com", 
    the function turns it into a DataFrame, extracts titles and deletes rows with duplicate titles based on a number (num) of identical initial characters.
    """
    df = pd.DataFrame(data=links, columns = ['url'])
    for i in df:
        df[['url2','title']] = df['url'].str.split(".com/",expand=True)
        df['title_short'] = [x[:num] for x in df['title']]
    df.drop_duplicates(subset=['title_short'], keep='first', inplace=True)
    return df

#### Maroon 5

In [8]:
maroon_df = remove_duplicate_links (maroon, 4)

In [9]:
maroon_df.shape

(130, 4)

##### Remove other duplicate after visual inspection

In [10]:
maroon_df = maroon_df[maroon_df['title_short'] != 'rag-'].copy()

In [11]:
maroon_df = maroon_df[maroon_df['title_short'] != 'is-t'].copy()

In [12]:
maroon_df.shape

(128, 4)

In [13]:
maroon_df.head(3)

Unnamed: 0,url,url2,title,title_short
0,http://www.metrolyrics.com/accidentally-in-lov...,http://www.metrolyrics,accidentally-in-love-lyrics-maroon-5.html,acci
1,http://www.metrolyrics.com/angel-in-blue-jeans...,http://www.metrolyrics,angel-in-blue-jeans-lyrics-maroon-5.html,ange
2,http://www.metrolyrics.com/animal-remix-lyrics...,http://www.metrolyrics,animal-remix-lyrics-maroon-5.html,anim


#### Coldplay

In [14]:
coldplay_df = remove_duplicate_links (coldplay, 4)

In [15]:
coldplay_df.shape

(183, 4)

##### Remove other duplicate after visual inspection

In [16]:
coldplay_df = coldplay_df[coldplay_df['title_short'] != 'son-'].copy()

In [17]:
coldplay_df = coldplay_df[coldplay_df['title_short'] != 'dona'].copy()

In [18]:
coldplay_df = coldplay_df[coldplay_df['title_short'] != 'till'].copy()

In [19]:
coldplay_df = coldplay_df[coldplay_df['title_short'] != 'upup'].copy()

In [20]:
coldplay_df.shape

(179, 4)

##### Create linklists

In [21]:
def df_to_linklist(df, s):
    """
    Turns a column (series) of a given DataFrame (df) into a list and returns it.
    """
    linklist = df[s].tolist()
    return linklist

In [22]:
maroon_linklist = df_to_linklist(maroon_df, 'url')

In [23]:
coldplay_linklist = df_to_linklist(coldplay_df, 'url')

In [24]:
#maroon_linklist

In [25]:
#coldplay_linklist

### Loop through both lists and extract lyrics from the respective pages ( + save lyrics to disk)

#### Maroon 5

In [26]:
def get_lyrics(links, artist_name):
    """Given a list of song urls and the name of the artist (string),
    returns a list of the lyrics and save them to disc"""
   
    lyrics = []
    artist = artist_name.replace(' ', '_')
    for link in links:
        response = requests.get(f'{link}')
        soup = BeautifulSoup(markup=response.text)
        lyrics_body = soup.find('div', attrs = {'id':'lyrics-body-text'})
        lyrics_paragraph = []
        for verse in lyrics_body.find_all('p', attrs = {'class' : 'verse'}):
            lyrics_paragraph.append(verse.text)
        lyrics.append((' '.join(lyrics_paragraph)))
    df = pd.DataFrame(data=lyrics, columns = ['lyrics'])
    df.to_html(f'{artist}/{artist}_lyrics')
    return lyrics

In [27]:
maroon_lyrics = get_lyrics(maroon_linklist, 'Maroon 5')

In [28]:
#maroon_lyrics

#### Coldplay

In [29]:
coldplay_lyrics = get_lyrics(coldplay_linklist, 'Coldplay')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
#BASE_URL.format(row['prefix_1'], year, row['prefix_2'])

### Load lyrics into a CORPUS / data frame and preprocess data

- clean up (RegEx)

- remove punctuation
- make everything lowercase (or use spacy)

In [None]:
CORPUS = maroon_lyrics

In [None]:
CORPUS = [s.lower() for s in CORPUS]

In [None]:
LABELS = ['maroon 5'] * 128 + ['coldplay'] * 179

In [None]:
#nlp = spacy.load('en_core_web_md')

In [None]:
#text = nlp(text_original)

In [None]:
#for token in text[:15]:
#    print(token, token.lemma_)

In [None]:
#for token in text[:15]:
#    print(token, token.is_stop)

**create own list of stopwords**: https://github.com/explosion/spaCy/blob/master/spacy/lang/en/stop_words.py

In [None]:
# def spacy_cleaner(document):
#     tokenize_doc = nlp(document)
#     new_doc = []
#     for token in tokenize_doc:
#         if not token.is_stop and token.is_alpha:
#             new_doc.append(token.lemma_)
#     return new_doc

In [None]:
# import spacy
# from dframcy import DframCy

# nlp = spacy.load('en_core_web_sm')
# dframcy = DframCy(nlp)
# doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')
# annotation_dataframe = dframcy.to_dataframe(doc)

---

## 3. Train-test-split

---

## 4. Feature engineering

- do lemmatization
- remove stop words
- check apostrophes (--> replace words that were cut by them?)
- feature selection techniques that we've already seen (e.g. feature selection, feature importance, dimensionality reduction(?))

---

## 5. Train model

---

## 6. Cross-validation

---

## 7. Get test scores