## Text Classification Model on Song Lyrics

### Project Goals: 
    Extracting lyrics dataset and build a classification model which could predict an artist from a piece of text 

In [67]:
import requests
import re
from bs4 import BeautifulSoup
import os


### Songs ULRL

In [68]:
url_fassie = 'https://www.lyrics.com/artist/Brenda-Fassie/21870'
url_carey = 'https://www.lyrics.com/artist/Mariah-Carey/62404'

### Convert html to text and to bs object

In [69]:
def get_soup(url):
    r = requests.get(url)
    text = r.text
    soup = BeautifulSoup(text, "html.parser")
    return soup

In [70]:
soup_fassie = get_soup(url_fassie)
soup_carey = get_soup(url_carey)

### Requesting and getting the list of lyrics links

In [71]:
def get_lyrics_links(soup):
    '''Returns the list of lyric links.'''
    list_of_links = []
    for a in soup.find_all('a', href = True):
        href = a['href']
        if '/lyric/'in href:
            list_of_links.append('https://www.lyrics.com'+href)
    return list_of_links

In [72]:
fassie_list_of_links = get_lyrics_links(soup_fassie)
carey_list_of_links = get_lyrics_links(soup_carey)

### Downloading and saving the song lyrics

In [73]:
def get_path(links):
    for link in links:
        artist = re.sub("\+", "_", link.split("/")[-2])
        directory = f"../data/{artist}/"
    return directory

In [34]:
import time
from tqdm import tqdm
def downloading_and_saving_lyrics(links):
    '''a function that request, download and save song lyrics.'''
    for link in tqdm(links):
        artist = re.sub("\+", "_", link.split("/")[-2])
        song = link.split("/")[-1]
        songname =  f"{directory}/"+re.sub("\+", "_", f"{song}.txt")
        r_soup = BeautifulSoup(requests.get(link).text, "html.parser")
        try:
            lyric1 = r_soup.find("pre", {"id":"lyric-body-text"}).text
            with open(songname, "w") as file:
                file.write(lyric1)
        except AttributeError: 
            print(link)
        time.sleep(5)

In [31]:
downloading_and_saving_lyrics(carey_list_of_links[8:9])

100%|████████████████████████████████████████████| 1/1 [00:06<00:00,  6.12s/it]


### Creating a corpus / bag of Words with song lyrics

In [74]:
import os
corpus_list = []
def make_corpus_list(folder):
    '''a function that create opens saved song lyrics (from their respective folders),
    and adding them to a list (or 2 lists?).'''
    for lyric in os.listdir(f"{directory}/"):
         corpus = open(f"{directory}/" + lyric).read()
         corpus_list.append(corpus)
    return corpus_list

In [75]:
directory = get_path(carey_list_of_links)

In [76]:
import os
fassie_corpus_list = []
carey_corpus_list = []

for fn in os.listdir('../data/Brenda_Fassie/'):
     corpus_fassie = open('../data/Brenda_Fassie/' + fn).read()
     fassie_corpus_list.append(corpus_fassie)
    
for fn in os.listdir('../data/Mariah_Carey/'):
     corpus_carey = open('../data/Mariah_Carey/' + fn).read()
     carey_corpus_list.append(corpus_carey)
    

In [77]:
corpus_list = fassie_corpus_list + carey_corpus_list
labels = ["fassie"]*17 + ["carey"]*17

### Transforming the corpus/words into a matrix using count Vectorize

In [78]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.8)
vectorizer.fit(corpus_list)
X= vectorizer.transform(corpus_list)

In [80]:
X

<34x883 sparse matrix of type '<class 'numpy.int64'>'
	with 1434 stored elements in Compressed Sparse Row format>

X_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names(), index=labels)
X_df.head()

### Normalises the word counts using TF_IDF 

In [81]:
from sklearn.feature_extraction.text import TfidfTransformer
tf = TfidfTransformer() 
X_norm = tf.fit_transform(X)

In [82]:
X_norm_df=pd.DataFrame(X_norm.todense(), columns=vectorizer.get_feature_names_out(), index=labels)

### Classification models

In [83]:
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score


In [84]:
X=X_norm_df
y=labels
X_train, X_test, y_train, y_test = train_test_split(X,y)

#### Logistic Regression


In [85]:
l_m = LogisticRegression()
l_m.fit(X_train, y_train)
l_m.score(X_test, y_test),l_m.score(X_train, y_train)

(0.5555555555555556, 1.0)

In [86]:
l_m.predict(X_train)

array(['carey', 'carey', 'carey', 'fassie', 'fassie', 'fassie', 'carey',
       'fassie', 'carey', 'fassie', 'fassie', 'carey', 'carey', 'carey',
       'fassie', 'fassie', 'carey', 'carey', 'carey', 'fassie', 'fassie',
       'carey', 'carey', 'fassie', 'fassie'], dtype='<U6')

In [87]:
X_norm

<34x883 sparse matrix of type '<class 'numpy.float64'>'
	with 1434 stored elements in Compressed Sparse Row format>

#### Naive Bayes in sklearn


In [88]:
from sklearn.naive_bayes import MultinomialNB

nm_m = MultinomialNB()
nm_m.fit(X, y)
nm_m.score(X, y)

1.0

#### Random forest classifier

In [92]:
from sklearn.ensemble import RandomForestClassifier

# n_est should go up, max_depth can go down
rf = RandomForestClassifier(n_estimators=20, 
                            max_depth=4, 
                            random_state=10)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, n_estimators=20, random_state=10)

In [93]:
ypred_rf = rf.predict(X_test)

In [98]:
rf.score(X_train, y_train), rf.score(X_test, y_test),

(1.0, 0.7777777777777778)