### Task - Create a model that predicts whether a sentence from a song belongs to 50 Cent or Britney Spears

###### This file covers scraping and cleaning the list of songs for 50 Cent. The data will be shared with a colleague, who will in turn share the songs of Britney Spears.

In [19]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

### Step 1 - Obtain Song Lyrics from Urls

In [20]:
# Download the lyrics.com artist page that lists the 50 Cent songs.
# A browser-like User-Agent header is sent along with the request.

s_url = 'https://www.lyrics.com/artist/50-Cent/372609'
s_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# The timeout keeps the cell from hanging forever on a stalled connection.
s_response = requests.get(s_url, headers=s_headers, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty song list in the cells below.
s_response.raise_for_status()

In [21]:
# Parse the downloaded artist page with Python's built-in HTML parser
# so that its elements can be searched.

soup = BeautifulSoup(markup=s_response.text, features='html.parser')

In [22]:
# Collect, for every song cell on the artist page, the text of its first
# link (the song title) and that link's href (the lyric page).

song_links = soup.find_all('td', attrs={'class': 'tal qx'})
song_titles = [cell.a.text for cell in song_links]
lyric_links = [cell.a['href'] for cell in song_links]


In [23]:
# Build a table with one row per scraped song: its title and the full
# lyric-page URL (the site host is prepended to each scraped href).

fifty_cent = pd.DataFrame({
    'song_title': song_titles,
    'link': [f'https://www.lyrics.com{href}' for href in lyric_links],
})
fifty_cent

Unnamed: 0,song_title,link
0,Remember the Name,https://www.lyrics.com/lyric/36259009/50+Cent/...
1,Yeah Yeah,https://www.lyrics.com/lyric/35373551/50+Cent/...
2,In Da Club,https://www.lyrics.com/lyric/35567415/50+Cent/...
3,I Get High,https://www.lyrics.com/lyric/35164479/50+Cent/...
4,If I Can't,https://www.lyrics.com/lyric/35098290/50+Cent/...
...,...,...
1681,In Da Club (Shake Sh*t Up),https://www.lyrics.com/lyric/17222602/50+Cent/...
1682,In da Club,https://www.lyrics.com/lyric/19185906/50+Cent/...
1683,Stunt 101,https://www.lyrics.com/lyric/19185805/50+Cent/...
1684,We Up [Edited],https://www.lyrics.com/lyric/29260338/50+Cent/...


In [24]:
# Look for duplicates: list every row whose title is exactly 'In Da Club'
fifty_cent.loc[fifty_cent['song_title'].eq('In Da Club')]

Unnamed: 0,song_title,link
2,In Da Club,https://www.lyrics.com/lyric/35567415/50+Cent/...
7,In Da Club,https://www.lyrics.com/lyric/35168662/50+Cent/...
9,In Da Club,https://www.lyrics.com/lyric/34804707/50+Cent/...
30,In Da Club,https://www.lyrics.com/lyric/34595206/50+Cent/...
35,In Da Club,https://www.lyrics.com/lyric/34218461/50+Cent/...
37,In Da Club,https://www.lyrics.com/lyric/34549272/50+Cent/...
41,In Da Club,https://www.lyrics.com/lyric/31967113/50+Cent/...
136,In Da Club,https://www.lyrics.com/lyric/35286484/50+Cent/...
141,In Da Club,https://www.lyrics.com/lyric/31566976/50+Cent/...
145,In Da Club,https://www.lyrics.com/lyric/33182333/50+Cent/...


In [25]:
# Keep only the first row for each song title, then renumber the
# remaining rows from 0 (the old row labels are discarded).
fifty_cent_dropped = (
    fifty_cent
    .drop_duplicates(subset='song_title', keep='first')
    .reset_index(drop=True)
)
fifty_cent_dropped


Unnamed: 0,song_title,link
0,Remember the Name,https://www.lyrics.com/lyric/36259009/50+Cent/...
1,Yeah Yeah,https://www.lyrics.com/lyric/35373551/50+Cent/...
2,In Da Club,https://www.lyrics.com/lyric/35567415/50+Cent/...
3,I Get High,https://www.lyrics.com/lyric/35164479/50+Cent/...
4,If I Can't,https://www.lyrics.com/lyric/35098290/50+Cent/...
...,...,...
526,Hit You Up,https://www.lyrics.com/lyric/10345621/50+Cent/...
527,MJB Da MVP [Alternate Version; Bonus Track],https://www.lyrics.com/lyric/13441239/50+Cent/...
528,Could've Been You [Bonus Track],https://www.lyrics.com/lyric/29458357/50+Cent/...
529,Nah I'm Talking 'Bout,https://www.lyrics.com/lyric/31649270/50+Cent/...


In [26]:
# Remove punctuation from the song titles (the titles are later used as
# file names, and characters such as "/" would break the path).
import string

# re.escape turns every punctuation mark into a literal inside the
# character class. Unescaped, the "\]" pair in string.punctuation is read
# as an escaped "]", so backslashes were never removed.
pattern = r'[' + re.escape(string.punctuation) + ']'

# Replace the whole column in one vectorized call. Element-wise chained
# assignment (df['col'][i] = ...) triggers SettingWithCopyWarning and is
# silently not applied when pandas Copy-on-Write is enabled.
fifty_cent_dropped['song_title'] = fifty_cent_dropped['song_title'].str.replace(pattern, '', regex=True)
 


In [51]:
# Download every song page, extract the lyric text and save it to
# song/<song_title>.txt (the folder is created when needed).

import os

for i in range(len(fifty_cent_dropped.link)):
    filename = f"song/{fifty_cent_dropped.song_title[i]}.txt"
    # Skip songs that are already on disk, so an interrupted run can simply
    # be re-run instead of hard-coding the row index to resume from.
    if os.path.exists(filename):
        continue
    try:
        page = requests.get(fifty_cent_dropped.link[i], timeout=30)
        page.raise_for_status()
        lyrics = BeautifulSoup(page.text, 'html.parser')
        # Extract the text BEFORE opening the file: a page without the lyric
        # block raises AttributeError here, so no empty file is left behind.
        lyric_text = lyrics.find('div', attrs={'class': 'lyric clearfix'}).pre.text
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w') as f:
            f.write(lyric_text)
    except (requests.RequestException, AttributeError, OSError) as err:
        # Best effort: report the song that failed and carry on with the next.
        print(f"skipped {fifty_cent_dropped.song_title[i]!r}: {err}")
    finally:
        # Pause after every request attempt to avoid hammering the site.
        time.sleep(3)


### Step 2 - Extract Lyrics from text files into a DataFrame, and clean Data

- the original DataFrame holds 531 URLs
- 502 text files were created (errors were ignored via try/except)
- despite the removal of duplicate song titles in Step 1, some songs are repeated under slightly different titles
- duplicated songs will not be included in the final DataFrame
- empty text files will not be included
- final: 291 songs included

In [103]:
# Read the saved lyric file of every song title.
# song_text collects the file contents and song_name the matching titles;
# both lists are appended to together, so they stay aligned.
# Titles whose file does not exist (e.g. removed files) are skipped.

song_text = []
song_name = []
for title in fifty_cent_dropped.song_title:
    filename = f"song/{title}.txt"
    try:
        with open(filename, "r") as f:
            text = f.read()
    except FileNotFoundError:
        # Only a missing file is expected here; any other error should
        # surface instead of being silently swallowed.
        continue
    song_text.append(text)
    song_name.append(title)



In [104]:
# Number of lyric texts and number of song names collected above.
# TODO: the output shows 314 while the Step 2 notes say 291 songs -- check why,
# and find a better way to exclude duplicated files.
len(song_text), len(song_name)

(314, 314)

In [171]:
# Turn the list of song texts into one DataFrame with a row per lyric line
# (column 0), plus the song name and the artist name on every row.
# The per-song frames are collected in a list and concatenated once at the
# end; calling pd.concat inside the loop re-copies all previous rows on
# every iteration (quadratic time).

song_frames = []
for title, text in zip(song_name, song_text):
    lines = [line.strip() for line in text.split("\n")]   # one stripped entry per lyric line
    song_frame = pd.DataFrame(lines)                        # single column labelled 0
    song_frame['song_name'] = title
    song_frame['artist_name'] = '50 Cent'
    song_frames.append(song_frame)

# pd.concat raises on an empty list, so fall back to an empty frame
# (the result the original loop produced when there were no songs).
df_final = pd.concat(song_frames, axis=0) if song_frames else pd.DataFrame()


In [172]:
# Give the column labelled 0 (the lyric lines) a descriptive name
df_final = df_final.rename(columns={0: 'lyrics_row'})
df_final.head()

Unnamed: 0,lyrics_row,song_name,artist_name
0,"Yeah, I was born a misfit, grew up ten miles f...",Remember the Name,50 Cent
1,,Remember the Name,50 Cent
2,"Wanted to make it big, I wished it to existence",Remember the Name,50 Cent
3,,Remember the Name,50 Cent
4,"I never was a sick kid, always dismissed quick",Remember the Name,50 Cent


In [173]:
# Renumber the rows 0..n-1, discarding the old row labels
df_final = df_final.reset_index(drop=True)
df_final.head()

Unnamed: 0,lyrics_row,song_name,artist_name
0,"Yeah, I was born a misfit, grew up ten miles f...",Remember the Name,50 Cent
1,,Remember the Name,50 Cent
2,"Wanted to make it big, I wished it to existence",Remember the Name,50 Cent
3,,Remember the Name,50 Cent
4,"I never was a sick kid, always dismissed quick",Remember the Name,50 Cent


In [174]:
# (rows, columns) of df_final at this point, before the empty-row cleanup below
df_final.shape

(42940, 3)

In [175]:
# Sanity check: show the type of df_final
type(df_final)

pandas.core.frame.DataFrame

In [176]:
# Delete empty rows
# One boolean mask keeps only the rows whose lyric text is non-empty.
# This replaces a row-by-row in-place drop, which was slow on tens of
# thousands of rows and could not be re-run (it looked up labels it had
# already dropped). Row labels of the kept rows are left unchanged.
df_final = df_final[df_final['lyrics_row'].str.len() > 0]


In [177]:
# (rows, columns) of df_final after the empty-row cleanup above
df_final.shape

(22981, 3)

In [179]:
# Renumber the remaining rows 0..n-1 so the labels are gap-free again
df_final = df_final.reset_index(drop=True)
df_final.head()

Unnamed: 0,lyrics_row,song_name,artist_name
0,"Yeah, I was born a misfit, grew up ten miles f...",Remember the Name,50 Cent
1,"Wanted to make it big, I wished it to existence",Remember the Name,50 Cent
2,"I never was a sick kid, always dismissed quick",Remember the Name,50 Cent
3,"""Stick to singing, stop rappin'"", like it's Ch...",Remember the Name,50 Cent
4,"And if you're talkin' money, then my conversat...",Remember the Name,50 Cent
