# Music / Lyric Analyzer

## Webscrapping the songs from Wikipedia (1959 - 2021)
#### Notice: There will be a part 2, where i fetch all the lyrics from a site called https://www.mldb.org

OBS:! I saw that most songs have a link. With some aditional data. Genre, Length, Label, Released.
Also all most of singers actually!

In [283]:
import bs4
import requests

def get_top100_song_year(year=2021):
    """This function will return a dictionary with any of the official Billboard American top 100 end of the year lists.
    The top 100 list was introduced in 1959. Before that it was a top 50 only. Returns by default 2021.
    link: https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2021"""
    result_dict = dict()
    
    url = "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_" + str(year)
    
    r = requests.get(url)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    
    table = soup.select('.wikitable > tbody > tr')
    
    for el in table:
        td = el.select('td')
        if(len(td) > 0):
            number = td[0].text.rstrip('\n')
            title = td[1].text.rstrip('\n')
            artist = extract_artists(td[2])
            title_url = extract_title_url(td[1])
            result_dict[number] = [title[1:-1], artist[0], artist[1], title_url]
    
    return result_dict

def extract_artists(element):
    """This function will return a list of the artists that have made the song. Only 2 artists will be applied,
    if more than two artists have collaborated. 
    """
    list_of_artists = [x.text.rstrip('\n') for x in element]
            
    result_list = split_featuring(list_of_artists)  
    
    # Need to add a blank spot if there is no featuring artist.
    if(len(result_list) < 2):
        result_list.append("")
        
    # Some featurings starts with ' & ' and some ends with 'and'. Quick fix:
    if(' and ' in result_list[1][-5:]):
        result_list[1] = result_list[1][:-5]
    if(' & ' in result_list[1][:3]):
        result_list[1] = result_list[1][3:]
        
    return [artist for artist in result_list][:2]


def split_featuring(list):
        """Many of the songs have featuring artists, sometimes even an additional ' and ' artist.
        This function splits them and returns a new list.
        
        Examples:
        1. Blackstreet featuring Mýa, Mase and Blinky Blink   # featuring + and + ,
        2. Silk Sonic (Bruno Mars and Anderson .Paak)         # Tuples
        3. SpotemGottem featuring Pooh Shiesty or DaBaby      # featuring + or
        4. Gerry and the Pacemakers                           # ' and the ' was a common band name in the old days
        5. Puff Daddy & the Family featuring The Notorious B.I.G.  # & the Family.. Gave some troubles.. 
        """
        
        new_list = []
        
        for artist in list:
            if(' featuring ' in artist):
                [new_list.append(x) for x in artist.split(' featuring ')]
            elif(' and ' in artist and ' and the ' not in artist ):
                [new_list.append(x) for x in artist.split(' and ')]
            elif(' or ' in artist):
                [new_list.append(x) for x in artist.split(' or ')] 
            else:
                new_list.append(artist)
                
        return [x for x in new_list if
                x != ' and ' and x != '' and x != ' (' and x != ')' and 
                x != ', ' and x != ' & 'and x != ' with ' and x != ' & the Family']
            

def extract_title_url(element):
    """This function will return the url for the song. This will be used later on to extract more data about each song."""
    
    if(element.find('a')):
        return element.select('a')[0].get('href')
    else:
        return ""

#get_top100_song_year(1998)

In [179]:
def get_all_top100_song_period(start=1959, end=2021):
    """This function will return..."""
    result_dict = dict()
    for i in range(start, end + 1):
        result_dict[i] = get_top100_song_year(i)
    return result_dict

        
billboard_data = get_all_top100_song_period(1959, 2021)
print("Size of Data:",len(billboard_data))
print("Example (2021):")
billboard_data.get(2021)


Size of Data: 63
Example (2021):


{'1': ['Levitating', 'Dua Lipa', '', '/wiki/Levitating_(song)'],
 '2': ['Save Your Tears',
  'The Weeknd',
  'Ariana Grande',
  '/wiki/Save_Your_Tears'],
 '3': ['Blinding Lights', 'The Weeknd', '', '/wiki/Blinding_Lights'],
 '4': ['Mood', '24kGoldn', 'Iann Dior', '/wiki/Mood_(song)'],
 '5': ['Good 4 U', 'Olivia Rodrigo', '', '/wiki/Good_4_U'],
 '6': ['Kiss Me More', 'Doja Cat', 'SZA', '/wiki/Kiss_Me_More'],
 '7': ['Leave the Door Open',
  'Silk Sonic',
  'Bruno Mars',
  '/wiki/Leave_the_Door_Open'],
 '8': ['Drivers License',
  'Olivia Rodrigo',
  '',
  '/wiki/Drivers_License_(song)'],
 '9': ['Montero (Call Me by Your Name)',
  'Lil Nas X',
  '',
  '/wiki/Montero_(Call_Me_by_Your_Name)'],
 '10': ['Peaches',
  'Justin Bieber',
  'Daniel Caesar',
  '/wiki/Peaches_(Justin_Bieber_song)'],
 '11': ['Butter', 'BTS', '', '/wiki/Butter_(song)'],
 '12': ['Stay',
  'The Kid Laroi',
  'Justin Bieber',
  '/wiki/Stay_(The_Kid_Laroi_and_Justin_Bieber_song)'],
 '13': ['Deja Vu',
  'Olivia Rodrigo',
  '

## Turn it into a dataframe (IMPORTANT)

In [180]:
import pandas as pd

def make_data_into_dataframe(billboard_data):
    data_list = []
    for year, value in billboard_data.items():
        for place, song in value.items():
            list_song = [year, place, song[0], song[1], song[2], song[3]]
            data_list.append(list_song)
    return pd.DataFrame(data_list, columns=['Year', 'Place', 'Title', 'Artist', 'Featuring','Title_url'])
   
make_data_into_dataframe(billboard_data)

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1959,1,The Battle of New Orleans,Johnny Horton,,/wiki/The_Battle_of_New_Orleans
1,1959,2,Mack the Knife,Bobby Darin,,/wiki/Mack_the_Knife
2,1959,3,Personality,Lloyd Price,,/wiki/Personality_(Lloyd_Price_song)
3,1959,4,Venus,Frankie Avalon,,/wiki/Venus_(Frankie_Avalon_song)
4,1959,5,Lonely Boy,Paul Anka,,/wiki/Lonely_Boy_(Paul_Anka_song)
...,...,...,...,...,...,...
6296,2021,96,Things a Man Oughta Know,Lainey Wilson,,/wiki/Things_a_Man_Oughta_Know
6297,2021,97,Throat Baby (Go Baby),BRS Kash,,/wiki/Throat_Baby_(Go_Baby)
6298,2021,98,Tombstone,Rod Wave,,/wiki/Tombstone_(song)
6299,2021,99,Drinkin' Beer. Talkin' God. Amen.,Chase Rice,Florida Georgia Line,/wiki/Drinkin%27_Beer._Talkin%27_God._Amen.


## Save as CSV. So i dont have to scrape it everyday..

In [181]:
#df_to_csv = pd.DataFrame.from_dict(billboard_data) 
make_data_into_dataframe(billboard_data).to_csv ('data/raw_top100_1959_2021.csv', index = False, header=True)

## Load Data

In [1]:
import pandas as pd
df = pd.read_csv('data/raw_top100_1959_2021.csv' )
df

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1959,1,The Battle of New Orleans,Johnny Horton,,/wiki/The_Battle_of_New_Orleans
1,1959,2,Mack the Knife,Bobby Darin,,/wiki/Mack_the_Knife
2,1959,3,Personality,Lloyd Price,,/wiki/Personality_(Lloyd_Price_song)
3,1959,4,Venus,Frankie Avalon,,/wiki/Venus_(Frankie_Avalon_song)
4,1959,5,Lonely Boy,Paul Anka,,/wiki/Lonely_Boy_(Paul_Anka_song)
...,...,...,...,...,...,...
6296,2021,96,Things a Man Oughta Know,Lainey Wilson,,/wiki/Things_a_Man_Oughta_Know
6297,2021,97,Throat Baby (Go Baby),BRS Kash,,/wiki/Throat_Baby_(Go_Baby)
6298,2021,98,Tombstone,Rod Wave,,/wiki/Tombstone_(song)
6299,2021,99,Drinkin' Beer. Talkin' God. Amen.,Chase Rice,Florida Georgia Line,/wiki/Drinkin%27_Beer._Talkin%27_God._Amen.


## Testing the data:

In [56]:
df[df['Featuring'] == 'the Pacemakers']

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
548,1964,49,Don't Let the Sun Catch You Crying,Gerry,the Pacemakers,/wiki/Don%27t_Let_the_Sun_Catch_You_Crying
588,1964,89,How Do You Do It?,Gerry,the Pacemakers,/wiki/How_Do_You_Do_It%3F
642,1965,43,Ferry Cross the Mersey,Gerry,the Pacemakers,/wiki/Ferry_Cross_the_Mersey


In [80]:
df[df['Artist'] == 'Gerry and the Pacemakers'] # Fixed

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
548,1964,49,Don't Let the Sun Catch You Crying,Gerry and the Pacemakers,,/wiki/Don%27t_Let_the_Sun_Catch_You_Crying
588,1964,89,How Do You Do It?,Gerry and the Pacemakers,,/wiki/How_Do_You_Do_It%3F
642,1965,43,Ferry Cross the Mersey,Gerry and the Pacemakers,,/wiki/Ferry_Cross_the_Mersey


In [87]:
df[df['Artist'] == 'The Mamas & the Papas']

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
700,1966,1,California Dreamin',The Mamas & the Papas,,/wiki/California_Dreamin%27
723,1966,24,"Monday, Monday",The Mamas & the Papas,,"/wiki/Monday,_Monday"
861,1967,62,Dedicated to the One I Love,The Mamas & the Papas,,/wiki/Dedicated_to_the_One_I_Love


In [88]:
df[df['Title'] == 'Stay']

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
197,1960,98,Stay,Maurice Williams and the Zodiacs,,/wiki/Stay_(Maurice_Williams_song)
3345,1992,45,Stay,Shakespears Sister,,/wiki/Stay_(Shakespears_Sister_song)
3580,1994,80,Stay,Eternal,,/wiki/Stay_(Eternal_song)
5413,2013,13,Stay,Rihanna,Mikky Ekko,/wiki/Stay_(Rihanna_song)
5817,2017,17,Stay,Zedd,Alessia Cara,/wiki/Stay_(Zedd_and_Alessia_Cara_song)
6212,2021,12,Stay,The Kid Laroi,Justin Bieber,/wiki/Stay_(The_Kid_Laroi_and_Justin_Bieber_song)


In [90]:
df[df['Title'] == 'Venus']

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
3,1959,4,Venus,Frankie Avalon,,/wiki/Venus_(Frankie_Avalon_song)
1133,1970,33,Venus,Shocking Blue,,/wiki/Venus_(Shocking_Blue_song)
2738,1986,38,Venus,Bananarama,,/wiki/Venus_(Shocking_Blue_song)


In [91]:
df[df['Artist'] == 'The Beatles']

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
500,1964,1,I Want to Hold Your Hand,The Beatles,,/wiki/I_Want_to_Hold_Your_Hand
501,1964,2,She Loves You,The Beatles,,/wiki/She_Loves_You
512,1964,13,A Hard Day's Night,The Beatles,,/wiki/A_Hard_Day%27s_Night_(song)
513,1964,14,Love Me Do,The Beatles,,/wiki/Love_Me_Do
515,1964,16,Please Please Me,The Beatles,,/wiki/Please_Please_Me_(song)
539,1964,40,Twist and Shout,The Beatles,,/wiki/Twist_and_Shout
551,1964,52,Can't Buy Me Love,The Beatles,,/wiki/Can%27t_Buy_Me_Love
554,1964,55,Do You Want to Know a Secret,The Beatles,,/wiki/Do_You_Want_to_Know_a_Secret
594,1964,95,I Saw Her Standing There,The Beatles,,/wiki/I_Saw_Her_Standing_There
606,1965,7,Help!,The Beatles,,/wiki/Help!_(song)


#### Testing featuring after refactoring the web scrapper:

In [106]:
# How many songs have a featurin artist for the song:
print("Featuring artists: ",df['Artist'].str.contains('featuring').sum())
print("AND in the name: ",df['Artist'].str.contains('and').sum()) # OBS.: This could also be Andrew, Andy etc.. not valid!

Featuring artists:  0
AND in the name:  227


In [183]:
print("Featuring artists: ",df['Featuring'].str.contains('&').sum())

Featuring artists:  4


In [167]:
test_featuring_for_and[test_featuring_for_and['Featuring'].str.contains(' and ')]

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
6149,2020,49,For the Night,Pop Smoke,Lil Baby and DaBaby,/wiki/For_the_Night
6221,2021,21,34+35,Ariana Grande,Doja Cat and,/wiki/34%2B35
6259,2021,59,Every Chance I Get,DJ Khaled,Lil Baby and Lil Durk,/wiki/Every_Chance_I_Get_(song)
6260,2021,60,Essence,Wizkid,Justin Bieber and,/wiki/Essence_(song)


In [185]:
# It's usually used if there is a third featuring person/band. DRAKE featuring 21 SAVAGE and PROJECT PAT
# Might have to change the way i crawl the data!

test_featuring_for_and = df.dropna(subset='Featuring')

test_featuring_for_and[test_featuring_for_and['Featuring'].str.contains(' and')]
#df[df['Featuring'].str.contains(' and')]

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
6149,2020,49,For the Night,Pop Smoke,Lil Baby and DaBaby,/wiki/For_the_Night
6259,2021,59,Every Chance I Get,DJ Khaled,Lil Baby and Lil Durk,/wiki/Every_Chance_I_Get_(song)


In [3]:
#testdf['Year'].values.tolist()

#for x in test_featuring_for_and.values:
#    print(x[0], x[1], x[4])

In [102]:
df[df['Artist'] == 'Dionne'] # Not perfect.. This should have been Dionne Warwick and a bunch of featuring artists

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
2701,1986,1,That's What Friends Are For,Dionne,Friends (,/wiki/That%27s_What_Friends_Are_For


In [96]:
df[df['Title'] == 'Leave the Door Open'] # This title's artist got '()' in the name....FIXED

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
6207,2021,7,Leave the Door Open,Silk Sonic,Bruno Mars,/wiki/Leave_the_Door_Open


In [187]:
df[df['Artist'] == 'Puff Daddy'] # & the Family is not valid...

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
3803,1997,3,I'll Be Missing You,Puff Daddy,Faith Evans,/wiki/I%27ll_Be_Missing_You
3805,1997,5,Can't Nobody Hold Me Down,Puff Daddy,Mase,/wiki/Can%27t_Nobody_Hold_Me_Down
3919,1998,19,Been Around the World,Puff Daddy,The Notorious B.I.G.,/wiki/Been_Around_the_World
3947,1998,47,Come with Me,Puff Daddy,Jimmy Page,/wiki/Come_with_Me_(Puff_Daddy_song)
3968,1998,68,Victory,Puff Daddy,The Notorious B.I.G.,/wiki/Victory_(Puff_Daddy_song)
4095,1999,95,Satisfy You,Puff Daddy,R. Kelly,/wiki/Satisfy_You_(Puff_Daddy_song)


In [412]:
df[df['Artist'] == 'Prince']

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
2195,1980,95,I Wanna Be Your Lover,Prince,,/wiki/I_Wanna_Be_Your_Lover
2425,1983,25,Little Red Corvette,Prince,,/wiki/Little_Red_Corvette
2441,1983,41,1999,Prince,,/wiki/1999_(Prince_song)
2501,1984,1,When Doves Cry,Prince,,/wiki/When_Doves_Cry
2651,1985,51,Raspberry Beret,Prince,the Revolution,/wiki/Raspberry_Beret
2719,1986,19,Kiss,Prince,The Revolution,/wiki/Kiss_(Prince_song)
2838,1987,38,U Got the Look,Prince,,/wiki/U_Got_the_Look
2860,1987,60,Sign o' the Times,Prince,,/wiki/Sign_o%27_the_Times_(song)
3044,1989,44,Batdance,Prince,,/wiki/Batdance
3198,1990,98,Thieves in the Temple,Prince,,/wiki/Thieves_in_the_Temple


In [92]:
df[df['Title'] == 'Peaches'] # This title's artist(s) ot featuring + and in the name. FIXED

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
6210,2021,10,Peaches,Justin Bieber,Daniel Caesar,/wiki/Peaches_(Justin_Bieber_song)


In [97]:
print("Number of songs: ",len(df))
print("Years: ", len(df["Year"].drop_duplicates()))

Number of songs:  6301
Years:  63


In [98]:
# Find all the no.1 songs

df[df['Place'] == "1"]

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1959,1,The Battle of New Orleans,Johnny Horton,,/wiki/The_Battle_of_New_Orleans
100,1960,1,Theme from A Summer Place,Percy Faith,,/wiki/Theme_from_A_Summer_Place
200,1961,1,Tossin' and Turnin',Bobby Lewis,,/wiki/Tossin%27_and_Turnin%27
300,1962,1,Stranger on the Shore,Acker Bilk,,/wiki/Stranger_on_the_Shore
400,1963,1,Sugar Shack,Jimmy Gilmer and the Fireballs,,/wiki/Sugar_Shack
...,...,...,...,...,...,...
5801,2017,1,Shape of You,Ed Sheeran,,/wiki/Shape_of_You
5901,2018,1,God's Plan,Drake,,/wiki/God%27s_Plan_(song)
6001,2019,1,Old Town Road,Lil Nas X,Billy Ray Cyrus,/wiki/Old_Town_Road
6101,2020,1,Blinding Lights,The Weeknd,,/wiki/Blinding_Lights


In [99]:
songs_without_url = df[df['Title_url'].isnull()]

In [100]:
# Find songs that don't have a link.
            
print(len(songs_without_url), " out of ", len(df), " Songs.",  (len(songs_without_url) / len(df))*100, "%")
songs_without_url

74  out of  6301  Songs. 1.1744167592445642 %


Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
112,1960,13,Greenfields,The Brothers Four,,
139,1960,40,Way Down Yonder in New Orleans,Freddy Cannon,,
165,1960,66,It's Time to Cry,Paul Anka,,
181,1960,82,Lady Luck,Lloyd Price,,
182,1960,83,Step by Step,The Crests,,
...,...,...,...,...,...,...
4061,1999,61,If You,Silk,,
4081,1999,81,Faded Pictures,Case,Joe,
4289,2001,89,I'm a Thug,Trick Daddy,,
4388,2002,88,Anything,Jaheim,Next,


### Notes after testing

In [115]:
# Notice 1986: Dionne and Friends (Dionne Warwick, Gladys Knight, Elton John and Stevie Wonder)
# Need a a clever way to break that down.
# Also 2004, 2012 (and others) have "feature" or "and". Need to be able to break down aswell 
# 74 songs wont get extra data. Which is acceptable
# Paul McCartney need some attention. "Paul", "Paul M and the Wings", "Beatles"


## Get additional data from wikipedia link

In [96]:
import bs4
import requests
import re


def get_additional_song_data(link):
    result_dict = dict()
    
    
    #print("==================")
    #print(link)
    #print("==================")

    
    # All the data we want to extract (Will convert it into an Object!)
    released = ""
    genres = [""]
    length = ""
    label = ""
    total_labels = ""
    writer = [""]
    youtube = ""
    
    if(link != ''):
    
        url = "https://en.wikipedia.org"+link

        r = requests.get(url)
        r.raise_for_status()
        soup = bs4.BeautifulSoup(r.text, 'html.parser')

        # Some songs have more than one infobox. We only want the first.
        if(soup.select('.infobox')):
            infobox = soup.select('.infobox')
            table = infobox[0].select('tbody > tr')


            for el in table:


                if(el.find('th')):
                    th = el.select('th')
                    td = el.select('td')
                    if(len(td) > 0):
                        #print(th[0].text, td[0].text)
                        match th[0].text:
                            case "Released":
                                released =  extract_released_year(td[0])
                                #print("Released", extract_released_year(td[0]))
                            case "Genre":
                                genres = extract_genre(td[0])
                                #print("Genre:", extract_genre(td[0]))
                            case "Length":
                                length = extract_length(td[0])
                                #print("Length", extract_length(td[0]))
                            case "Label":
                                label_data = extract_label(td[0])
                                label = label_data[0]
                                total_labels = label_data[1]
                                #print("Label", extract_label(td[0]))
                            case "Songwriter(s)":
                                writer = extract_songwriter(td[0])
                                #print("Songwriter(s)", extract_songwriter(td[0]))

                # Search for YT link
                if(el.find('a', {'title': 'YouTube'})):
                    youtube = el.find('a', {'class': 'external'}).get('href')
                    #print(el.find('a', {'class': 'external'}).get('href'))


    # Make sure the lists have the correct size. NaN will be handle when converted to df.
    if(len(genres) < 3 ):
        genres.append('')
        genres.append('')
    if(len(writer) < 2):
        writer.append('')
                
    return [released, genres[0], genres[1], genres[2], length, label, total_labels, writer[0], writer[1], youtube]
    
    

def extract_released_year(element):
    """This function will return the year the song was released. But tests shows that is different scenarios. 
    1. Multiple release days, release etc., 
    2. Sometimes only the Year and Month(or Just the year), 
    
    I am using regex and split to make a list of 4 digit numbers and only returns index 0(Incase there was a rerelease later)
    """    
    text = element.text
    
    result_list = re.findall("\d{4}", text)
    
    
    #print("Extracted released year!")
    
    return(min(result_list))
    
    #years_reg = re.compile(r"\b(19|20)\d{2}\b")
        
    #return [year for year in element.text.split() if re.search(years_reg, year)][0]
    
                        
def extract_genre(element):
    """This function will return a list of up to 3 genres. But tests shows that is different scenarios. 
    1. Only one Genre, 
    2. Multiple genres in an unordered list(ul), 
    3. Multiple genres with a bookmark like this. Pop[1], Rock[2].
    
    Which is handle but simple if statements.
    """
    letters_reg = re.compile(r"[a-zA-Z]")
        

    if(element.find('a')):
        result_list = [x.text for x in element.select('a')]
    else:
        result_list = element.text.split()
    
    
    #print("Extracted genres!")
    
    return [genre.capitalize() for genre in result_list if re.search(letters_reg, genre)][:3]
    
    
def extract_length(element):
    """This function will return the length of the song. In some cases there is a single and an album version etc.,
    but i decided to return just the longest version. 
    1. The function splits the text elements into a list if it contains ':'.
    2. It cleans up the result for any letters or symbols(not ':' obviously).
    3. Return the max value."""

    # Keeping this until im sure new solution works!
  #  lengths = []

  #  if(element.find('li')):
  #      list_items = [x.text for x in element.select('li')]    
  #      for item in list_items:
  #          split_time = item.split(':')
  #          minutes = split_time[0]
  #          seconds = split_time[1][:2] # Only 2 digits
  #          lengths.append(minutes+':'+seconds)
  #  else:
  #      # c = character   l = length
  #      lengths = ["".join([c for c in l if int(c.isnumeric()) or c == ":"]) for l in element.text.split() if ":" in l] 

    result_list = []
    
    if(element.find('li')):
        lengths = [x.text for x in element.select('li')]
    else:
        lengths = [x for x in element.text.split() if ":" in x]
        
    for item in lengths:
        split_time = item.split(':')
        minutes = split_time[0]
        seconds = split_time[1][:2] # Only 2 digits
        time = "".join([char for char in minutes+':'+seconds if int(char.isnumeric()) or char == ":"])
        result_list.append(time)    

    
    #print("Extracted length!")
        
    return max(result_list)
    
def extract_label(element):
    """This function will return only the first mentioned Label. Some singers/song change labels for whatever reason.
    Sometimes up tp two or three times. For simplicity I only return the first mentioned and the amount of different
    labels that the song have had. Maybe we can use that information for something interresting."""
    
    if(element.find('a')):
        result_list = [x.text for x in element.select('a')]
    elif(element.find('li')):
        result_list = [x.text for x in element.select('li')]
    else:
        result_list = element.text.split()
    
    
    #print("Extracted Labels!")
    
    # Returning originale label and the amount of different labels.
    return [[label for label in result_list][0], len(result_list)]

def extract_songwriter(element):
    """This function will return only the first mentioned songwriter. Some singers/song change labels for whatever reason.
    Sometimes up tp two or three times. For simplicity I only return the first mentioned and the amount of different
    labels that the song have had. Maybe we can use that information for something interresting."""
    
    result_list = []
    list_of_writers = []
    
    #list_of_writers = [x.text.rstrip('\n') for x in element.select('li')]
    if(element.select('li')):
        list_of_writers = [x for x in element.select('li')]
    elif(element.select('a')):
        list_of_writers = [x for x in element.select('a')]
    else:
        result_list = [x for x in element]
    
    
    if(len(list_of_writers) > 0):
        for x in list_of_writers:
            if(x.find('a')):
                result_list.append(x.find('a').text)
            else:
                result_list.append(x.text)

    # Sometimes the songwriters are listed a bit weird. Ends up with: '[4]'.
    for w in result_list[:2]:
        if(re.search('\[|,', str(w)) or w == ""):
            #result_list = [ x for x in element.text.split(', ')]
            result_list = [ x for x in re.split(' & |, |,',element.text)]
        
    
    # Need to add a blank spot if there is only one writer.
    if(len(result_list) < 2):
        result_list.append("")
        
    
    
    
    #print("Extracted Songwriters!")
    
    #return [writers for writers in result_list][:2]
    
    # Removes the referal bracket, which sometimes appears: 'Chuckie Howard[4]'
    return [writer[:-3] if re.search('\[', str(writer)) else writer for writer in result_list][:2]
    
    


#get_additional_song_data("/wiki/Thank_You_(Boyz_II_Men_song)")
#get_additional_song_data("/wiki/%27Til_You_Do_Me_Right")
#get_additional_song_data("/wiki/Freak_like_Me")
#get_additional_song_data("/wiki/Kiss_from_a_Rose")

# Some songs have up to 7 genres! Limit it to 3!?
# Length can be different aswell, album and single (Think i should go with the longest one)
# Multiple recording days. (Take the earliest)
# More than one label! (Like Wham - Careless whispers...)
# Song writers is often more than one. Some weird ones like Andy Gibb - shadow song:  "Barry, Robin & Maurice Gibb; Andy Gibb"

In [124]:
test1 = "/wiki/The_First_Time_Ever_I_Saw_Your_Face" # 2 different lenghts + label also have a number after
test2 = "/wiki/When_Doves_Cry" # 7 different genres! And 3  dates..
test3 = "/wiki/Shadow_Dancing_(song)" # Songwriters: "Barry, Robin & Maurice Gibb; Andy Gibb"
test4 = "/wiki/Careless_Whisper" # Genre has this [1] etc + multiple labels
test5 = "/wiki/The_Way_We_Were_(song)" # No genre
test6 = "/wiki/The_Sign_(song)" # the genres prints like this: Techno-reggaepopEuropop (Make sure it's splits correctly)

#get_additional_song_data(test1) # Fixed
#get_additional_song_data(test2) # Fixed
#get_additional_song_data(test3) # Fixed
#get_additional_song_data(test4) # Fixed
#get_additional_song_data(test5) # Fixed
#get_additional_song_data(test6) # Fixed

In [3]:
import pandas as pd

def get_all_additional_data_dict(dataframe):
    new_dict = dict()
    
    # Can't use NaN link. Breaks the build
    df = pd.DataFrame(dataframe['Title_url'].fillna(""))
    
    for index, row in df.iterrows():
        data = get_additional_song_data(row['Title_url'])
        new_dict[index] = data[0],data[1],data[2],data[3],data[4],data[5],data[6],data[7],data[8],data[9]

    return new_dict

def convert_additional_data_into_df(song_data):
    data_list = []
    #for index, value in song_data.items():
    for index , data in song_data.items():
        list_song = [data[0],data[1],data[2],data[3],data[4],data[5],data[6],data[7],data[8],data[9]]
        data_list.append(list_song)
    return pd.DataFrame(data_list, columns= ['Released', 'Genre 1' ,'Genre 2', 'Genre 3', 'Length', 'Label', 'Total Labels', 'Writer 1', 'Writer 2', 'Youtube'] )
   

def merge_dataframes(left, right):
    return pd.merge(left , right, left_index=True, right_index=True)


In [44]:
test_bg = df[df['Artist'] == 'Bee Gees']

tbg = get_all_additional_data_dict(test_bg)
convert_additional_data_into_df(tbg)

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1971,Pop,,,3:57,Polydor,2,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=PciJq0qYJj8
1,1970,Baroque pop,Soft rock,,3:45,Polydor,2,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=ipwWbZnpgiI
2,1975,Funk,Disco,,3:44,RSO,1,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=oALKAh_bL5g
3,1976,Disco,,,4:47,RSO,1,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=FgXTygS865M
4,1976,R&b,Soul,Soft rock,4:02,RSO,1,Barry,Robin,
5,1976,R&b,,,3:34,RSO,1,Barry,Robin,
6,1978,Disco,,,3:32,RSO,1,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=SkypZuY6ZvA
7,1977,Disco,,,4:45,RSO,1,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=fNFzfwLM72c
8,1977,Soft rock,,,4:02,RSO,1,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=XpqqjU7u5Yc
9,1978,R&b,Soul,,4:58,RSO,1,Barry Gibb,Robin Gibb,https://www.youtube.com/watch?v=i6iBAuwBODA


In [29]:
test = df[df['Title'] == 'Footloose'].reset_index(drop=True)
test 

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1984,4,Footloose,Kenny Loggins,,/wiki/Footloose_(song)


In [30]:
test2 = df[df['Title'] == 'Karma Chameleon'].reset_index(drop=True)
test2 

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1984,10,Karma Chameleon,Culture Club,,/wiki/Karma_Chameleon


In [31]:
test3 = df[df['Title'] == 'Cum on Feel the Noize'].reset_index(drop=True)
test3 

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1984,68,Cum on Feel the Noize,Quiet Riot,,/wiki/Cum_on_Feel_the_Noize


In [32]:
test4 = df[df['Title'] == "Let's Go Crazy" ].reset_index(drop=True)
test4 

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1984,21,Let's Go Crazy,Prince &,the Revolution,/wiki/Let%27s_Go_Crazy_(song)


In [33]:
t1 = get_all_additional_data_dict(test)
convert_additional_data_into_df(t1)

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1984,Pop rock,,,3:48,Columbia,1,Kenny Loggins,Dean Pitchford,


In [34]:
t2 = get_all_additional_data_dict(test2)
convert_additional_data_into_df(t2)

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1983,Pop,,,4:11,Virgin,1,Boy George,Jon Moss,https://www.youtube.com/watch?v=JmcA9LIIXWw


In [35]:
t3 = get_all_additional_data_dict(test3)
convert_additional_data_into_df(t3)

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1973,Glam rock,Hard rock,,4:24,Polydor,1,Noddy Holder,Jim Lea,


In [36]:
t4 = get_all_additional_data_dict(test4)
convert_additional_data_into_df(t4)

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1984,Funk rock,,,7:35,Warner Bros.,1,Prince,,https://www.youtube.com/watch?v=aXJhDltzYVQ


### Test with all the Beatles song

In [20]:
beatles_test = df[df['Artist'] == 'The Beatles'].reset_index(drop=True)
beatles_test

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1964,1,I Want to Hold Your Hand,The Beatles,,/wiki/I_Want_to_Hold_Your_Hand
1,1964,2,She Loves You,The Beatles,,/wiki/She_Loves_You
2,1964,13,A Hard Day's Night,The Beatles,,/wiki/A_Hard_Day%27s_Night_(song)
3,1964,14,Love Me Do,The Beatles,,/wiki/Love_Me_Do
4,1964,16,Please Please Me,The Beatles,,/wiki/Please_Please_Me_(song)
5,1964,40,Twist and Shout,The Beatles,,/wiki/Twist_and_Shout
6,1964,52,Can't Buy Me Love,The Beatles,,/wiki/Can%27t_Buy_Me_Love
7,1964,55,Do You Want to Know a Secret,The Beatles,,/wiki/Do_You_Want_to_Know_a_Secret
8,1964,95,I Saw Her Standing There,The Beatles,,/wiki/I_Saw_Her_Standing_There
9,1965,7,Help!,The Beatles,,/wiki/Help!_(song)


In [94]:
# Get all the info as a dict
beatles_dict = get_all_additional_data_dict(beatles_test)

# 
df_beatles_additional = convert_additional_data_into_df(beatles_dict)

df_beatles_additional

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1963,Rock and roll,Pop,,2:24,Parlophone,2,Lennon–McCartney,,https://www.youtube.com/watch?v=XT4pwRi2JmY
1,1963,Rock and roll,Pop,,2:18,Parlophone,2,Lennon–McCartney,,
2,1964,Rock,,,2:34,Parlophone,2,Lennon–McCartney,,
3,1962,Merseybeat,Pop,R&b,2:22,Parlophone,2,Lennon–McCartney,,
4,1963,Merseybeat,Rock and roll,,2:00,Parlophone,2,McCartney–Lennon,,
5,1961,Rock and roll,,,2:05,Atlantic,1,Bert Berns,Phil Medley,
6,1964,Rock and roll,Pop rock,R&b,2:11,Parlophone,2,Lennon–McCartney,,
7,1963,Merseybeat,Pop,Doo-wop,1:56,Parlophone,2,Lennon–McCartney,,
8,1963,Rock and roll,,,2:55,Capitol,1,Lennon-McCartney,,https://www.youtube.com/watch?v=oxwAB3SECtc
9,1965,Folk rock,,,2:18,Parlophone,2,Lennon–McCartney,,https://www.youtube.com/watch?v=2Q_ZzBGPdqE


In [377]:
df_beatles = merge_dataframes(beatles_test, df_beatles_additional)  

df_beatles

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1964,1,I Want to Hold Your Hand,The Beatles,,/wiki/I_Want_to_Hold_Your_Hand,1963,Rock and roll,Pop,,2:24,Parlophone,2,Lennon–McCartney,,https://www.youtube.com/watch?v=XT4pwRi2JmY
1,1964,2,She Loves You,The Beatles,,/wiki/She_Loves_You,1963,Rock and roll,Pop,,2:18,Parlophone,2,Lennon–McCartney,,
2,1964,13,A Hard Day's Night,The Beatles,,/wiki/A_Hard_Day%27s_Night_(song),1964,Rock,,,2:34,Parlophone,2,Lennon–McCartney,,
3,1964,14,Love Me Do,The Beatles,,/wiki/Love_Me_Do,1962,Merseybeat,Pop,R&b,2:22,Parlophone,2,Lennon–McCartney,,
4,1964,16,Please Please Me,The Beatles,,/wiki/Please_Please_Me_(song),1963,Merseybeat,Rock and roll,,2:00,Parlophone,2,McCartney–Lennon,,
5,1964,40,Twist and Shout,The Beatles,,/wiki/Twist_and_Shout,1961,Rock and roll,,,2:05,Atlantic,1,Bert Berns,Phil Medley,
6,1964,52,Can't Buy Me Love,The Beatles,,/wiki/Can%27t_Buy_Me_Love,1964,Rock and roll,Pop rock,R&b,2:11,Parlophone,2,Lennon–McCartney,,
7,1964,55,Do You Want to Know a Secret,The Beatles,,/wiki/Do_You_Want_to_Know_a_Secret,1963,Merseybeat,Pop,Doo-wop,1:56,Parlophone,2,Lennon–McCartney,,
8,1964,95,I Saw Her Standing There,The Beatles,,/wiki/I_Saw_Her_Standing_There,1963,Rock and roll,,,2:55,Capitol,1,Lennon-McCartney,,https://www.youtube.com/watch?v=oxwAB3SECtc
9,1965,7,Help!,The Beatles,,/wiki/Help!_(song),1965,Folk rock,,,2:18,Parlophone,2,Lennon–McCartney,,https://www.youtube.com/watch?v=2Q_ZzBGPdqE


## Save (Beatles test)

In [378]:
#df_to_csv = pd.DataFrame.from_dict(billboard_data) 
df_beatles.to_csv ('data/beatles_test.csv', index = False, header=True)

## Load (Beatles test)

In [268]:
import pandas as pd
df_beatles = pd.read_csv('data/beatles_test.csv')
df_beatles

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1964,1,I Want to Hold Your Hand,The Beatles,,/wiki/I_Want_to_Hold_Your_Hand,1963,Rock and roll,Pop,,2:24,Parlophone,2,Lennon–McCartney,,https://www.youtube.com/watch?v=XT4pwRi2JmY
1,1964,2,She Loves You,The Beatles,,/wiki/She_Loves_You,1963,Rock and roll,Pop,,2:18,Parlophone,2,Lennon–McCartney,,
2,1964,13,A Hard Day's Night,The Beatles,,/wiki/A_Hard_Day%27s_Night_(song),1964,Rock,,,2:34,Parlophone,2,Lennon–McCartney,,
3,1964,14,Love Me Do,The Beatles,,/wiki/Love_Me_Do,1962,Merseybeat,Pop,R&b,2:22,Parlophone,2,Lennon–McCartney,,
4,1964,16,Please Please Me,The Beatles,,/wiki/Please_Please_Me_(song),1963,Merseybeat,Rock and roll,,2:00,Parlophone,2,McCartney–Lennon,,
5,1964,40,Twist and Shout,The Beatles,,/wiki/Twist_and_Shout,1961,Rock and roll,,,2:05,Atlantic,1,Bert Berns,Phil Medley,
6,1964,52,Can't Buy Me Love,The Beatles,,/wiki/Can%27t_Buy_Me_Love,1964,Rock and roll,Pop rock,R&b,2:11,Parlophone,2,Lennon–McCartney,,
7,1964,55,Do You Want to Know a Secret,The Beatles,,/wiki/Do_You_Want_to_Know_a_Secret,1963,Merseybeat,Pop,Doo-wop,1:56,Parlophone,2,Lennon–McCartney,,
8,1964,95,I Saw Her Standing There,The Beatles,,/wiki/I_Saw_Her_Standing_There,1963,Rock and roll,,,2:55,Capitol,1,Lennon-McCartney,,https://www.youtube.com/watch?v=oxwAB3SECtc
9,1965,7,Help!,The Beatles,,/wiki/Help!_(song),1965,Folk rock,,,2:18,Parlophone,2,Lennon–McCartney,,https://www.youtube.com/watch?v=2Q_ZzBGPdqE


## Test with 1984

In [457]:
year1984_test = df[df['Year'] == 1984].reset_index(drop=True)
year1984_test

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1984,1,When Doves Cry,Prince,,/wiki/When_Doves_Cry
1,1984,2,What's Love Got to Do with It,Tina Turner,,/wiki/What%27s_Love_Got_to_Do_with_It_(song)
2,1984,3,Say Say Say,Paul McCartney,Michael Jackson,/wiki/Say_Say_Say
3,1984,4,Footloose,Kenny Loggins,,/wiki/Footloose_(song)
4,1984,5,Against All Odds (Take a Look at Me Now),Phil Collins,,/wiki/Against_All_Odds_(Take_a_Look_at_Me_Now)
...,...,...,...,...,...,...
95,1984,96,Major Tom (Coming Home),Peter Schilling,,/wiki/Major_Tom_(Coming_Home)
96,1984,97,Magic,The Cars,,/wiki/Magic_(The_Cars_song)
97,1984,98,When You Close Your Eyes,Night Ranger,,/wiki/When_You_Close_Your_Eyes
98,1984,99,Rock Me Tonite,Billy Squier,,/wiki/Rock_Me_Tonite


In [490]:
# Get all the info as a dict
year1984_dict = get_all_additional_data_dict(year1984_test)

# Convert it to df
df_year1984_additional = convert_additional_data_into_df(year1984_dict)

df_year1984_additional

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1984,Experimental pop,Neo-psychedelia,Soul,5:52,Warner Bros.,1,,,https://www.youtube.com/watch?v=UG3VcCAlUgE
1,1984,R&b,Synth-pop,,3:48,Capitol,1,Terry Britten,Graham Lyle,https://www.youtube.com/watch?v=oGpFcHTxjZs
2,1983,Post-disco,Funk,Pop,5:40,Parlophone,2,Paul McCartney,Michael Jackson,https://www.youtube.com/watch?v=hu7hmBJLpkk
3,1984,Pop rock,,,3:48,Columbia,1,Kenny Loggins,Dean Pitchford,
4,1984,Pop,Soft rock,,3:23,Atlantic,1,,,
...,...,...,...,...,...,...,...,...,...,...
95,1983,Neue deutsche welle,New wave,Synthpop,8:02,Elektra,2,,,
96,1984,Rock,New wave,Pop,3:57,Elektra,1,Ric Ocasek,,https://www.youtube.com/watch?v=E0Kv6vxZwL8
97,1984,Arena rock,Pop metal,,4:08,,,Jack Blades,Alan Fitzgerald,
98,1984,Pop rock,Synth-pop,Hard rock,4:57,Capitol,1,,,https://www.youtube.com/watch?v=PhnEpg9VMY8


In [491]:
df_year1984 = merge_dataframes(year1984_test, df_year1984_additional)  

df_year1984

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1984,1,When Doves Cry,Prince,,/wiki/When_Doves_Cry,1984,Experimental pop,Neo-psychedelia,Soul,5:52,Warner Bros.,1,,,https://www.youtube.com/watch?v=UG3VcCAlUgE
1,1984,2,What's Love Got to Do with It,Tina Turner,,/wiki/What%27s_Love_Got_to_Do_with_It_(song),1984,R&b,Synth-pop,,3:48,Capitol,1,Terry Britten,Graham Lyle,https://www.youtube.com/watch?v=oGpFcHTxjZs
2,1984,3,Say Say Say,Paul McCartney,Michael Jackson,/wiki/Say_Say_Say,1983,Post-disco,Funk,Pop,5:40,Parlophone,2,Paul McCartney,Michael Jackson,https://www.youtube.com/watch?v=hu7hmBJLpkk
3,1984,4,Footloose,Kenny Loggins,,/wiki/Footloose_(song),1984,Pop rock,,,3:48,Columbia,1,Kenny Loggins,Dean Pitchford,
4,1984,5,Against All Odds (Take a Look at Me Now),Phil Collins,,/wiki/Against_All_Odds_(Take_a_Look_at_Me_Now),1984,Pop,Soft rock,,3:23,Atlantic,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1984,96,Major Tom (Coming Home),Peter Schilling,,/wiki/Major_Tom_(Coming_Home),1983,Neue deutsche welle,New wave,Synthpop,8:02,Elektra,2,,,
96,1984,97,Magic,The Cars,,/wiki/Magic_(The_Cars_song),1984,Rock,New wave,Pop,3:57,Elektra,1,Ric Ocasek,,https://www.youtube.com/watch?v=E0Kv6vxZwL8
97,1984,98,When You Close Your Eyes,Night Ranger,,/wiki/When_You_Close_Your_Eyes,1984,Arena rock,Pop metal,,4:08,,,Jack Blades,Alan Fitzgerald,
98,1984,99,Rock Me Tonite,Billy Squier,,/wiki/Rock_Me_Tonite,1984,Pop rock,Synth-pop,Hard rock,4:57,Capitol,1,,,https://www.youtube.com/watch?v=PhnEpg9VMY8


In [492]:
#df_to_csv = pd.DataFrame.from_dict(billboard_data) 
df_year1984.to_csv ('data/year1984_test.csv', index = False, header=True)

## Test with 1962

In [493]:
year1962_test = df[df['Year'] == 1962].reset_index(drop=True)
year1962_test

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1962,1,Stranger on the Shore,Acker Bilk,,/wiki/Stranger_on_the_Shore
1,1962,2,I Can't Stop Loving You,Ray Charles,,/wiki/I_Can%27t_Stop_Loving_You
2,1962,3,Mashed Potato Time,Dee Dee Sharp,,/wiki/Mashed_Potato_Time
3,1962,4,Roses Are Red (My Love),Bobby Vinton,,/wiki/Roses_Are_Red_(My_Love)
4,1962,5,The Stripper,David Rose,,/wiki/The_Stripper
...,...,...,...,...,...,...
95,1962,96,"(Girls, Girls, Girls) Made to Love",Eddie Hodges,,
96,1962,97,Town Without Pity,Gene Pitney,,/wiki/Town_Without_Pity_(song)
97,1962,98,If I Had a Hammer,"Peter, Paul",Mary,/wiki/If_I_Had_a_Hammer
98,1962,99,I Wish That We Were Married,Ronnie & the Hi-Lites,,


In [502]:
# Get all the info as a dict
year1962_dict = get_all_additional_data_dict(year1962_test)

# Convert it to df
df_year1962_additional = convert_additional_data_into_df(year1962_dict)

df_year1962_additional

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1961,Easy listening,Jazz,,2:52,Columbia,2,Acker Bilk,Robert Mellin,
1,1957,Country,,,2:37,RCA Victor,1,,,
2,1962,R&b,Pop,,2:27,Cameo,1,Brian Holland,Daniel Rey,
3,1962,Pop,,,2:38,Epic,1,Paul Evans,,
4,1962,Jazz,,,1:57,MGM,1,David Rose,,
...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,
96,,,,,,,,,,
97,1950,Folk,,,,Hootenanny,1,Pete Seeger,Lee Hays,
98,,,,,,,,,,


In [503]:
df_year1962 = merge_dataframes(year1962_test, df_year1962_additional)  

df_year1962

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1962,1,Stranger on the Shore,Acker Bilk,,/wiki/Stranger_on_the_Shore,1961,Easy listening,Jazz,,2:52,Columbia,2,Acker Bilk,Robert Mellin,
1,1962,2,I Can't Stop Loving You,Ray Charles,,/wiki/I_Can%27t_Stop_Loving_You,1957,Country,,,2:37,RCA Victor,1,,,
2,1962,3,Mashed Potato Time,Dee Dee Sharp,,/wiki/Mashed_Potato_Time,1962,R&b,Pop,,2:27,Cameo,1,Brian Holland,Daniel Rey,
3,1962,4,Roses Are Red (My Love),Bobby Vinton,,/wiki/Roses_Are_Red_(My_Love),1962,Pop,,,2:38,Epic,1,Paul Evans,,
4,1962,5,The Stripper,David Rose,,/wiki/The_Stripper,1962,Jazz,,,1:57,MGM,1,David Rose,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1962,96,"(Girls, Girls, Girls) Made to Love",Eddie Hodges,,,,,,,,,,,,
96,1962,97,Town Without Pity,Gene Pitney,,/wiki/Town_Without_Pity_(song),,,,,,,,,,
97,1962,98,If I Had a Hammer,"Peter, Paul",Mary,/wiki/If_I_Had_a_Hammer,1950,Folk,,,,Hootenanny,1,Pete Seeger,Lee Hays,
98,1962,99,I Wish That We Were Married,Ronnie & the Hi-Lites,,,,,,,,,,,,


In [504]:
#df_to_csv = pd.DataFrame.from_dict(billboard_data) 
df_year1962.to_csv ('data/year1962_test.csv', index = False, header=True)

## Test with 1995

In [45]:
year1995_test = df[df['Year'] == 1995].reset_index(drop=True)
year1995_test

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url
0,1995,1,Gangsta's Paradise,Coolio,L.V.,/wiki/Gangsta%27s_Paradise
1,1995,2,Waterfalls,TLC,,/wiki/Waterfalls_(TLC_song)
2,1995,3,Creep,TLC,,/wiki/Creep_(TLC_song)
3,1995,4,Kiss from a Rose,Seal,,/wiki/Kiss_from_a_Rose
4,1995,5,On Bended Knee,Boyz II Men,,/wiki/On_Bended_Knee
...,...,...,...,...,...,...
95,1995,96,I Miss You,N II U,,/wiki/I_Miss_You_(N_II_U_song)
96,1995,97,Give It 2 You,Da Brat,,/wiki/Give_It_2_You
97,1995,98,Best Friend,Brandy,,/wiki/Best_Friend_(Brandy_Norwood_song)
98,1995,99,Misery,Soul Asylum,,/wiki/Misery_(Soul_Asylum_song)


In [90]:
# Get all the info as a dict
year1995_dict = get_all_additional_data_dict(year1995_test)

# Convert it to df
df_year1995_additional = convert_additional_data_into_df(year1995_dict)

df_year1995_additional

['Marqueze Etheridge', 'Lisa Lopes', 'Organized Noize']
['Dallas Austin']
['Henry Samuel']
['Jimmy Jam and Terry Lewis']
['Juergen Wind (J. Wind)', 'Frank "Quickmix" Hassas', 'Olaf Jeglitza']
['Mariah Carey', 'Dave Hall', 'Adrian Belew', 'Chris Frantz', 'Steven Stanley', 'Tina Weymouth']
['Madonna', 'Kenneth Edmonds']
['Dallas Austin', 'Monica', 'Willie James Baker', 'Derrick Simmons', 'James Todd Smith', 'Carlton Ridenhour', 'Quincy Jones III', 'Abrim Tilmon, Jr.', 'Hank Shocklee', 'Eric Sadler', 'George Clinton', 'James Brown']
['Montell Jordan', 'Oji Pierce', 'Ricky Walters']
['Milton Davis', 'William DuVall']
['Babyface']
['Eugene Hanes', 'Marc Valentine', 'Loren Hill', 'William "Bootsy" Collins', 'George Clinton, Jr.']
['John Popper']
['Jennifer Kimball']
['Bryan Adams', 'Michael Kamen', 'Robert John "Mutt" Lange']
['Jon Bon Jovi']
['Orville Burrell', 'Robert Livingston', 'King Floyd III']
['Jim Steinman']
["Des'ree", 'Ashley Ingram']
['Robert Kelly']
['Mark Bryan', 'Dean Felber',

Unnamed: 0,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1995,Hip hop,Gangsta rap,G-funk,4:04,Tommy Boy,3,,,https://www.youtube.com/watch?v=fPO76Jlnz6c
1,1995,R&b,,,4:40,LaFace,2,Marqueze Etheridge,Lisa Lopes,https://www.youtube.com/watch?v=8WEtxJ4-sh4
2,1994,R&b,,,4:29,LaFace,2,Dallas Austin,,https://www.youtube.com/watch?v=LlZydtG3xqI
3,1994,Rock,Soul,,4:47,ZTT,3,Henry Samuel,,https://www.youtube.com/watch?v=lkzXi4cmdJk
4,1994,R&b,,,5:29,Motown,1,Jimmy Jam and Terry Lewis,,https://www.youtube.com/watch?v=jSUSFow70no
...,...,...,...,...,...,...,...,...,...,...
95,1994,R&b,,,4:00,Arista,1,Vincent Herbert,Chuckie Howard,https://www.youtube.com/watch?v=9BPmlcOQtpg
96,1994,G-funk,,,3:13,So So Def Recordings,1,Jermaine Dupri,Shawntae Harris,
97,1995,R&b,Pop,,4:48,Atlantic,1,Keith Crouch,Glenn McKinney,
98,1995,Alternative rock,,,4:24,Columbia,1,Dave Pirner,,


In [91]:
df_year1995 = merge_dataframes(year1995_test, df_year1995_additional)  

df_year1995

Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,1995,1,Gangsta's Paradise,Coolio,L.V.,/wiki/Gangsta%27s_Paradise,1995,Hip hop,Gangsta rap,G-funk,4:04,Tommy Boy,3,,,https://www.youtube.com/watch?v=fPO76Jlnz6c
1,1995,2,Waterfalls,TLC,,/wiki/Waterfalls_(TLC_song),1995,R&b,,,4:40,LaFace,2,Marqueze Etheridge,Lisa Lopes,https://www.youtube.com/watch?v=8WEtxJ4-sh4
2,1995,3,Creep,TLC,,/wiki/Creep_(TLC_song),1994,R&b,,,4:29,LaFace,2,Dallas Austin,,https://www.youtube.com/watch?v=LlZydtG3xqI
3,1995,4,Kiss from a Rose,Seal,,/wiki/Kiss_from_a_Rose,1994,Rock,Soul,,4:47,ZTT,3,Henry Samuel,,https://www.youtube.com/watch?v=lkzXi4cmdJk
4,1995,5,On Bended Knee,Boyz II Men,,/wiki/On_Bended_Knee,1994,R&b,,,5:29,Motown,1,Jimmy Jam and Terry Lewis,,https://www.youtube.com/watch?v=jSUSFow70no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1995,96,I Miss You,N II U,,/wiki/I_Miss_You_(N_II_U_song),1994,R&b,,,4:00,Arista,1,Vincent Herbert,Chuckie Howard,https://www.youtube.com/watch?v=9BPmlcOQtpg
96,1995,97,Give It 2 You,Da Brat,,/wiki/Give_It_2_You,1994,G-funk,,,3:13,So So Def Recordings,1,Jermaine Dupri,Shawntae Harris,
97,1995,98,Best Friend,Brandy,,/wiki/Best_Friend_(Brandy_Norwood_song),1995,R&b,Pop,,4:48,Atlantic,1,Keith Crouch,Glenn McKinney,
98,1995,99,Misery,Soul Asylum,,/wiki/Misery_(Soul_Asylum_song),1995,Alternative rock,,,4:24,Columbia,1,Dave Pirner,,


In [92]:
#df_to_csv = pd.DataFrame.from_dict(billboard_data) 
df_year1995.to_csv ('data/year1995_test.csv', index = False, header=True)

## Test 2009

In [95]:
year2009_test = df[df['Year'] == 2009].reset_index(drop=True)

# Get all the info as a dict
year2009_dict = get_all_additional_data_dict(year2009_test)

# Convert it to df
df_year2009_additional = convert_additional_data_into_df(year2009_dict)


df_year2009 = merge_dataframes(year2009_test, df_year2009_additional)  

df_year2009

First here
First here
First here
First here


Unnamed: 0,Year,Place,Title,Artist,Featuring,Title_url,Released,Genre 1,Genre 2,Genre 3,Length,Label,Total Labels,Writer 1,Writer 2,Youtube
0,2009,1,Boom Boom Pow,The Black Eyed Peas,,/wiki/Boom_Boom_Pow,2009,,,,5:08,Interscope,1,William Adams,Allan Pineda,https://www.youtube.com/watch?v=4m48GqaOz90
1,2009,2,Poker Face,Lady Gaga,,/wiki/Poker_Face_(song),2008,Synth-pop,Dance-pop,,3:58,KonLive,3,Stefani Germanotta,Nadir Khayat,https://www.youtube.com/watch?v=bESGLojNYSo
2,2009,3,Just Dance,Lady Gaga,Colby O'Donis,/wiki/Just_Dance_(song),2008,Electropop,Synth-pop,Dance-pop,4:01,Kon Live,3,Stefani Germanotta,"Nadir ""RedOne"" Khayat",https://www.youtube.com/watch?v=2Abk1jAONjw
3,2009,4,I Gotta Feeling,The Black Eyed Peas,,/wiki/I_Gotta_Feeling,2009,Dance-pop,,,4:49,Interscope,1,William Adams,Stacy Ferguson,https://www.youtube.com/watch?v=uSD4vsh1zDA
4,2009,5,Love Story,Taylor Swift,,/wiki/Love_Story_(Taylor_Swift_song),2008,Country pop,,,3:57,Big Machine,1,Taylor Swift,,https://www.youtube.com/watch?v=8xg3vE8Ie_E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2009,96,Goodbye,Kristinia DeBarge,,/wiki/Goodbye_(Kristinia_DeBarge_song),2009,Dance-pop,Electropop,,3:28,Island,1,Paul Leka,Adonis Shropshire,
96,2009,97,Say Hey (I Love You),Michael Franti & Spearhead,Cherine Anderson,/wiki/Say_Hey_(I_Love_You),2008,Dancehall,Reggae fusion,Hip hop,3:55,ANTI-,1,Michael Franti,Carl Young,
97,2009,98,Pop Champagne,Jim Jones,Ron Browz,/wiki/Pop_Champagne,2008,Hip hop,,,3:35,Columbia,3,Joseph Jones,Rondell Turner,https://www.youtube.com/watch?v=KrG4yKAVQuM
98,2009,99,Pretty Wings,Maxwell,,/wiki/Pretty_Wings,2009,R&b,Neo soul,,5:09,Columbia,1,Maxwell,Hod David,


In [97]:
#df_to_csv = pd.DataFrame.from_dict(billboard_data) 
df_year2009.to_csv ('data/year2009_test.csv', index = False, header=True)

## Web scrape all the lyrics

## Textblob (For the lyrics)
- https://www.youtube.com/watch?v=ea4IadDRwuc&list=PL_92WMXSLe_-RkWW5zAQZ-gMdVqZ7T-_F&index=3
- https://textblob.readthedocs.io/en/dev/quickstart.html

In [150]:
# Research more