# In [None]:
import bs4
import requests
import re


def get_additional_song_data(link):
    result_dict = dict()
    
    
    #print("==================")
    #print(link)
    #print("==================")

    
    # All the data we want to extract (Will convert it into an Object!)
    released = ""
    genres = [""]
    length = ""
    label = ""
    total_labels = ""
    writer = [""]
    youtube = ""
    
    if(link != ''):
    
        url = "https://en.wikipedia.org"+link

        r = requests.get(url)
        r.raise_for_status()
        soup = bs4.BeautifulSoup(r.text, 'html.parser')

        # Some songs have more than one infobox. We only want the first.
        infobox = soup.select('.infobox')
        table = infobox[0].select('tbody > tr')


        for el in table:


            if(el.find('th')):
                th = el.select('th')
                td = el.select('td')
                if(len(td) > 0):
                    #print(th[0].text, td[0].text)
                    match th[0].text:
                        case "Released":
                            released =  extract_released_year(td[0])
                            #print("Released", extract_released_year(td[0]))
                        case "Genre":
                            genres = extract_genre(td[0])
                            #print("Genre:", extract_genre(td[0]))
                        case "Length":
                            length = extract_length(td[0])
                            #print("Length", extract_length(td[0]))
                        case "Label":
                            label_data = extract_label(td[0])
                            label = label_data[0]
                            total_labels = label_data[1]
                            #print("Label", extract_label(td[0]))
                        case "Songwriter(s)":
                            writer = extract_songwriter(td[0])
                            #print("Songwriter(s)", extract_songwriter(td[0]))

            # Search for YT link
            if(el.find('a', {'title': 'YouTube'})):
                youtube = el.find('a', {'class': 'external'}).get('href')
                #print(el.find('a', {'class': 'external'}).get('href'))
    
    
    # Make sure the lists have the correct size. NaN will be handle when converted to df.
    if(len(genres) < 3 ):
        genres.append('')
        genres.append('')
    if(len(writer) < 2):
        writer.append('')
                
    return [released, genres[0], genres[1], genres[2], length, label, total_labels, writer[0], writer[1], youtube]
    
    

def extract_released_year(element):
    """Return the earliest four-digit year found in a "Released" cell.

    The cell text varies: it may list several release dates (including later
    re-releases), or only a month and year, or only a year. Every run of four
    digits is collected and the smallest one is returned so that a later
    re-release never wins over the original release.

    Args:
        element: Object exposing the cell's text through a ``.text``
            attribute.

    Returns:
        The smallest four-digit number in the text, as a string, or ``""``
        when the text contains no four-digit number.
    """
    years = re.findall(r"\d{4}", element.text)

    # Every candidate is exactly four digits wide, so string ordering matches
    # numeric ordering; ``default`` avoids a ValueError on an empty list.
    return min(years, default="")
    
                        
def extract_genre(element):
    """Return up to three genre names from a "Genre" cell.

    Handled layouts:
    1. Genres given as links: the text of each ``a`` element is used.
    2. Genres in an unordered list without links: the text of each ``li``.
    3. Plain text such as ``"Pop[1], Rock[2]"``: split on commas, semicolons
       and line breaks, so multi-word genres like "Pop rock" stay intact.

    Bracketed citation markers (``[1]``, ``[a]``) are stripped, and entries
    without any ASCII letter are discarded.

    Args:
        element: Object exposing ``select(css_selector)`` and ``.text``.

    Returns:
        A list of at most three genre strings, each passed through
        ``str.capitalize()``. The list is empty when nothing usable is found.
    """
    citation_reg = re.compile(r"\[[^\]]*\]")
    letters_reg = re.compile(r"[a-zA-Z]")

    anchors = element.select('a')
    if anchors:
        candidates = [anchor.text for anchor in anchors]
    else:
        items = element.select('li')
        if items:
            candidates = [item.text for item in items]
        else:
            candidates = re.split(r"[,;\n]", element.text)

    genres = []
    for candidate in candidates:
        name = citation_reg.sub("", candidate).strip()
        if letters_reg.search(name):
            genres.append(name.capitalize())

    return genres[:3]
    
    
def extract_length(element):
    """Return the longest duration mentioned in a "Length" cell.

    Some songs list several versions (single, album, ...); the longest one is
    returned. Durations are recognised as ``<minutes>:<two-digit seconds>``
    anywhere in the cell text and compared numerically, so "10:05" correctly
    beats "3:45" (a plain string comparison would get this wrong).

    Args:
        element: Object exposing the cell's text through a ``.text``
            attribute.

    Returns:
        The longest duration as ``"m:ss"`` exactly as written in the text, or
        ``""`` when the text contains no duration.
    """
    durations = re.findall(r"(\d+):(\d{2})", element.text)
    if not durations:
        return ""

    # Compare as (minutes, seconds) integers rather than as strings.
    minutes, seconds = max(durations, key=lambda d: (int(d[0]), int(d[1])))
    return f"{minutes}:{seconds}"
    
def extract_label(element):
    """Return the first label of a "Label" cell and the number of labels.

    Songs sometimes change label, so a cell can list several. For simplicity
    only the first one mentioned is returned, together with how many distinct
    entries the cell contains.

    Label names are taken from ``a`` elements when present, otherwise from
    ``li`` elements, otherwise from the plain text split on commas,
    semicolons and line breaks (not on spaces, so "Warner Bros." stays one
    label). Bracketed citation markers such as ``[1]`` are removed and do not
    count as labels.

    Args:
        element: Object exposing ``select(css_selector)`` and ``.text``.

    Returns:
        A two-item list ``[first_label, label_count]``; ``["", 0]`` when the
        cell contains no label.
    """
    anchors = element.select('a')
    if anchors:
        candidates = [anchor.text for anchor in anchors]
    else:
        items = element.select('li')
        if items:
            candidates = [item.text for item in items]
        else:
            candidates = re.split(r"[,;\n]", element.text)

    labels = []
    for candidate in candidates:
        name = re.sub(r"\[[^\]]*\]", "", candidate).strip()
        if name:
            labels.append(name)

    # Return the original label and the number of different labels.
    return [labels[0] if labels else "", len(labels)]

def extract_songwriter(element):
    """Return the first two songwriters of a "Songwriter(s)" cell.

    Writer names are taken from ``li`` elements when present, otherwise from
    ``a`` elements, otherwise from the plain text split on commas,
    semicolons, ampersands, line breaks and the word "and". Bracketed
    citation markers such as ``[1]`` are removed, and entries that are empty
    afterwards are dropped.

    Args:
        element: Object exposing ``select(css_selector)`` and ``.text``.

    Returns:
        A list of exactly two strings. Missing writers are represented by
        ``""`` so callers can always index positions 0 and 1.
    """
    items = element.select('li')
    if items:
        # select() returns a list of tags; the text lives on each tag, not on
        # the list itself.
        candidates = [item.text for item in items]
    else:
        anchors = element.select('a')
        if anchors:
            candidates = [anchor.text for anchor in anchors]
        else:
            candidates = re.split(r"[,;&\n]|\band\b", element.text)

    writers = []
    for candidate in candidates:
        name = re.sub(r"\[[^\]]*\]", "", candidate).strip()
        if name:
            writers.append(name)

    # Pad with blanks so there are always two entries, then keep the first two.
    return (writers + ["", ""])[:2]
    


#get_additional_song_data("/wiki/When_Doves_Cry")

# Some songs have up to 7 genres! Limit it to 3!?
# Length can differ as well between album and single versions. (I think I should go with the longest one.)
# Multiple recording days. (Take the earliest.)
# More than one label! (Like Wham - Careless whispers...)
# There is often more than one songwriter. Some weird ones, like Andy Gibb - shadow song: "Barry, Robin & Maurice Gibb; Andy Gibb"