In [1]:
import requests
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bs4 import BeautifulSoup
import re
import time

# Billboard Dataset

In [2]:
def topartists(year):
    """Retrieve the top artists for a specific year according to Billboard
    charts.

    This function sends a GET request to the Billboard year-end "top artists"
    chart page for the specified year, parses the returned HTML and collects
    the stripped text of the ``li > h3`` element found inside each
    ``div.o-chart-results-list-row-container`` row.

    Parameters
    ----------
    year : int
        The year for which you want to retrieve the top artists.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with a single ``Artist`` column and a 1-based integer
        index that follows the order of the rows on the page. When no chart
        rows are found the DataFrame is empty but still has the ``Artist``
        column.
    """
    response = requests.get(
        f'https://www.billboard.com/charts/year-end/{year}/top-artists/'
    )
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments.
    soup = BeautifulSoup(response.text, 'html.parser')

    artists = []
    for row in soup.select('div.o-chart-results-list-row-container'):
        heading = row.select_one('li > h3')
        # A row without the expected heading would otherwise raise
        # AttributeError on `.text`; skip it instead.
        if heading is None:
            continue
        artists.append({'Artist': heading.text.strip()})

    # Passing `columns` keeps the 'Artist' column present even when the page
    # yielded no rows.
    df_artists = pd.DataFrame(artists, columns=['Artist'])
    df_artists.index = np.arange(1, len(df_artists) + 1)

    return df_artists

def alltopartists(start_year=2006, end_year=2022):
    """Retrieve the top artists for a range of years according to Billboard
    charts.

    This function calls `topartists` once for every year from `start_year`
    to `end_year` (both inclusive), combines the yearly results into a single
    pandas DataFrame and removes duplicate rows, keeping the first
    occurrence of each.

    Parameters
    ----------
    start_year : int, optional
        First year to retrieve. Default is 2006.
    end_year : int, optional
        Last year to retrieve (inclusive). Default is 2022.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the top artists for the requested years
        without duplicates, with a 1-based integer index. An empty DataFrame
        with an ``Artist`` column is returned when the year range is empty.
    """
    # Collect the yearly frames first and concatenate once; concatenating
    # inside the loop re-copies all previously collected rows on every
    # iteration.
    yearly_frames = [
        topartists(year) for year in range(start_year, end_year + 1)
    ]
    if not yearly_frames:
        # pd.concat raises on an empty list, so handle the empty range here.
        return pd.DataFrame(columns=['Artist'])

    df_all = pd.concat(yearly_frames).drop_duplicates(keep='first',
                                                      ignore_index=True)
    df_all.index = np.arange(1, len(df_all) + 1)

    return df_all




Note: Results may differ from the original run because the Billboard website changes over time. Refer to `artists.pkl` for the originally scraped list of artists used in this analysis.

In [3]:
# Scrape the year-end charts for every year covered by alltopartists()
# (one HTTP request per year) and display the de-duplicated result.
artists_df = alltopartists()
artists_df

Unnamed: 0,Artist
1,Chris Brown
2,Nickelback
3,Rascal Flatts
4,Sean Paul
5,Ne-Yo
...,...
502,Brent Faiyaz
503,Jessica Darrow
504,Chencho Corleone
505,CKay


# Wikipedia Dataset

We'll use the original artists dataset here, which was scraped on Nov. 16, 2023.

In [5]:
# Load the previously scraped artist list from a local pickle file.
# pd.read_pickle can execute arbitrary code while unpickling, so only load
# files that come from a trusted source.
artists_df_old = pd.read_pickle('artists.pkl')
artists_df_old

Unnamed: 0,Name
0,2 Chainz
1,21 Savage
2,24kGoldn
3,3OH!3
4,5 Seconds Of Summer
...,...
500,blackbear
501,fun.
502,gnash
503,twenty one pilots


In [6]:
def scrape_wikipedia_sidebar(url,
                             keywords=["band", "musician", "singer",
                                       "rapper", "artist"],
                             max_depth=5
                            ):
    """Scrape information from the sidebar of a Wikipedia page or find relevant
    links.

    This function makes an HTTP request to the page at `url` and looks for
    the first ``<table class="infobox">``. If one is found, every row that has
    both a header (``th``) and a data (``td``) cell is stored in a dictionary
    as ``header text -> data text``.

    If no infobox is found and the page contains a ``div`` with id
    ``setindexbox`` (list of notable people) or ``disambigbox``
    (disambiguation page), the function scans every link on the page, in
    document order, and follows the first one whose lower-cased text contains
    one of `keywords`, scraping that page instead.

    Parameters
    ----------
    url : str
        The URL of the Wikipedia page to scrape.
    keywords : list of str, optional
        Keywords used to identify relevant links when searching for people.
        Default is ["band", "musician", "singer", "rapper", "artist"]. The
        same keywords are used on every page that is followed.
    max_depth : int, optional
        Maximum number of links that may be followed from `url` before giving
        up. Guards against pages that link to each other in a cycle.
        Default is 5.

    Returns
    -------
    dict or None
        If a sidebar is found on the Wikipedia page (or on a followed page), a
        dictionary containing the extracted data is returned. If no relevant
        information is found, the depth limit is reached, or the request is
        unsuccessful, None is returned.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    # Make an HTTP request to the Wikipedia page
    response = requests.get(url)

    # Anything other than status code 200 is reported and treated as a miss
    if response.status_code != 200:
        print(
            "Error: Unable to retrieve the Wikipedia page (Status code "
            f"{response.status_code})"
        )
        print(url)
        return None

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the sidebar element (class="infobox")
    sidebar = soup.find("table", {"class": "infobox"})

    if sidebar:
        print("Sidebar found. Extracting information...")

        sidebar_data = {}
        for row in sidebar.find_all("tr"):
            header_cell = row.find("th")
            data_cell = row.find("td")

            # Rows missing either cell (e.g. images, section titles) are
            # skipped
            if header_cell and data_cell:
                header = header_cell.get_text(strip=True)
                data = " ".join(data_cell.stripped_strings)
                # A repeated header overwrites the earlier value
                sidebar_data[header] = data

        return sidebar_data

    # No infobox: decide whether this is a page worth following links from
    if soup.find("div", {"id": "setindexbox"}):
        print("Notable people section found. Finding relevant link...")
        page_kind = "notable people section"
    elif soup.find("div", {"id": "disambigbox"}):
        print("Disambiguation page detected. Finding relevant link...")
        page_kind = "disambiguation page"
    else:
        print(
            """The page does not seem to be a disambiguation page
                    or have a sidebar or notable people section."""
        )
        print(url)
        return None

    # Stop instead of recursing forever when pages keep pointing at pages
    # without an infobox
    if max_depth <= 0:
        print("Maximum link-following depth reached.")
        print(url)
        return None

    # Follow the first link (anywhere on the page) whose text contains a
    # keyword
    for link in soup.find_all("a", href=True):
        link_text = link.text.lower()
        if any(keyword in link_text for keyword in keywords):
            # urljoin handles site-relative hrefs like "/wiki/X" as well as
            # absolute and protocol-relative hrefs
            new_url = urljoin("https://en.wikipedia.org", link["href"])
            print(f"Found relevant link: {new_url}")
            # Pass the caller's keywords along so a custom list is honoured
            # on followed pages too
            return scrape_wikipedia_sidebar(new_url, keywords, max_depth - 1)

    print(f"No relevant link found in the {page_kind}.")
    print(url)
    return None


def get_wikipedia_url(title):
    """
    Look up the URL of a Wikipedia page from its title.

    Sends an ``action=query`` / ``prop=info`` / ``inprop=url`` request to the
    English Wikipedia API and reads the first entry of the returned
    ``query.pages`` mapping.

    Parameters
    ----------
    title : str
        The title of the Wikipedia page for which the URL is to be retrieved.

    Returns
    -------
    str
        The ``fullurl`` value reported by the API for the page, or the
        message ``"Page with title '<title>' not found."`` when the API
        reports the page id ``"-1"``.
    """
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "info",
            "inprop": "url",
        },
    )
    pages = response.json()["query"]["pages"]

    # Only the first page entry of the response is inspected
    page_id = list(pages)[0]

    # The API uses the id "-1" for a title it cannot resolve
    if page_id == "-1":
        return f"Page with title '{title}' not found."

    return pages[page_id]["fullurl"]

In [7]:
# Resolve every artist name to a Wikipedia URL through get_wikipedia_url.
# For a title the API cannot find, get_wikipedia_url returns a
# "Page with title ... not found." message instead of a URL, so `urls` always
# has one entry per name.
names = artists_df_old['Name'].to_list()
urls = []
for name in names:
    url = get_wikipedia_url(name)
    urls.append(url)
    # Wait half a second between requests (presumably to go easy on the API)
    time.sleep(.5)
len(urls)

505

In [8]:
# Hand-maintained replacements for looked-up URLs, applied in place to `urls`.
# Presumably the plain title does not lead to the musician's own article for
# these entries; each key is swapped for a more specific (or disambiguation)
# page.
url_overrides = {
    "https://en.wikipedia.org/wiki/Bastille":
        "https://en.wikipedia.org/wiki/Bastille_(disambiguation)",
    "https://en.wikipedia.org/wiki/Eagles":
        "https://en.wikipedia.org/wiki/Eagles_(disambiguation)",
    "https://en.wikipedia.org/wiki/CJ":
        "https://en.wikipedia.org/wiki/CJ_(rapper)",
    'https://en.wikipedia.org/wiki/Cassie':
        'https://en.wikipedia.org/wiki/Cassie_Ventura',
    'https://en.wikipedia.org/wiki/Future':
        'https://en.wikipedia.org/wiki/Future_(rapper)',
    'https://en.wikipedia.org/wiki/Muse':
        'https://en.wikipedia.org/wiki/Muse_(band)',
    'https://en.wikipedia.org/wiki/Mustard':
        'https://en.wikipedia.org/wiki/Mustard_(record_producer)',
    'https://en.wikipedia.org/wiki/Logic':
        'https://en.wikipedia.org/wiki/Logic_(rapper)',
    'https://en.wikipedia.org/wiki/Nirvana':
        'https://en.wikipedia.org/wiki/Nirvana_(band)',
    'https://en.wikipedia.org/wiki/Passenger':
        'https://en.wikipedia.org/wiki/Passenger_(singer)',
    'https://en.wikipedia.org/wiki/Prince':
        'https://en.wikipedia.org/wiki/Prince_(musician)',
    'https://en.wikipedia.org/wiki/Train':
        'https://en.wikipedia.org/wiki/Train_(band)',
    'https://en.wikipedia.org/wiki/Zayn':
        'https://en.wikipedia.org/wiki/Zayn_Malik',
    'https://en.wikipedia.org/wiki/Pitbull':
        'https://en.wikipedia.org/wiki/Pitbull_(rapper)',
    'https://en.wikipedia.org/wiki/Bow_Wow':
        'https://en.wikipedia.org/wiki/Bow_Wow_(rapper)',
}

# Each entry is replaced at most once; URLs without an override are untouched.
for i, current_url in enumerate(urls):
    if current_url in url_overrides:
        urls[i] = url_overrides[current_url]

In [9]:
# Scrape the infobox of every URL. Two parallel lists are built:
#   scraped_sidebars - the raw dict returned by scrape_wikipedia_sidebar
#                      (None when nothing could be scraped)
#   sidebars_content - a one-element list holding all infobox values joined
#                      by spaces
# NOTE(review): the query cell further down stores Sidebar_Content as a plain
# string rather than a one-element list - confirm which form downstream code
# expects before changing either.
scraped_sidebars = []
sidebars_content = []
for url in urls:
    sidebar = scrape_wikipedia_sidebar(url)
    scraped_sidebars.append(sidebar)
    # scrape_wikipedia_sidebar returns None on a failed request or when no
    # infobox is found; use empty text then, so the loop does not crash on
    # `None.values()` and both lists stay aligned with `urls`.
    sidebar_text = ' '.join(sidebar.values()) if sidebar else ''
    sidebars_content.append([sidebar_text])
    print(sidebar)
    # Wait half a second between page requests
    time.sleep(.5)

Sidebar found. Extracting information...
{'Born': 'Tauheed K. Epps ( 1977-09-12 ) September 12, 1977 (age\xa046) College Park, Georgia , U.S.', 'Other\xa0names': 'Tity Boi Drenchgod', 'Alma\xa0mater': 'Virginia State University ( BS )', 'Occupations': 'Rapper songwriter', 'Years\xa0active': '1997–present', 'Spouse': 'Kesha Ward \u200b ( m. 2018) \u200b', 'Children': '3', 'Awards': 'Full list', 'Genres': 'Hip hop dirty south trap [1]', 'Instrument(s)': 'Vocals', 'Labels': 'Gamebread T.R.U. Def Jam [2]', 'Formerly of': 'Playaz Circle DTP', 'Website': '2chainz .com'}
Sidebar found. Extracting information...
{'Birth name': 'Shéyaa Bin Abraham-Joseph', 'Born': '( 1992-10-22 ) October 22, 1992 (age\xa031) Plaistow , London , England', 'Origin': 'Atlanta, Georgia , U.S.', 'Genres': 'Hip hop trap horrorcore', 'Occupation(s)': 'Rapper songwriter record producer', 'Years active': '2013–present', 'Labels': 'Epic The Orchard Slaughter Gang', 'Children': '3', 'Website': 'shop .21savage .com'}
Sideb

In [10]:
# Print the length of each parallel list; the four numbers should match
# because the lists become the columns of one DataFrame.
for collection in (urls, names, scraped_sidebars, sidebars_content):
    print(len(collection))

505
505
505
505


In [11]:
# Combine the four parallel lists into one table, one row per artist.
data = dict(
    URL=urls,
    Name=names,
    Scraped_Sidebar=scraped_sidebars,
    Sidebar_Content=sidebars_content,
)
sidebars_df = pd.DataFrame(data)

sidebars_df

Unnamed: 0,URL,Name,Scraped_Sidebar,Sidebar_Content
0,https://en.wikipedia.org/wiki/2_Chainz,2 Chainz,{'Born': 'Tauheed K. Epps ( 1977-09-12 ) Septe...,"[Tauheed K. Epps ( 1977-09-12 ) September 12, ..."
1,https://en.wikipedia.org/wiki/21_Savage,21 Savage,"{'Birth name': 'Shéyaa Bin Abraham-Joseph', 'B...",[Shéyaa Bin Abraham-Joseph ( 1992-10-22 ) Octo...
2,https://en.wikipedia.org/wiki/24kGoldn,24kGoldn,"{'Birth name': 'Golden Landis Von Jones', 'Als...",[Golden Landis Von Jones Goldn El Dorado ( 200...
3,https://en.wikipedia.org/wiki/3OH!3,3OH!3,"{'Origin': 'Boulder, Colorado , U.S.', 'Genres...","[Boulder, Colorado , U.S. Electropop synth-pop..."
4,https://en.wikipedia.org/wiki/5_Seconds_Of_Summer,5 Seconds Of Summer,"{'Also known as': '5SOS', 'Origin': 'Sydney , ...","[5SOS Sydney , New South Wales , Australia Pop..."
...,...,...,...,...
500,https://en.wikipedia.org/wiki/Blackbear,blackbear,"{'Origin': 'Canada', 'Genres': 'Northern conte...",[Canada Northern contemporary 2000–present Tri...
501,https://en.wikipedia.org/wiki/Fun.,fun.,"{'Origin': 'New York City, U.S.', 'Genres': 'B...","[New York City, U.S. Baroque pop [1] alternati..."
502,https://en.wikipedia.org/wiki/Gnash,gnash,"{'Birth name': 'Garrett Charles Nash', 'Born':...","[Garrett Charles Nash ( 1993-06-16 ) June 16, ..."
503,https://en.wikipedia.org/wiki/Twenty_one_pilots,twenty one pilots,"{'Origin': 'Columbus, Ohio , U.S.', 'Genres': ...","[Columbus, Ohio , U.S. Alternative rock altern..."


In [12]:
# Save the scraped table to disk (presumably for a later processing step).
sidebars_df.to_pickle('raw_data_from_preproc.pkl')

# Query Data

In [13]:
# Build a one-row table for the query artist: the page URL, the scraped
# infobox dict, and all infobox values joined into a single string.
query_url = 'https://en.wikipedia.org/wiki/Laufey_(singer)'
query_sidebar = scrape_wikipedia_sidebar(query_url)
query_data = [[
    query_url,
    query_sidebar,
    ' '.join(query_sidebar.values()),
]]
query_df = pd.DataFrame(
    data=query_data,
    columns=['URL', 'Scraped_Sidebar', 'Sidebar_Content'],
    index=['Laufey'],
)
query_df

Sidebar found. Extracting information...


Unnamed: 0,URL,Scraped_Sidebar,Sidebar_Content
Laufey,https://en.wikipedia.org/wiki/Laufey_(singer),{'Birth name': 'Laufey Lín Bīng Jónsdóttir / 林...,Laufey Lín Bīng Jónsdóttir / 林冰 [1] ( 1999-04-...


In [14]:
# Save the query row to disk (presumably for a later processing step).
query_df.to_pickle('query_df.pkl')