# 0. Setting up environment

In [1]:
# importing required libraries

import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
from pandas.io.json import json_normalize
import random
import re
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets
import geopy.geocoders
from geopy.geocoders import Nominatim
geopy.geocoders.options.default_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
import time


In [2]:
def extract_genres():
    url = 'https://www.discogs.com/search/?sort=want%2Cdesc&type=release'
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'lxml')
    tag = 'a'
    genres_link = [element.get('href') for element in soup.find_all(tag, attrs={'href': re.compile("genre_exact=")})]
    genres_list = [str(re.findall('&genre_exact=\w+\+?\-?\w+', link))[2:-2] for link in genres_link ]
    genres = list(set(genres_list))
    styles_list = [element.get('href') for element in soup.find_all(tag, attrs={'href': re.compile("style_exact=")})]
    styles = [str(re.findall('&style_exact=\w+\+?\w\+?\-?\w+', link))[2:-2] for link in styles_list]
    
    genres = genres + styles
    return genres

genres = ['Rock', 'Pop', 'Electronic', 'Jazz', 'Funk / Soul', 'Hip Hop', 'Latin', 'Reggae', 'Blues']

In [3]:
def df_total_na(df):
    '''
    Returns the percentage of total NULL values in the dataset
    '''
    total_obs = df.count().sum()
    total_nas = df.isna().sum().sum()
    nas_percentage = total_nas *100 / total_obs
    
    print(f'Our dataset has {round(nas_percentage, 2)}% missing values overall' )

In [4]:
def clean_artist(string):
    '''
    This function takes a string and erases the skipline '\n', parenthesis '()' and stars '*'
    '''
    
    new_str = re.sub('\s\(\d\)', '', string).replace('\n\n', '').replace('*', '')
    
    
    return new_str

In [5]:
def underscore_artist(artist):
    '''
    This function takes an artist name and transforms its string into 
    a 'Str_Str' format
    '''
    return artist.replace(' ', '_') 

In [6]:
def column_nulls_percentage(df):
    '''
    Returns a series indicating percentage of NULLS per column
    '''
    # .mean() gets the NULL values and divides it by the total length of the column
    
    return df.isna().mean().round(4) * 100

In [7]:
def genre_link(string):
    '''
    This function takes as an input a genre or style name, and returns a piece of link depending on its category
    ''' 
    
    global genres 
    
    genres = extract_genres()
    
    string = re.sub(' ', '+', string).title()
    link_piece = ''
    
    if ('&genre_exact=' + string) in genres:
        link_piece = ('&genre_exact=' + string)
        
    elif ('&style_exact=' + string) in genres:
        link_piece = ('&style_exact=' + string)
        
    else:
        raise ValueError('The selected genre doesnt exist')
    
    return link_piece

In [8]:
def artist_scrape(n=10, genres=genres):
    '''
    This function scrapes the Discogs.com webpage to get 'n' artist names, sorted by popularity and filtered by genre.
    Returns a list with artist names.
    
    Takes 3 arguments:
    
        genre= type str or type list: Selects the genre(s) you're interested on.
        
        n= type int: Indicates the number of artist returned
        
        decade = type int: Indicates the decade of the artist to be scraped (YYYY format)
    
    '''
    
    i = 0
    
    artists = []
    
          
        genre_link_list = [genre_link(genre) for genre in genres]
        genres = ''.join(genre_link_list)
        
    except TypeError:
        print('Genres must be passed as a list, even if it only contains one element')
 

    
    
    while len(artists) < n:
                
        i += 1
        
        url = 'https://www.discogs.com/search/?sort=have%2Cdesc&limit=250&type=master' + genres + '&page=' + str(i)
        
        print(url)
        
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        tag = 'h5'
        
        text = [element.text for element in soup.find_all(tag)]
        clean_text = [clean_artist(elem) for elem in text[:-5]]
        
        for elem in clean_text:
            if len(artists) < n:
                if elem not in artists:
                    artists.append(elem)
                    
    return artists

IndentationError: unexpected indent (<ipython-input-8-1eee2636e822>, line 21)

In [None]:
def songkick_artist_id(artist_list):
    '''
    This function locates an Songkick ID number for a given list of artists. Then, it creates a tuple
    with the band ID and the band name
    
    Takes two arguments:
        
        artist_list = type lst: List of artist names to be found. If an input it's a string, transform it
                                into a list for a correct processing.
                                
        apikey = type str: Your Songkick API key
    
    The output is a tuple composed of two parts:
    
        output[0]= type int: List of Songkick IDs
        output[1]= type str: Artist names
        
    '''
    global APIKEY
    
    APIKEY = input('Please insert your Songkick API Key: ')
        
    print('Getting artists...')
    
    if type(artist_list) == str:
        artist_list = [artist_list]
    
    IDs = []
    not_found = []
    
    
    for artist in artist_list:
        
        url = 'https://api.songkick.com/api/3.0/search/artists.json?apikey=' + APIKEY +'&query=' +underscore_artist(artist)
        response = requests.get(url)
        results = response.json()
        try:
            songkick_id = results['resultsPage']['results']['artist'][0]['id']
            IDs.append((str(songkick_id), str(artist)))
        except:
            not_found.append(artist)
        
    print(f'Found {len(artist_list)} artists in the Discogs Database')
        
    if len(not_found) > 0:
        print(f"The following artists don't have shows information on Songkick: {not_found}")
        
    return IDs

In [None]:
def collect_data(artist_id_list):  
    '''
    This function iterates over an artist id list to get the links on each artist.
    Calculates in how many pages the data is stored, and iterates on all of them.
    Then, for each page, requests the information to the API and fetchs all the data we need.
    Stores this data as a list of lists object to ensure compatibility with Pandas Dataframes.
    '''
    
    artist_data = []
    count = 0
    
    for artist in artist_id_list:        
        try:
            url = 'https://musicbrainz.org/search?query='+underscore_artist(artist[1])+'&type=artist&method=indexed'
            html = requests.get(url).content
            soup = BeautifulSoup(html, 'lxml')

            brainz_link = [element.get('href') for element in soup.select('td a')]
            brainz_link = brainz_link[0]
        
            url = 'https://musicbrainz.org/' + brainz_link
            html = requests.get(url).content
            soup = BeautifulSoup(html, 'lxml')
        
            founded = [element.text for element in soup.select('dd.begin-date')]
            founded = re.findall('\d{4}', founded[0])
            founded = founded[0]
        
            dissolved = [element.text for element in soup.select('dd.end-date')]
        
            if dissolved == []:
                dissolved = 'Still active'
            else:
                dissolved = re.findall('\d{4}', dissolved[0]) 
                dissolved = dissolved[0]
  
            origin = [element.text for element in soup.select('dd.area')]
            origin = origin[0]
        
            genre_list = [element.text for element in soup.select('.genre-list')]
            
            if len(genre_list) == 1:
                genre_list = genre_list[0] + ['']
            else:
                genre_list = list(set(genre_list[0].split(',')))

        
        except:
            
            founded = None
            dissolved = None
            origin = None
            genre_list = ['Unknown', 'Unknown']
            


        
        count += 1
        url = 'https://api.songkick.com/api/3.0/artists/'+str(artist[0]) +'/gigography.json?apikey=' + APIKEY
        response = requests.get(url)
        result = response.json()
        
        ## Iterating over different pages:
        entries = result['resultsPage']['totalEntries'] 
        per_page = result['resultsPage']['perPage']
        
        pages_round= int(math.ceil(entries/per_page))        
    
        for z in range(pages_round):
                
                # Page 0 and Page 1 are equal, so we'll start from page 1
                url = 'https://api.songkick.com/api/3.0/artists/'+str(artist[0]) +'/gigography.json?&page='+ str(z+1) +'&apikey=' + APIKEY
                response = requests.get(url)
                result = response.json()
        
    
                for i in range(len(result['resultsPage']['results']['event'])):
                        
                    print(f'Fetching artist {count}/{len(artist_id_list)} data, {entries} entries on Songkick...', end='\r', flush=True)

        
                    band = artist[1]
                    b_id = artist[0]
                    bill = result['resultsPage']['results']['event'][i]['performance'][0]['billing']
                    name = result['resultsPage']['results']['event'][i]['displayName']
                    date = result['resultsPage']['results']['event'][i]['start']['date']
                    time = result['resultsPage']['results']['event'][i]['start']['time']
                    city = result['resultsPage']['results']['event'][i]['location']['city']
                    venu = result['resultsPage']['results']['event'][i]['venue']['displayName']
                    lat = result['resultsPage']['results']['event'][i]['venue']['lat'] 
                    lng = result['resultsPage']['results']['event'][i]['venue']['lng'] 
        
                

                    
                    artist_data.append([band, b_id, bill, name, date, time, city, venu, lat, lng, founded, dissolved, origin, genre_list[0].strip().title(), genre_list[1].strip().title()])

   
    return artist_data

In [None]:
def get_shows_data(n=10, genres = genres, artists = None):
    '''
    This function returns an organized Dataframe which contains information about all the shows played
    by that artist over its history. Contains the following variables:
        n = type int: Tells the function hoy many artists to get info
        
        genre = type str or type list: Tells the function which genre (or genres) of music to scrape
        
        apikey = type str: (needed) your Songkick API key
        
        artist = type str or type lst: By default, set to None. This means that artist will be chosen
                               
        based on all-time popularity. If specified, tells what artist to scrape'''
        
    if artists == None:

        artists = artist_scrape(n=n, genres=genres)
    
    
    ids = songkick_artist_id(artists)
    
    data = collect_data(ids)
    
    print('\n\n-Done-')
    
    return data

In [None]:
def shows_dataframe(data):
    df = pd.DataFrame(data)
    df.columns = ['artist', 'artist_id', 'relevance', 'show_name', 'date', 'time', 'city', 'venue', 'lat', 'lng', 'founded', 'dissolved', 'origin', 'genre_1', 'genre_2']
    df['time'] = pd.to_datetime(df.time, errors='coerce')
    
    return df[['artist', 'date', 'venue', 'city', 'lat', 'lng', 'relevance', 'founded', 'origin', 'dissolved', 'genre_1', 'genre_2']]

In [None]:
data = get_shows_data(n=3)
df = shows_dataframe(data)
# df.to_csv('concerts.csv')

In [None]:
df.tail()

In [None]:
# df = pd.read_csv('concerts.csv')

## Filling coordinates (lat, lng) from venue, city

In [None]:
# geolocator = Nominatim()

In [None]:
# missing_coords = df[df_original.lat.isnull()]

In [None]:
# for index, row in missing_coords.iterrows():
#     print(row['lat'])
#     location = geolocator.geocode(row['city'])
#     df_original.at[index , 'lat'] = location.latitude
#     df_original.at[index , 'lng'] = location.longitude

## - Which genres are more active when comes to show playing?

make a plot grouping by genres and venues count




In [None]:
# df.groupby('genre_1')['date'].value_counts()

In [None]:
# fig = plt.figure(figsize=(20,15))

# sns.barplot(x='genre_1', y = df.groupby('genre_1').count(), data= df)

## - Is there any way to predict if an artist is coming soon to your Country?

artist selector, horizontal bar chart country/year


## - Which are the more active venues? For what genres?

count venues, plot 5 max, hue=genre


## - Which countries consume what type of show or genre, and what's the history of every country in this aspect?


## - Do political/historical events do impact on artist growth or activity?


## - What's the tendency of a new artist, relative to number of shows, compared to an older one?


## - What's the percentage of festivals shows relative to normal shows for each artist? Why?


## - There is more activity related to concerts nowadays, or few years ago?


- Genre multiselector


## - Genre/Popularity heatmap by Country (genre selector)


## - Histogram for an artist/nº of shows per country (band selector)



### 

In [None]:
# artists=['Pink Floyd', 'Yes', 'Nirvana', 'The Beatles', '50 Cent']
# artist_data = []

# for artist in artists:        
#     try:
#         url = 'https://musicbrainz.org/search?query='+underscore_artist(artist)+'&type=artist&method=indexed'
#         html = requests.get(url).content
#         soup = BeautifulSoup(html, 'lxml')

#         brainz_link = [element.get('href') for element in soup.select('td a')]
#         brainz_link = brainz_link[0]
        
#         url = 'https://musicbrainz.org/' + brainz_link
#         html = requests.get(url).content
#         soup = BeautifulSoup(html, 'lxml')
        
#         founded = [element.text for element in soup.select('dd.begin-date')]
#         founded = re.findall('\d{4}', founded[0])
#         founded = founded[0]
        
#         dissolved = [element.text for element in soup.select('dd.end-date')]
        
#         if dissolved == []:
#             dissolved = 'Still active'
#         else:
#             dissolved = re.findall('\d{4}', dissolved[0]) 
#             dissolved = dissolved[0]
  
#         origin = [element.text for element in soup.select('dd.area')]
#         origin = origin[0]
        
#         genre_list = [element.text for element in soup.select('.genre-list')]
#         genre_list = list(set(genre_list[0].split(',')))
#         if len(genre_list) == 1:
#             genre_list = genre_list[0] + ['']
            
#     except:
            
#             founded = None
#             dissolved = None
#             origin = None
#             genre_list = ['Unknown', 'Unknown']
            
#     artist_data.append([founded, dissolved, origin, genre_list[0].strip().title(), genre_list[1].strip().title()])
    
# print(artist_data)
