In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

# Create list of tours

## Highest-grossing tours 
https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours"

In [3]:
response = requests.get(
	url=url,
)
soup = BeautifulSoup(response.content, 'html.parser')

captions = [
    "Top 20 highest-grossing tours of all time",
    "Top 10 highest-grossing tours of the 1980s",
    "Top 10 highest-grossing tours of the 1990s",
    "Top 10 highest-grossing tours of the 2000s",
    "Top 10 highest-grossing tours of the 2010s",
    "Top 10 highest-grossing tours of the 2020s"
]

In [4]:
URL = "https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours"
def create_tour_list(captions, url):
    df = pd.DataFrame()
    for caption in captions:
        df_tmp = pd.read_html(url, 
                          match=caption)[0]
        
        tour_links = pd.read_html(url, 
                  match=caption,
                  extract_links="all")[0][("Tour title", None)]
        
        tour_links = pd.DataFrame(tour_links.tolist(), columns=["Tour title", "Tour link"])
        df_tmp = df_tmp.merge(tour_links, on="Tour title")

        df = pd.concat([df, df_tmp]).drop_duplicates('Tour title').reset_index(drop=True)
        print(df.shape)
    
    return df

df = create_tour_list(captions, url)

(19, 11)
(29, 12)
(39, 12)
(44, 12)
(44, 12)
(49, 12)


In [5]:
df = df.rename(columns={df.columns[3]: 'Adjusted gross (in 2022 dollars)',
                   df.columns[-1]: 'Adjusted gross (in 2022 dollar)',
                   df.columns[2]: 'Actual gross'})

In [6]:
df.loc[df['Adjusted gross (in 2022 dollars)'].isna(), 'Adjusted gross (in 2022 dollars)'] = df['Adjusted gross (in 2022 dollar)']

In [7]:
df = df[["Actual gross", "Adjusted gross (in 2022 dollars)", "Artist", "Tour title", "Tour link", "Year(s)", "Shows"]]

In [8]:
df.head()

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330
1,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255
2,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110
3,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114
4,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169


## List of most-attended concert tours
https://en.wikipedia.org/wiki/List_of_most-attended_concert_tours

In [9]:
captions = [
    "Tours attended by 5 million people or more",
    "Tours attended by 3.5 to 4.9 million people"
]
URL = "https://en.wikipedia.org/wiki/List_of_most-attended_concert_tours"

df_att = create_tour_list(captions, URL)

(14, 7)
(31, 7)


In [10]:
df_att = df_att[["Year(s)", "Tour title", "Tour link", "Artist", "Shows", "Tickets sold"]]
df = pd.concat([df, df_att]).drop_duplicates('Tour link').reset_index(drop=True)

In [11]:
df.head(5)

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows,Tickets sold
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330,
1,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255,
2,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110,
3,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114,
4,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169,


In [12]:
df[df["Tour title"].isin(["No Filter Tour"])]

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows,Tickets sold
8,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58,


I remove:
- index 50 - The Garth Brooks World Tour (1996–1998) - duplicated tour
- index 35 - Hell Freezes Over Tour - wrong wikipedia page, that doesn't contain info about tour
- index 29 - The Rolling Stones American Tour 1981 - it is not world tour.
- index 31 - The Division Bell Tour - I couldn't find any proof that it meant to be world tour.
- index 8 - No Filter Tour - It's European/North American tour

In [18]:
df = df.drop(index=[50, 34, 28, 31, 8]).reset_index()

# Gather data without cancelled shows

In [19]:
BASE_URL = "https://en.wikipedia.org"
for index, row in df.iterrows():
    url = BASE_URL + row["Tour link"]
    response = requests.get(url=url)

    soup = BeautifulSoup(response.content, 'html.parser')
    # print(f"{row['Tour title']}")
    for t in soup.body.findAll("table"):
        # print("Cancelled shows" in str(t.find_previous_sibling()))

        if "Cancelled shows" in str(t.find_previous_sibling()):
            t.clear()


I'm gonna combine this piece of code with code that is parsing entire tables

In [20]:
BASE_URL = "https://en.wikipedia.org"
df_tours = pd.DataFrame()

for index, row in df.iterrows():
    url = BASE_URL + row["Tour link"]
    response = requests.get(url=url)

    soup = BeautifulSoup(response.content, 'html.parser')
    for t in soup.body.findAll("table"):
        if "Cancelled shows" in str(t.find_previous_sibling()):
            t.clear()

    tables = pd.read_html(str(soup))
    df_tour_tmp = pd.DataFrame()
    for t in tables:
        if "City" in t.columns and "Country" in t.columns:
            t.columns = t.columns.get_level_values(0)
            for i, _ in enumerate(t.columns):
                if t.columns[i].startswith("Date"):
                    t = t.rename(columns={t.columns[i] : "Date"})
            df_tour_tmp = pd.concat([df_tour_tmp, t[["Country", "City", "Venue", "Date"]]])
    df_tour_tmp = df_tour_tmp.drop(df_tour_tmp[df_tour_tmp["Date"] == df_tour_tmp["City"]].index)
    df_tour_tmp["Artist"] = row["Artist"]
    df_tour_tmp["Tour title"] = row["Tour title"]
    
    df_tours = pd.concat([df_tours, df_tour_tmp])
    # print(f"{row['Tour title']}")

In [21]:
df_tours = df_tours.reset_index(drop=True)

In [22]:
df_tours.shape

(7401, 6)

In [23]:
df_tours = df_tours.drop(index=df_tours[df_tours["Country"].isna()].index ).reset_index(drop=True)

Almost hundred of shows were rejected - before skipping cancelled ones there was 7696 shows.

## Unidecode cities names

During visualization phase I realised that some cities occur in dataset in two different forms (e.g. Chorzow and Chorzów). I used unidecode function to unify those forms.

In [24]:
from unidecode import unidecode
df_tours["City"] = df_tours["City"].apply(lambda x : unidecode(x))

I also noticed that some cities have additional characters (e.g. Arlington[k]). I'm gonna use regex to remove those phrases.

In [25]:
import re
df_tours["City"] = df_tours["City"].apply(lambda x : re.sub('\[.*\]', '', x))

## Fix some errror mistakes

Few more fixes, of errors that I discovered later, at the visualization stage

In [26]:
df_tours.loc[df_tours["City"] == "West Berlin", "City"] = "Berlin"
df_tours.loc[df_tours["City"] == "Paris (Saint-Denis)", "City"] = "Paris"

In [27]:
df_tours[df_tours["Country"] == "Oakland–Alameda County Coliseum Arena"]

Unnamed: 0,Country,City,Venue,Date,Artist,Tour title
3664,Oakland–Alameda County Coliseum Arena,Oakland,,24 October 1986,Genesis,Invisible Touch Tour


In [28]:
df_tours.loc[df_tours["Country"] == "Oakland–Alameda County Coliseum Arena", "Venue"] = "Oakland–Alameda County Coliseum Arena"
df_tours.loc[df_tours["Country"] == "Oakland–Alameda County Coliseum Arena", "Country"] = "United States"

# Geolocation of cities

In [29]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_user_agent")

Many venues are recurring, so it may be good idea to cache already geolocated places in dictionary object

In [30]:
cached_locations = {}

def geolocalize_cities(df_tours, cached_locations):
    for index, row in df_tours.iterrows():
        venue = f"{df_tours.iloc[index, 1]}, {df_tours.iloc[index, 0]}"
        if venue in cached_locations:
            df_tours.loc[index, "Latitude"] = cached_locations[venue][0]
            df_tours.loc[index, "Longitude"] = cached_locations[venue][1]
        else:
            location = geolocator.geocode(venue, timeout=10)
            if not location:
                print(venue)
                continue
            df_tours.loc[index, "Latitude"] = location.latitude
            df_tours.loc[index, "Longitude"] = location.longitude
            cached_locations[venue] = [location.latitude, location.longitude]


In [31]:
geolocalize_cities(df_tours, cached_locations)

Langraaf, Netherlands
Hanover, West Germany
Moscow, Soviet Union
Moscow, Soviet Union
Moscow, Soviet Union
Moscow, Soviet Union
Moscow, Soviet Union
Cologne, West Germany
Cologne, West Germany
Hockenheim, West Germany
Wurzburg, West Germany
Hanover, West Germany
Hanover, West Germany
Cologne, West Germany
Cologne, West Germany
East Berlin, East Germany
East Berlin, East Germany
Prague, Czechoslovakia
Nurburgring, West Germany
Slane, Ireland, Republic of
Nuremberg, West Germany
Nuremberg, West Germany
Hanover, West Germany
Hanover, West Germany
Hanover, West Germany
Mursfreesboro, United States
Cologne, West Germany
Kaohsiung, Taiwan[21][22]


Now I know, that 30 places aren't geolocalised properly:
    - Some had additional characters attached to country name (Taiwan[21][22])
    - Some had country that is no longer existing (like Moscow, Soviet Union; Hanover - West Germany)
    - Some had misspelings (Langraaf -> Landgraaf; Mursfreesboro->Murfreesboro)

In [32]:
df_tours.loc[df_tours["Country"] == "West Germany", "Country"] = "Germany"
df_tours.loc[df_tours["Country"] == "Ireland, Republic of", "Country"] = "Ireland"
df_tours.loc[df_tours["Country"] == "Republic of Ireland", "Country"] = "Ireland"
df_tours.loc[df_tours["Country"] == "Czechoslovakia", "Country"] = "Czech Republic"
df_tours.loc[df_tours["Country"] == "Soviet Union", "Country"] = "Russia"
df_tours.loc[df_tours["Country"] == "East Germany", "Country"] = "Germany"
df_tours.loc[df_tours["Country"] == "Perú", "Country"] = "Peru"
df_tours.loc[df_tours["Country"] == "México", "Country"] = "Mexico"


df_tours.loc[df_tours["Country"] == "Taiwan[21][22]", "Country"] = "Taiwan"

df_tours.loc[df_tours["City"] == "Langraaf", "City"] = "Landgraaf"
df_tours.loc[df_tours["City"] == "Mursfreesboro", "City"] = "Murfreesboro"
df_tours.loc[df_tours["City"] == "East Berlin", "City"] = "Berlin"


In [33]:
geolocalize_cities(df_tours, cached_locations)

## Add continents data

Create country-continent map. CSV file comes from pytcountry-convert package: https://github.com/jefftune/pycountry-convert/tree/master.

In [34]:
import csv

country_to_continent = {}

with open('Continents_to_CountryNames.csv') as f:
    next(f)
    reader = csv.reader(f)
    for row in reader:
        country_to_continent[row[1]] = row[0]


In [35]:
country_to_continent["England"] = "Europe"
country_to_continent["Scotland"] = "Europe"
country_to_continent["Ireland"] = "Europe"
country_to_continent["Ireland"] = "Europe"
country_to_continent["Wales"] = "Europe"
country_to_continent["Northern Ireland"] = "Europe"
country_to_continent["Puerto Rico"] = "North America"
country_to_continent["Taiwan"] = "Asia"
country_to_continent["Hong Kong"] = "Asia"
country_to_continent["South Korea"] = "Asia"
country_to_continent["Russia"] = "Europe"
country_to_continent["Macau"] = "Asia"

## Add Continent column to dataframe, based on created dict.

In [36]:
df_tours["Continent"] = df_tours["Country"].apply(lambda x : country_to_continent[x])

Confirm that continent matches city in big countries located on more than one continent

In [37]:
df_tours[df_tours["Country"] == "Russia"]["City"].value_counts()

City
Moscow              19
Saint Petersburg     5
Name: count, dtype: int64

In [38]:
df_tours[df_tours["Country"] == "Turkey"]["City"].value_counts()

City
Istanbul    6
Name: count, dtype: int64

Rename 'United Kingdom' to either England, Scotland, Wales or Northern Ireland

In [39]:
df_tours[df_tours["Country"] == "United Kingdom"]

Unnamed: 0,Country,City,Venue,Date,Artist,Tour title,Latitude,Longitude,Continent
3467,United Kingdom,London,Player's Theatre,20 March,David Bowie,Glass Spider Tour,51.507446,-0.127765,Europe
3485,United Kingdom,London,Wembley Stadium,19 June,David Bowie,Glass Spider Tour,51.507446,-0.127765,Europe
3486,United Kingdom,London,Wembley Stadium,20 June,David Bowie,Glass Spider Tour,51.507446,-0.127765,Europe
3487,United Kingdom,Cardiff,National Stadium,21 June,David Bowie,Glass Spider Tour,51.481655,-3.179193,Europe
3488,United Kingdom,Sunderland,Roker Park,23 June,David Bowie,Glass Spider Tour,54.905851,-1.382873,Europe
3498,United Kingdom,Manchester,Maine Road,14 July,David Bowie,Glass Spider Tour,53.479489,-2.245115,Europe
3499,United Kingdom,Manchester,Maine Road,15 July,David Bowie,Glass Spider Tour,53.479489,-2.245115,Europe


In [40]:
df_tours.loc[df_tours["City"] == "Cardiff", "Country"] = "Wales"
df_tours.loc[df_tours["Country"] == "United Kingdom", "Country"] = "England"

# Export data

In [41]:
df_tours[df_tours["Latitude"].isna()]

Unnamed: 0,Country,City,Venue,Date,Artist,Tour title,Latitude,Longitude,Continent


In [42]:
df_tours.to_csv("../data/prepared_data_cont.csv", encoding="utf-8")