In [2]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

# Create list of tours

## Highest-grossing tours 
https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours"

In [4]:
response = requests.get(
	url=url,
)
print(response.status_code)

200


In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
captions = [
    "Top 20 highest-grossing tours of all time",
    "Top 10 highest-grossing tours of the 1980s",
    "Top 10 highest-grossing tours of the 1990s",
    "Top 10 highest-grossing tours of the 2000s",
    "Top 10 highest-grossing tours of the 2010s",
    "Top 10 highest-grossing tours of the 2020s"
]

In [7]:
URL = "https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours"
def create_tour_list(captions, url):
    df = pd.DataFrame()
    for caption in captions:
        df_tmp = pd.read_html(url, 
                          match=caption)[0]
        
        tour_links = pd.read_html(url, 
                  match=caption,
                  extract_links="all")[0][("Tour title", None)]
        
        tour_links = pd.DataFrame(tour_links.tolist(), columns=["Tour title", "Tour link"])
        df_tmp = df_tmp.merge(tour_links, on="Tour title")

        df = pd.concat([df, df_tmp]).drop_duplicates('Tour title').reset_index(drop=True)
        print(df.shape)
    
    return df

df = create_tour_list(captions, url)

(20, 11)
(30, 12)
(40, 12)
(45, 12)
(45, 12)
(49, 12)


In [8]:
df = df.rename(columns={df.columns[3]: 'Adjusted gross (in 2022 dollars)',
                   df.columns[-1]: 'Adjusted gross (in 2022 dollar)',
                   df.columns[2]: 'Actual gross'})

In [9]:
df.loc[df['Adjusted gross (in 2022 dollars)'].isna(), 'Adjusted gross (in 2022 dollars)'] = df['Adjusted gross (in 2022 dollar)']

In [10]:
df = df[["Actual gross", "Adjusted gross (in 2022 dollars)", "Artist", "Tour title", "Tour link", "Year(s)", "Shows"]]

In [11]:
df

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330
1,"$780,000,000","$780,000,000",Taylor Swift,The Eras Tour †,/wiki/The_Eras_Tour,2023,56
2,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255
3,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110
4,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114
5,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169
6,"$584,200,000","$668,678,225",Guns N' Roses,Not in This Lifetime... Tour,/wiki/Not_in_This_Lifetime..._Tour,2016–2019,158
7,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,/wiki/Renaissance_World_Tour,2023,56
8,"$558,255,524","$787,883,017",The Rolling Stones,A Bigger Bang Tour,/wiki/A_Bigger_Bang_Tour,2005–2007,144
9,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58


## List of most-attended concert tours
https://en.wikipedia.org/wiki/List_of_most-attended_concert_tours

In [12]:
captions = [
    "Tours attended by 5 million people or more",
    "Tours attended by 3.5 to 4.9 million people"
]
URL = "https://en.wikipedia.org/wiki/List_of_most-attended_concert_tours"

df_att = create_tour_list(captions, URL)

(14, 7)
(31, 7)


In [13]:
df_att = df_att[["Year(s)", "Tour title", "Tour link", "Artist", "Shows", "Tickets sold"]]
df = pd.concat([df, df_att]).drop_duplicates('Tour link').reset_index(drop=True)

In [14]:
df

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows,Tickets sold
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330,
1,"$780,000,000","$780,000,000",Taylor Swift,The Eras Tour †,/wiki/The_Eras_Tour,2023,56,
2,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255,
3,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110,
4,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114,
5,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169,
6,"$584,200,000","$668,678,225",Guns N' Roses,Not in This Lifetime... Tour,/wiki/Not_in_This_Lifetime..._Tour,2016–2019,158,
7,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,/wiki/Renaissance_World_Tour,2023,56,
8,"$558,255,524","$787,883,017",The Rolling Stones,A Bigger Bang Tour,/wiki/A_Bigger_Bang_Tour,2005–2007,144,
9,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58,


In [86]:
df = df.drop(index=[50, 35]).reset_index()

In [16]:
df

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows,Tickets sold
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330,
1,"$780,000,000","$780,000,000",Taylor Swift,The Eras Tour †,/wiki/The_Eras_Tour,2023,56,
2,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255,
3,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110,
4,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114,
5,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169,
6,"$584,200,000","$668,678,225",Guns N' Roses,Not in This Lifetime... Tour,/wiki/Not_in_This_Lifetime..._Tour,2016–2019,158,
7,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,/wiki/Renaissance_World_Tour,2023,56,
8,"$558,255,524","$787,883,017",The Rolling Stones,A Bigger Bang Tour,/wiki/A_Bigger_Bang_Tour,2005–2007,144,
9,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58,


# Gather data about tours

In [54]:
t.columns = t.columns.get_level_values(0)

In [55]:
t

Unnamed: 0,Date,City,Country,Venue,Attendance,Revenue
0,8 September 2018,Allentown,United States,PPL Center,"8,983 / 8,983","$1,492,859"
1,11 September 2018,Philadelphia,United States,Wells Fargo Center,"29,531 / 29,531","$4,273,021"
2,12 September 2018,Philadelphia,United States,Wells Fargo Center,"29,531 / 29,531","$4,273,021"
3,15 September 2018,Buffalo,United States,KeyBank Center,"15,581 / 15,641","$2,033,001"
4,16 September 2018,University Park,United States,Bryce Jordan Center,"12,421 / 12,421","$1,662,468"
...,...,...,...,...,...,...
334,2 July 2023,Zürich,Switzerland,Hallenstadion,—,—
335,5 July 2023,Copenhagen,Denmark,Royal Arena,—,—
336,7 July 2023,Stockholm,Sweden,Tele2 Arena,—,—
337,8 July 2023,Stockholm,Sweden,Tele2 Arena,—,—


In [87]:
df_tours = pd.DataFrame()

BASE_URL = "https://en.wikipedia.org"
for index, row in df.iterrows():
    url = BASE_URL + row["Tour link"]
    tables = pd.read_html(url)
    df_tour_tmp = pd.DataFrame()
    for t in tables:
        if "City" in t.columns and "Country" in t.columns:
            t.columns = t.columns.get_level_values(0)
            for i, _ in enumerate(t.columns):
                if t.columns[i].startswith("Date"):
                    t = t.rename(columns={t.columns[i] : "Date"})
            df_tour_tmp = pd.concat([df_tour_tmp, t[["Country", "City", "Venue", "Date"]]])
    df_tour_tmp = df_tour_tmp.drop(df_tour_tmp[df_tour_tmp["Date"] == df_tour_tmp["City"]].index)
    df_tour_tmp["Artist"] = row["Artist"]
    
    df_tours = pd.concat([df_tours, df_tour_tmp])
    print(f"{row['Tour title']}")
    # break

Farewell Yellow Brick Road
The Eras Tour †
÷ Tour
U2 360° Tour
Music of the Spheres World Tour †
Love On Tour
Not in This Lifetime... Tour
Renaissance World Tour
A Bigger Bang Tour
No Filter Tour
A Head Full of Dreams Tour
The Wall
Black Ice World Tour
WorldWired Tour
+–=÷× Tour †
Sticky & Sweet Tour
Beautiful Trauma World Tour
The Joshua Tree Tours 2017 and 2019
Vertigo Tour
24K Magic World Tour
A Momentary Lapse of Reason Tour
Bad
Steel Wheels Tour
Born in the U.S.A. Tour
Glass Spider Tour
Victory Tour
Invisible Touch Tour
Break Every Rule World Tour
The Joshua Tree Tour
The Rolling Stones American Tour 1981
Voodoo Lounge Tour
Bridges to Babylon Tour
The Division Bell Tour
PopMart Tour
HIStory World Tour
Zoo TV Tour
Let's Talk About Love World Tour
Wildest Dreams Tour
The Garth Brooks World Tour
The Police Reunion Tour
Licks Tour
Taking Chances World Tour
Living Proof: The Farewell Tour
Magic Tour
World's Hottest Tour
After Hours til Dawn Tour †
Summer Carnival †
Global Stadium Tour 

I had to remove row with Hell Freezes Over Tour by Eagles because of no information about tour on wiki page.

In [88]:
df_tours

Unnamed: 0,Country,City,Venue,Date,Artist
0,United States,Allentown,PPL Center,8 September 2018,Elton John
1,United States,Philadelphia,Wells Fargo Center,11 September 2018,Elton John
2,United States,Philadelphia,Wells Fargo Center,12 September 2018,Elton John
3,United States,Buffalo,KeyBank Center,15 September 2018,Elton John
4,United States,University Park,Bryce Jordan Center,16 September 2018,Elton John
...,...,...,...,...,...
67,United States,Charlotte,PNC Music Pavilion,28 September 2014,One Direction
68,United States,Atlanta,Georgia Dome,1 October 2014,One Direction
69,United States,Tampa,Raymond James Stadium,3 October 2014,One Direction
70,United States,Miami Gardens,Sun Life Stadium,5 October 2014,One Direction


Now I have info about 7697 events, but it includes also cancelled ones.

In [122]:
df_tours["Venue"].isna().sum()

4

# Gather data without cancelled shows

# Geolocation of cities

In [None]:
df_tours = df_tours.reset_index()

In [143]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_user_agent_3")
location = geolocator.geocode(f"{df_tours.iloc[0, 2]}, {df_tours.iloc[0, 1]}, {df_tours.iloc[0, 0]}")

In [144]:
location.latitude

39.345732

In [145]:
print(location)

0, Cumberland County, Illinois, 62435, United States


In [123]:
f"{df_tours.iloc[0, 1]}, {df_tours.iloc[0, 0]}"

'Allentown, United States'

In [125]:
# for index, row in df_tours.iterrows():
#     location = geolocator.geocode(f"{df_tours.iloc[0, 2]}, {df_tours.iloc[0, 1]}, {df_tours.iloc[0, 0]}")
#     df.loc[index, "Latitude"] = location.latitude
#     df.loc[index, "Longtitude"] = location.longitude

GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=PPL+Center%2C+Allentown%2C+United+States&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

Many venues are recurring, so it may be good idea to cache already geolocated places in dictionary object

In [146]:
cached_locations = {}

for index, row in df_tours.iterrows():
    venue = f"{df_tours.iloc[index, 2]}, {df_tours.iloc[index, 1]}, {df_tours.iloc[index, 0]}"
    if venue in cached_locations:
        df_tours.loc[index, "Latitude"] = cached_locations[venue][0]
        df_tours.loc[index, "Longtitude"] = cached_locations[venue][1]
    else:
        location = geolocator.geocode(venue)
        if not location:
            venue2 = f"{df_tours.iloc[index, 1]}, {df_tours.iloc[index, 0]}"
            location = geolocator.geocode(venue2)
        df_tours.loc[index, "Latitude"] = location.latitude
        df_tours.loc[index, "Longtitude"] = location.longitude
        cached_locations[venue] = [location.latitude, location.longitude]


GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=United+States%2C+1%2C+1&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

In [142]:
df_tours

Unnamed: 0,level_0,index,Country,City,Venue,Date,Artist,Latitude,Longtitude
0,0,0,United States,Allentown,PPL Center,8 September 2018,Elton John,39.345732,-88.243257
1,1,1,United States,Philadelphia,Wells Fargo Center,11 September 2018,Elton John,40.218404,-79.487816
2,2,2,United States,Philadelphia,Wells Fargo Center,12 September 2018,Elton John,40.218404,-79.487816
3,3,3,United States,Buffalo,KeyBank Center,15 September 2018,Elton John,40.218404,-79.487816
4,4,4,United States,University Park,Bryce Jordan Center,16 September 2018,Elton John,40.218404,-79.487816
...,...,...,...,...,...,...,...,...,...
7692,7692,67,United States,Charlotte,PNC Music Pavilion,28 September 2014,One Direction,40.733604,-74.171063
7693,7693,68,United States,Atlanta,Georgia Dome,1 October 2014,One Direction,40.750513,-73.993516
7694,7694,69,United States,Tampa,Raymond James Stadium,3 October 2014,One Direction,40.750513,-73.993516
7695,7695,70,United States,Miami Gardens,Sun Life Stadium,5 October 2014,One Direction,40.682511,-73.975252
