In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

# Create list of tours

## Highest-grossing tours 
https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours"

In [3]:
response = requests.get(
	url=url,
)
print(response.status_code)

200


In [4]:
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
captions = [
    "Top 20 highest-grossing tours of all time",
    "Top 10 highest-grossing tours of the 1980s",
    "Top 10 highest-grossing tours of the 1990s",
    "Top 10 highest-grossing tours of the 2000s",
    "Top 10 highest-grossing tours of the 2010s",
    "Top 10 highest-grossing tours of the 2020s"
]

In [6]:
URL = "https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours"
def create_tour_list(captions, url):
    df = pd.DataFrame()
    for caption in captions:
        df_tmp = pd.read_html(url, 
                          match=caption)[0]
        
        tour_links = pd.read_html(url, 
                  match=caption,
                  extract_links="all")[0][("Tour title", None)]
        
        tour_links = pd.DataFrame(tour_links.tolist(), columns=["Tour title", "Tour link"])
        df_tmp = df_tmp.merge(tour_links, on="Tour title")

        df = pd.concat([df, df_tmp]).drop_duplicates('Tour title').reset_index(drop=True)
        print(df.shape)
    
    return df

df = create_tour_list(captions, url)

(20, 11)
(30, 12)
(40, 12)
(45, 12)
(45, 12)
(49, 12)


In [7]:
df = df.rename(columns={df.columns[3]: 'Adjusted gross (in 2022 dollars)',
                   df.columns[-1]: 'Adjusted gross (in 2022 dollar)',
                   df.columns[2]: 'Actual gross'})

In [8]:
df.loc[df['Adjusted gross (in 2022 dollars)'].isna(), 'Adjusted gross (in 2022 dollars)'] = df['Adjusted gross (in 2022 dollar)']

In [9]:
df = df[["Actual gross", "Adjusted gross (in 2022 dollars)", "Artist", "Tour title", "Tour link", "Year(s)", "Shows"]]

In [10]:
df

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330
1,"$780,000,000","$780,000,000",Taylor Swift,The Eras Tour †,/wiki/The_Eras_Tour,2023,56
2,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255
3,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110
4,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114
5,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169
6,"$584,200,000","$668,678,225",Guns N' Roses,Not in This Lifetime... Tour,/wiki/Not_in_This_Lifetime..._Tour,2016–2019,158
7,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,/wiki/Renaissance_World_Tour,2023,56
8,"$558,255,524","$787,883,017",The Rolling Stones,A Bigger Bang Tour,/wiki/A_Bigger_Bang_Tour,2005–2007,144
9,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58


## List of most-attended concert tours
https://en.wikipedia.org/wiki/List_of_most-attended_concert_tours

In [11]:
captions = [
    "Tours attended by 5 million people or more",
    "Tours attended by 3.5 to 4.9 million people"
]
URL = "https://en.wikipedia.org/wiki/List_of_most-attended_concert_tours"

df_att = create_tour_list(captions, URL)

(14, 7)
(31, 7)


In [12]:
df_att = df_att[["Year(s)", "Tour title", "Tour link", "Artist", "Shows", "Tickets sold"]]
df = pd.concat([df, df_att]).drop_duplicates('Tour link').reset_index(drop=True)

In [13]:
df

Unnamed: 0,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows,Tickets sold
0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330,
1,"$780,000,000","$780,000,000",Taylor Swift,The Eras Tour †,/wiki/The_Eras_Tour,2023,56,
2,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255,
3,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110,
4,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114,
5,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169,
6,"$584,200,000","$668,678,225",Guns N' Roses,Not in This Lifetime... Tour,/wiki/Not_in_This_Lifetime..._Tour,2016–2019,158,
7,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,/wiki/Renaissance_World_Tour,2023,56,
8,"$558,255,524","$787,883,017",The Rolling Stones,A Bigger Bang Tour,/wiki/A_Bigger_Bang_Tour,2005–2007,144,
9,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58,


In [14]:
df = df.drop(index=[50, 35]).reset_index()

In [15]:
df

Unnamed: 0,index,Actual gross,Adjusted gross (in 2022 dollars),Artist,Tour title,Tour link,Year(s),Shows,Tickets sold
0,0,"$939,100,000","$939,100,000",Elton John,Farewell Yellow Brick Road,/wiki/Farewell_Yellow_Brick_Road,2018–2023,330,
1,1,"$780,000,000","$780,000,000",Taylor Swift,The Eras Tour †,/wiki/The_Eras_Tour,2023,56,
2,2,"$776,200,000","$888,442,379",Ed Sheeran,÷ Tour,/wiki/%C3%B7_Tour,2017–2019,255,
3,3,"$736,421,586","$958,001,690",U2,U2 360° Tour,/wiki/U2_360%C2%B0_Tour,2009–2011,110,
4,4,"$667,726,905","$667,726,905",Coldplay,Music of the Spheres World Tour †,/wiki/Music_of_the_Spheres_World_Tour,2022–2023,114,
5,5,"$617,300,000","$617,800,000",Harry Styles,Love On Tour,/wiki/Love_On_Tour,2021–2023,169,
6,6,"$584,200,000","$668,678,225",Guns N' Roses,Not in This Lifetime... Tour,/wiki/Not_in_This_Lifetime..._Tour,2016–2019,158,
7,7,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,/wiki/Renaissance_World_Tour,2023,56,
8,8,"$558,255,524","$787,883,017",The Rolling Stones,A Bigger Bang Tour,/wiki/A_Bigger_Bang_Tour,2005–2007,144,
9,9,"$546,500,000","$590,190,470",The Rolling Stones,No Filter Tour,/wiki/No_Filter_Tour,2017–2021,58,


# Gather data about tours

In [16]:
df_tours = pd.DataFrame()

BASE_URL = "https://en.wikipedia.org"
for index, row in df.iterrows():
    url = BASE_URL + row["Tour link"]
    tables = pd.read_html(url)
    df_tour_tmp = pd.DataFrame()
    for t in tables:
        if "City" in t.columns and "Country" in t.columns:
            t.columns = t.columns.get_level_values(0)
            for i, _ in enumerate(t.columns):
                if t.columns[i].startswith("Date"):
                    t = t.rename(columns={t.columns[i] : "Date"})
            df_tour_tmp = pd.concat([df_tour_tmp, t[["Country", "City", "Venue", "Date"]]])
    df_tour_tmp = df_tour_tmp.drop(df_tour_tmp[df_tour_tmp["Date"] == df_tour_tmp["City"]].index)
    df_tour_tmp["Artist"] = row["Artist"]
    df_tour_tmp["Tour title"] = row["Tour title"]
    
    df_tours = pd.concat([df_tours, df_tour_tmp])
    print(f"{row['Tour title']}")
    # break

Farewell Yellow Brick Road
The Eras Tour †
÷ Tour
U2 360° Tour
Music of the Spheres World Tour †
Love On Tour
Not in This Lifetime... Tour
Renaissance World Tour
A Bigger Bang Tour
No Filter Tour
A Head Full of Dreams Tour
The Wall
Black Ice World Tour
WorldWired Tour
+–=÷× Tour †
Sticky & Sweet Tour
Beautiful Trauma World Tour
The Joshua Tree Tours 2017 and 2019
Vertigo Tour
24K Magic World Tour
A Momentary Lapse of Reason Tour
Bad
Steel Wheels Tour
Born in the U.S.A. Tour
Glass Spider Tour
Victory Tour
Invisible Touch Tour
Break Every Rule World Tour
The Joshua Tree Tour
The Rolling Stones American Tour 1981
Voodoo Lounge Tour
Bridges to Babylon Tour
The Division Bell Tour
PopMart Tour
HIStory World Tour
Zoo TV Tour
Let's Talk About Love World Tour
Wildest Dreams Tour
The Garth Brooks World Tour
The Police Reunion Tour
Licks Tour
Taking Chances World Tour
Living Proof: The Farewell Tour
Magic Tour
World's Hottest Tour
After Hours til Dawn Tour †
Summer Carnival †
Global Stadium Tour 

I had to remove row with Hell Freezes Over Tour by Eagles because of no information about tour on wiki page.

In [17]:
df_tours

Unnamed: 0,Country,City,Venue,Date,Artist,Tour title
0,United States,Allentown,PPL Center,8 September 2018,Elton John,Farewell Yellow Brick Road
1,United States,Philadelphia,Wells Fargo Center,11 September 2018,Elton John,Farewell Yellow Brick Road
2,United States,Philadelphia,Wells Fargo Center,12 September 2018,Elton John,Farewell Yellow Brick Road
3,United States,Buffalo,KeyBank Center,15 September 2018,Elton John,Farewell Yellow Brick Road
4,United States,University Park,Bryce Jordan Center,16 September 2018,Elton John,Farewell Yellow Brick Road
...,...,...,...,...,...,...
67,United States,Charlotte,PNC Music Pavilion,28 September 2014,One Direction,Where We Are Tour
68,United States,Atlanta,Georgia Dome,1 October 2014,One Direction,Where We Are Tour
69,United States,Tampa,Raymond James Stadium,3 October 2014,One Direction,Where We Are Tour
70,United States,Miami Gardens,Sun Life Stadium,5 October 2014,One Direction,Where We Are Tour


Now I have info about 7697 events, but it includes also cancelled ones.

In [18]:
df_tours["Venue"].isna().sum()

4

# Gather data without cancelled shows

In [45]:
BASE_URL = "https://en.wikipedia.org"
for index, row in df.iterrows():
    url = BASE_URL + row["Tour link"]
    response = requests.get(url=url)
    print(response.status_code)

    soup = BeautifulSoup(response.content, 'html.parser')
    print(f"{row['Tour title']}")
    # print(soup.body.findAll("table"))
    for t in soup.body.findAll("table"):
        # print(t.find_previous_sibling(string="Cancelled shows"))
        print("Cancelled shows" in str(t.find_previous_sibling()))

        if "Cancelled shows" in str(t.find_previous_sibling()):
            t.clear()


200
Farewell Yellow Brick Road
False
False
False
True
False
False
200
The Eras Tour †
False
False
False
False
False
False
200
÷ Tour
False
False
True
False
200
U2 360° Tour
False
False
False
200
Music of the Spheres World Tour †
False
False
False
False
False
False
True
False
False
200
Love On Tour
False
False
True
False
200
Not in This Lifetime... Tour
False
False
False
False
False
200
Renaissance World Tour
False
False
False
False
False
200
A Bigger Bang Tour
False
False
False
False
200
No Filter Tour
False
False
False
False
False
False
200
A Head Full of Dreams Tour
False
False
False
False
True
False
False
200
The Wall
False
False
False
False
False
False
False
200
Black Ice World Tour
False
False
False
False
200
WorldWired Tour
False
False
False
False
200
+–=÷× Tour †
False
False
False
False
True
False
False
200
Sticky & Sweet Tour
False
False
False
False
False
False
False
False
200
Beautiful Trauma World Tour
False
False
True
False
200
The Joshua Tree Tours 2017 and 2019
False
False

I'm gonna combine this piece of code with code that is parsing entire tables

In [47]:
BASE_URL = "https://en.wikipedia.org"
df_tours = pd.DataFrame()

for index, row in df.iterrows():
    url = BASE_URL + row["Tour link"]
    response = requests.get(url=url)

    soup = BeautifulSoup(response.content, 'html.parser')
    for t in soup.body.findAll("table"):
        if "Cancelled shows" in str(t.find_previous_sibling()):
            t.clear()

    tables = pd.read_html(str(soup))
    df_tour_tmp = pd.DataFrame()
    for t in tables:
        if "City" in t.columns and "Country" in t.columns:
            t.columns = t.columns.get_level_values(0)
            for i, _ in enumerate(t.columns):
                if t.columns[i].startswith("Date"):
                    t = t.rename(columns={t.columns[i] : "Date"})
            df_tour_tmp = pd.concat([df_tour_tmp, t[["Country", "City", "Venue", "Date"]]])
    df_tour_tmp = df_tour_tmp.drop(df_tour_tmp[df_tour_tmp["Date"] == df_tour_tmp["City"]].index)
    df_tour_tmp["Artist"] = row["Artist"]
    df_tour_tmp["Tour title"] = row["Tour title"]
    
    df_tours = pd.concat([df_tours, df_tour_tmp])
    print(f"{row['Tour title']}")

Farewell Yellow Brick Road
Farewell Yellow Brick Road
The Eras Tour †
The Eras Tour †
÷ Tour
÷ Tour
U2 360° Tour
U2 360° Tour
Music of the Spheres World Tour †
Music of the Spheres World Tour †
Love On Tour
Love On Tour
Not in This Lifetime... Tour
Not in This Lifetime... Tour
Renaissance World Tour
Renaissance World Tour
A Bigger Bang Tour
A Bigger Bang Tour
No Filter Tour
No Filter Tour
A Head Full of Dreams Tour
A Head Full of Dreams Tour
The Wall
The Wall
Black Ice World Tour
Black Ice World Tour
WorldWired Tour
WorldWired Tour
+–=÷× Tour †
+–=÷× Tour †
Sticky & Sweet Tour
Sticky & Sweet Tour
Beautiful Trauma World Tour
Beautiful Trauma World Tour
The Joshua Tree Tours 2017 and 2019
The Joshua Tree Tours 2017 and 2019
Vertigo Tour
Vertigo Tour
24K Magic World Tour
24K Magic World Tour
A Momentary Lapse of Reason Tour
A Momentary Lapse of Reason Tour
Bad
Bad
Steel Wheels Tour
Steel Wheels Tour
Born in the U.S.A. Tour
Born in the U.S.A. Tour
Glass Spider Tour
Glass Spider Tour
Victor

In [48]:
df_tours.shape

(7600, 6)

Almost hundred of shows were rejected

In [54]:
df_tours.groupby("Tour title")["Date"].count()

Tour title
+–=÷× Tour †                               114
24K Magic World Tour                       202
A Bigger Bang Tour                         146
A Head Full of Dreams Tour                 121
A Momentary Lapse of Reason Tour           199
After Hours til Dawn Tour †                 75
Bad                                        137
Beautiful Trauma World Tour                159
Black Ice World Tour                       177
Born in the U.S.A. Tour                    156
Break Every Rule World Tour                222
Bridges to Babylon Tour                    102
Dangerous World Tour                        69
Farewell Yellow Brick Road                 330
Glass Spider Tour                           96
Global Stadium Tour †                       97
HIStory World Tour                          82
Invisible Touch Tour                       115
Let's Talk About Love World Tour            87
Licks Tour                                 117
Living Proof: The Farewell Tour            324
Lo

# Geolocation of cities

In [19]:
df_tours = df_tours.reset_index(drop=True)

In [20]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_user_agent_3")
location = geolocator.geocode(f"{df_tours.iloc[0, 2]}, {df_tours.iloc[0, 1]}, {df_tours.iloc[0, 0]}")

In [21]:
location.latitude

40.602651449999996

In [22]:
print(location)

PPL Center, 701, West Hamilton Street, Center City, Allentown, Lehigh County, Pennsylvania, 18101, United States


In [33]:
f"{df_tours.iloc[0, 1]}, {df_tours.iloc[0, 0]}"

'Allentown, United States'

In [25]:
# for index, row in df_tours.iterrows():
#     location = geolocator.geocode(f"{df_tours.iloc[0, 2]}, {df_tours.iloc[0, 1]}, {df_tours.iloc[0, 0]}")
#     df.loc[index, "Latitude"] = location.latitude
#     df.loc[index, "Longtitude"] = location.longitude

Many venues are recurring, so it may be good idea to cache already geolocated places in dictionary object

I don't really need exact coordinates, city will be enough.

In [23]:
cached_locations = {}

for index, row in df_tours.iterrows():
    venue = f"{df_tours.iloc[index, 1]}, {df_tours.iloc[index, 0]}"
    if venue in cached_locations:
        df_tours.loc[index, "Latitude"] = cached_locations[venue][0]
        df_tours.loc[index, "Longtitude"] = cached_locations[venue][1]
    else:
        location = geolocator.geocode(venue, timeout=10)
        if not location:
            print(venue)
            continue
        df_tours.loc[index, "Latitude"] = location.latitude
        df_tours.loc[index, "Longtitude"] = location.longitude
        cached_locations[venue] = [location.latitude, location.longitude]


Langraaf, Netherlands
Hanover, West Germany
Moscow, Soviet Union
Moscow, Soviet Union
Moscow, Soviet Union
Moscow, Soviet Union
Moscow, Soviet Union
Cologne, West Germany
Cologne, West Germany
Hockenheim, West Germany
Würzburg, West Germany
Hanover, West Germany
Hanover, West Germany
Cologne, West Germany
Cologne, West Germany
East Berlin, East Germany
East Berlin, East Germany
Prague, Czechoslovakia
Nürburgring, West Germany
Slane, Ireland, Republic of
Nuremberg, West Germany
Nuremberg, West Germany
Hanover, West Germany
Hanover, West Germany
Hanover, West Germany
Mursfreesboro, United States
Cologne, West Germany
Taipei City, Taiwan[21][22]
Kaohsiung, Taiwan[21][22]
Taipei City, Taiwan[21][22]


Now I know, that 30 places aren't geolocalised properly:
    - Some had additional characters attached to country name (Taiwan[21][22])
    - Some had country that is no longer existing (like Moscow, Soviet Union; Hanover - West Germany)
    - Some had misspelings (Langraaf -> Landgraaf; Mursfreesboro->Murfreesboro)

In [28]:
df_tours.loc[df_tours["Country"] == "West Germany", "Country"] = "Germany"
df_tours.loc[df_tours["Country"] == "Ireland, Republic of", "Country"] = "Ireland"
df_tours.loc[df_tours["Country"] == "Czechoslovakia", "Country"] = "Czech Republic"
df_tours.loc[df_tours["Country"] == "Soviet Union", "Country"] = "Russia"
df_tours.loc[df_tours["Country"] == "East Germany", "Country"] = "Germany"


df_tours.loc[df_tours["Country"] == "Taiwan[21][22]", "Country"] = "Taiwan"

df_tours.loc[df_tours["City"] == "Langraaf", "City"] = "Landgraaf"
df_tours.loc[df_tours["City"] == "Mursfreesboro", "City"] = "Murfreesboro"
df_tours.loc[df_tours["City"] == "East Berlin", "City"] = "Berlin"


In [29]:
for index, row in df_tours.iterrows():
    venue = f"{df_tours.iloc[index, 1]}, {df_tours.iloc[index, 0]}"
    if venue in cached_locations:
        df_tours.loc[index, "Latitude"] = cached_locations[venue][0]
        df_tours.loc[index, "Longtitude"] = cached_locations[venue][1]
    else:
        location = geolocator.geocode(venue, timeout=10)
        if not location:
            print(venue)
            continue
        df_tours.loc[index, "Latitude"] = location.latitude
        df_tours.loc[index, "Longtitude"] = location.longitude
        cached_locations[venue] = [location.latitude, location.longitude]

In [31]:
df_tours.to_csv("prepared_data.csv", encoding="utf-8")