
## Scrape and Analyze Stadium Data from stadiumguide.com website
Create a Beautiful Soup object and use it to scrape the data in the HTML table

In [1]:
# importing required libraries

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup
from splinter import Browser

In [2]:
# Start a Chrome session through splinter; every later cell drives this one
# `browser` object to load pages and read their HTML.
browser = Browser('chrome')

#### The challenge in retrieving trivia for each stadium was that, in the stadium table, clicking a stadium name leads to a description page whose URL does not use the stadium name shown on the home page. E.g. the stadium name on the home page www.stadiumguide.com is `Estadio ABANCA-Riazor`, but the URL of its description page, https://www.stadiumguide.com/riazor/, contains only `riazor`. So we collected the href links from all the `a` tags in the stadium table, and then used those URLs to access each stadium page and get the trivia.

In [3]:
# Visit the listing page of every country and collect, for each stadium row,
# the link to its detail page and its display name. Both lists are filled
# pairwise, so stadiums[i] always belongs to hrefs_list[i].
countries = ["england", "france", "germany", "italy", "spain"]
base_url = "https://www.stadiumguide.com/present/"

hrefs_list = []
stadiums = []

for country in countries:
    final_url = f'{base_url}{country}'
    browser.visit(final_url)
    # Wait up to one second for the main page container before reading the HTML.
    browser.is_element_present_by_css('div.site', wait_time=1)
    html = browser.html

    stadium_soup = soup(html, 'html.parser')
    stadium_table = stadium_soup.find('tbody')
    if stadium_table is None:
        # Name the page that failed instead of printing an opaque
        # "'NoneType' object has no attribute ..." message.
        print(f'No stadium table found for {country} ({final_url})')
        continue

    # href=True skips anchors that carry no link, so a single malformed anchor
    # can no longer abort the remaining rows of this country.
    for tag in stadium_table.find_all('a', href=True):
        hrefs_list.append(tag['href'])
        stadiums.append(tag.text)


    


In [4]:
# One link per stadium name is expected, so both counts should match.
print(len(hrefs_list), len(stadiums), sep='\n')

236
236


In [8]:
# Display the scraped stadium names for a visual check.
stadiums

['Oakwell Stadium',
 'Villa Park',
 "St Andrew's Stadium",
 'Ewood Park',
 'Bloomfield Road',
 'University of Bolton Stadium',
 'Vitality Stadium',
 'Coral Windows Stadium',
 'The Amex',
 'Ashton Gate Stadium',
 'Memorial Stadium',
 'Turf Moor',
 'Gigg Lane',
 'Coventry Building Society Arena',
 'Victoria Road',
 'The iPro Stadium',
 'Keepmoat Stadium',
 'Highbury Stadium',
 'Priestfield Stadium',
 "John Smith's Stadium",
 'MKM Stadium',
 'Portman Road',
 'Elland Road',
 'King Power Stadium',
 'Goodison Park',
 'Anfield',
 'Emirates Stadium',
 'Brentford Community Stadium',
 'The Valley',
 'Stamford Bridge',
 'Selhurst Park',
 'Craven Cottage',
 'The Matchroom Stadium',
 'The Den',
 'Loftus Road',
 'Tottenham Hotspur Stadium',
 'London Stadium',
 'Wembley Stadium',
 'Kenilworth Road',
 'Etihad Stadium',
 'Old Trafford',
 'Riverside Stadium',
 'St James Park',
 'Carrow Road',
 'City Ground',
 'Meadow Lane',
 'The Kassam Stadium',
 'ABAX Stadium',
 'Fratton Park',
 'Deepdale',
 'Madejski

In [11]:
# One row per stadium: its display name and the URL of its detail page.
stadium_links_df = pd.DataFrame(
    data={'Stadium': stadiums, 'Trivia Page link': hrefs_list},
    columns=['Stadium', 'Trivia Page link'],
)
# Spot-check ten random rows.
stadium_links_df.sample(n=10)

Unnamed: 0,Stadium,Trivia Page link
187,Stadio Marcantonio Bentegodi,https://www.stadiumguide.com/bentegodi/
98,Stade Charlety,https://www.stadiumguide.com/stade-charlety/
225,Estadio Benito Villamarin,https://www.stadiumguide.com/benitovillamarin/
142,Grunwalder Stadion,https://www.stadiumguide.com/grunwalder-stadion/
216,Estadio Carlos Tartiere,https://www.stadiumguide.com/estadio-carlos-ta...
182,Allianz Stadium,https://www.stadiumguide.com/juventusstadium/
51,Crown Oil Arena,https://www.stadiumguide.com/crown-oil-arena/
121,Rudolf-Harbig-Stadion,https://www.stadiumguide.com/glucksgasstadion/
50,Madejski Stadium,https://www.stadiumguide.com/madejski/
150,Volkswagen Arena,https://www.stadiumguide.com/volkswagenarena/


In [12]:
# First batch: scrape the trivia paragraph from the first 50 stadium pages.
# Exactly one entry is stored per link. When a page cannot be parsed a None
# placeholder is stored, because skipping the page would leave trivia_list
# shorter than hrefs_list and no longer aligned with it by position.
trivia_list = []
for link in hrefs_list[:50]:
    browser.visit(link)
    # Wait up to one second for the main page container before reading the HTML.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    trivia = None
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> inside the entry-content section is used as the trivia text.
        trivia = trivias[2].text
    except Exception as e:
        # Report which page failed; the placeholder keeps the lists aligned.
        print(f'Could not extract trivia from {link}: {e}')
    trivia_list.append(trivia)

In [13]:
# Progress check: 50 entries are expected once the first batch of links is done.
len(trivia_list)

50

In [14]:
# Second batch: stadium pages for links 50-99. Results are gathered in a local
# list and written into trivia_list by position, so re-running this cell
# replaces the batch instead of appending duplicates.
batch_trivia = []
for link in hrefs_list[50:100]:
    browser.visit(link)
    # Wait up to one second for the main page container before reading the HTML.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    # A None placeholder keeps one entry per link even when parsing fails,
    # so trivia entries stay aligned with hrefs_list.
    trivia = None
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> inside the entry-content section is used as the trivia text.
        trivia = trivias[2].text
    except Exception as e:
        print(f'Could not extract trivia from {link}: {e}')
    batch_trivia.append(trivia)

# On a first run this extends the list; on a re-run it overwrites the same slots.
trivia_list[50:100] = batch_trivia

In [15]:
# Progress check: 100 entries are expected once links 50-99 are done.
len(trivia_list)

100

In [16]:
# Third batch: stadium pages for links 100-149. Results are gathered in a local
# list and written into trivia_list by position, so re-running this cell
# replaces the batch instead of appending duplicates.
batch_trivia = []
for link in hrefs_list[100:150]:
    browser.visit(link)
    # Wait up to one second for the main page container before reading the HTML.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    # A None placeholder keeps one entry per link even when parsing fails,
    # so trivia entries stay aligned with hrefs_list.
    trivia = None
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> inside the entry-content section is used as the trivia text.
        trivia = trivias[2].text
    except Exception as e:
        print(f'Could not extract trivia from {link}: {e}')
    batch_trivia.append(trivia)

# On a first run this extends the list; on a re-run it overwrites the same slots.
trivia_list[100:150] = batch_trivia

In [17]:
# Progress check: 150 entries are expected once links 100-149 are done.
len(trivia_list)

150

In [18]:
# Fourth batch: stadium pages for links 150-199. Results are gathered in a local
# list and written into trivia_list by position, so re-running this cell
# replaces the batch instead of appending duplicates.
batch_trivia = []
for link in hrefs_list[150:200]:
    browser.visit(link)
    # Wait up to one second for the main page container before reading the HTML.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    # A None placeholder keeps one entry per link even when parsing fails,
    # so trivia entries stay aligned with hrefs_list.
    trivia = None
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> inside the entry-content section is used as the trivia text.
        trivia = trivias[2].text
    except Exception as e:
        print(f'Could not extract trivia from {link}: {e}')
    batch_trivia.append(trivia)

# On a first run this extends the list; on a re-run it overwrites the same slots.
trivia_list[150:200] = batch_trivia

In [19]:
# Progress check: 200 entries are expected once links 150-199 are done.
len(trivia_list)

200

In [20]:
# Final batch: stadium pages for link 200 onwards. Results are gathered in a
# local list and written into trivia_list by position, so re-running this cell
# replaces the batch instead of appending duplicates.
batch_trivia = []
for link in hrefs_list[200:]:
    browser.visit(link)
    # Wait up to one second for the main page container before reading the HTML.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    # A None placeholder keeps one entry per link even when parsing fails,
    # so trivia entries stay aligned with hrefs_list.
    trivia = None
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> inside the entry-content section is used as the trivia text.
        trivia = trivias[2].text
    except Exception as e:
        print(f'Could not extract trivia from {link}: {e}')
    batch_trivia.append(trivia)

# On a first run this extends the list; on a re-run it overwrites the same slots.
trivia_list[200:] = batch_trivia

In [21]:
# Final check: the count should equal len(hrefs_list) (236 in the run shown
# above), otherwise the trivia cannot be attached to the links table row by row.
len(trivia_list)

236

In [22]:
# Attach the scraped trivia as a new column. trivia_list is matched to the rows
# by position, so it must hold exactly one entry per row.
stadium_links_df = stadium_links_df.assign(Trivia=trivia_list)
stadium_links_df.head(n=5)

Unnamed: 0,Stadium,Trivia Page link,Trivia
0,Oakwell Stadium,https://www.stadiumguide.com/oakwell/,Oakwell Stadium was built in 1888. It got exte...
1,Villa Park,https://www.stadiumguide.com/villapark/,"In the late 19th century, Villa Park was part ..."
2,St Andrew's Stadium,https://www.stadiumguide.com/standrews/,St. Andrew’s Stadium opened on the 26th of Dec...
3,Ewood Park,https://www.stadiumguide.com/ewoodpark/,Ewood Park developed quickly in the early 20th...
4,Bloomfield Road,https://www.stadiumguide.com/bloomfieldroad/,Football was first played at the site of Bloom...


In [29]:
# Load the per-stadium reference table (city, club, stadium, capacity, country,
# longitude, latitude) from the local CSV.
stadium_df = pd.read_csv(filepath_or_buffer='Data/Stadium_Lat_Lng_data.csv')
# NOTE(review): the preview below contains coordinates that do not look like
# England (e.g. Villa Park at longitude -87.98, latitude 41.89) — the geocoding
# in this file should be verified before the coordinates are used.
stadium_df.head(n=5)

Unnamed: 0,City,Club,Stadium,Cap,Country,Longitude,Latitude
0,Barnsley,Barnsley,Oakwell Stadium,23287,England,-1.467725,53.55214
1,Birmingham,Aston Villa,Villa Park,42785,England,-87.977837,41.889312
2,Birmingham,Birmingham City,St Andrew's Stadium,30009,England,-1.868041,52.475567
3,Blackburn,Blackburn Rovers,Ewood Park,31154,England,-2.489258,53.728609
4,Blackpool,Blackpool,Bloomfield Road,16007,England,-2.373736,51.365293


In [30]:
# Left-join the scraped links and trivia onto the reference table by stadium
# name, keeping every row of stadium_df.
# NOTE(review): if a stadium name occurs more than once in stadium_links_df,
# this join duplicates the matching rows — confirm the names are unique.
stadium_final_df = stadium_df.merge(stadium_links_df, on='Stadium', how='left')
stadium_final_df.sample(n=100)


Unnamed: 0,City,Club,Stadium,Cap,Country,Longitude,Latitude,Trivia Page link,Trivia
53,Scunthorpe,Scunthorpe United,Glanford Park,9088,England,-0.695326,53.586813,https://www.stadiumguide.com/glanford-park/,Glanford Park got built in the 1980s to replac...
15,Derby,Derby County,The iPro Stadium,33010,England,-54.715325,-61.116337,https://www.stadiumguide.com/prideparkstadium/,"The iPro Stadium, previously known as Pride Pa..."
61,Sunderland,Sunderland,Stadium of Light,49000,England,-1.388258,54.914395,https://www.stadiumguide.com/stadiumoflight/,The Stadium of Light was built in the mid 1990...
43,Norwich,Norwich City,Carrow Road,27010,England,1.310132,52.621983,https://www.stadiumguide.com/carrowroad/,"Norwich quickly found a new site though, and i..."
121,Dresden,Dynamo Dresden,Rudolf-Harbig-Stadion,32249,Germany,13.748038,51.040855,https://www.stadiumguide.com/glucksgasstadion/,"By the 2000s, however, the old stadium had sig..."
...,...,...,...,...,...,...,...,...,...
7,Bradford,Bradford City,Coral Windows Stadium,25136,England,-78.711268,26.536886,https://www.stadiumguide.com/valleyparade/,"Valley Parade was then still a modest stadium,..."
1,Birmingham,Aston Villa,Villa Park,42785,England,-87.977837,41.889312,https://www.stadiumguide.com/villapark/,"In the late 19th century, Villa Park was part ..."
233,Vitoria-Gasteiz,Deportivo Alaves,Estadio de Mendizorroza,19840,Spain,-9.260969,38.708812,https://www.stadiumguide.com/estadio-mendizorr...,Estadio de Mendizorroza (Mendizorrotza in Basq...
18,Gillingham,Gillingham FC,Priestfield Stadium,10500,England,0.560950,51.384211,https://www.stadiumguide.com/priestfield-stadium/,"Priestfield Stadium, officially MEMS Priestfie..."


In [31]:
# Persist the merged table; the DataFrame index is not written to the file.
stadium_final_df.to_csv(path_or_buf='Data/stadiums_final.csv', index=False)

In [32]:
# Close the browser session opened at the top of the notebook.
browser.quit()