
## Scrape and Analyze Stadium Data from stadiumguide.com website
Create a Beautiful Soup object and use it to scrape the data in the HTML table

In [1]:
# importing required libraries

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup
from splinter import Browser

In [2]:
# Launch the Chrome-driven splinter browser that every scraping cell below reuses.
browser = Browser(driver_name='chrome')

#### The challenge in retrieving the trivia for each stadium was that, in the stadium table, the URL of the page describing a stadium (reached by clicking the stadium name) does not use the stadium name consistently with the name shown in the table. E.g. the stadium name shown on www.stadiumguide.com is `Estadio ABANCA-Riazor`, but the description URL for that stadium, https://www.stadiumguide.com/riazor/, contains only `riazor`. So we collected the href links from all the `a` tags in the stadium table of each country page, and then used those href URLs to access each stadium page and get the trivia.

In [3]:
# Looping through each country to get the stadium page links and stadiums list.
# hrefs_list[i] and stadiums[i] always describe the same anchor, so the two
# lists can be zipped into a DataFrame later.
countries = ["england", "france", "germany", "italy", "spain"]
base_url = "https://www.stadiumguide.com/present/"

hrefs_list = []
stadiums = []

for country in countries:
    final_url = f'{base_url}{country}'
    browser.visit(final_url)
    # Used only as a short wait for the page to render; the result is ignored.
    browser.is_element_present_by_css('div.site', wait_time=1)
    html = browser.html
    try:
        stadium_soup = soup(html, 'html.parser')
        stadium_table = stadium_soup.find('tbody')
        if stadium_table is None:
            # Report the missing table explicitly instead of relying on an
            # opaque AttributeError from calling find_all on None.
            print(f'{country}: no stadium table found at {final_url}')
            continue
        # href=True skips anchors without a link, so a single link-less <a>
        # no longer raises KeyError and drops the rest of the country's table.
        for tag in stadium_table.find_all('a', href=True):
            hrefs_list.append(tag['href'])
            stadiums.append(tag.text)
    except Exception as e:
        # Best-effort scrape: say which country failed and keep going.
        print(f'{country}: {e}')


    


In [4]:
# Show how many links and stadium names were collected, one count per line.
print(len(hrefs_list), len(stadiums), sep='\n')

236
236


In [8]:
# Display the scraped stadium names for a visual check.
stadiums

['Oakwell Stadium',
 'Villa Park',
 "St Andrew's Stadium",
 'Ewood Park',
 'Bloomfield Road',
 'University of Bolton Stadium',
 'Vitality Stadium',
 'Coral Windows Stadium',
 'The Amex',
 'Ashton Gate Stadium',
 'Memorial Stadium',
 'Turf Moor',
 'Gigg Lane',
 'Coventry Building Society Arena',
 'Victoria Road',
 'The iPro Stadium',
 'Keepmoat Stadium',
 'Highbury Stadium',
 'Priestfield Stadium',
 "John Smith's Stadium",
 'MKM Stadium',
 'Portman Road',
 'Elland Road',
 'King Power Stadium',
 'Goodison Park',
 'Anfield',
 'Emirates Stadium',
 'Brentford Community Stadium',
 'The Valley',
 'Stamford Bridge',
 'Selhurst Park',
 'Craven Cottage',
 'The Matchroom Stadium',
 'The Den',
 'Loftus Road',
 'Tottenham Hotspur Stadium',
 'London Stadium',
 'Wembley Stadium',
 'Kenilworth Road',
 'Etihad Stadium',
 'Old Trafford',
 'Riverside Stadium',
 'St James Park',
 'Carrow Road',
 'City Ground',
 'Meadow Lane',
 'The Kassam Stadium',
 'ABAX Stadium',
 'Fratton Park',
 'Deepdale',
 'Madejski

In [11]:
# Pair every stadium name with the link to its description page.
link_columns = {
    'Stadium': stadiums,
    'Trivia Page link': hrefs_list,
}
stadium_links_df = pd.DataFrame(link_columns)

# Preview ten random rows.
stadium_links_df.sample(10)

Unnamed: 0,Stadium,Trivia Page link
187,Stadio Marcantonio Bentegodi,https://www.stadiumguide.com/bentegodi/
98,Stade Charlety,https://www.stadiumguide.com/stade-charlety/
225,Estadio Benito Villamarin,https://www.stadiumguide.com/benitovillamarin/
142,Grunwalder Stadion,https://www.stadiumguide.com/grunwalder-stadion/
216,Estadio Carlos Tartiere,https://www.stadiumguide.com/estadio-carlos-ta...
182,Allianz Stadium,https://www.stadiumguide.com/juventusstadium/
51,Crown Oil Arena,https://www.stadiumguide.com/crown-oil-arena/
121,Rudolf-Harbig-Stadion,https://www.stadiumguide.com/glucksgasstadion/
50,Madejski Stadium,https://www.stadiumguide.com/madejski/
150,Volkswagen Arena,https://www.stadiumguide.com/volkswagenarena/


In [12]:
# Scrape the trivia paragraph for the first 50 stadium pages.
# trivia_list is filled in the same order as hrefs_list, one entry per link.
trivia_list = []
for link in hrefs_list[:50]:
    browser.visit(link)
    # Used only as a short wait for the page to render; the result is ignored.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> of the entry-content section is taken as the trivia.
        # NOTE(review): assumes every stadium page has the same layout - confirm.
        trivia = trivias[2].text
    except Exception as e:
        # Record a missing value instead of skipping the page, so trivia_list
        # stays aligned with hrefs_list when it becomes a DataFrame column.
        print(f'{link}: {e}')
        trivia = None
    trivia_list.append(trivia)

In [13]:
# Number of trivia entries collected so far.
len(trivia_list)

50

In [14]:
# Scrape the trivia paragraph for stadium pages 50-99 and extend trivia_list.
for link in hrefs_list[50:100]:
    browser.visit(link)
    # Used only as a short wait for the page to render; the result is ignored.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> of the entry-content section is taken as the trivia.
        # NOTE(review): assumes every stadium page has the same layout - confirm.
        trivia = trivias[2].text
    except Exception as e:
        # Record a missing value instead of skipping the page, so trivia_list
        # stays aligned with hrefs_list when it becomes a DataFrame column.
        print(f'{link}: {e}')
        trivia = None
    trivia_list.append(trivia)

In [15]:
# Number of trivia entries collected so far.
len(trivia_list)

100

In [16]:
# Scrape the trivia paragraph for stadium pages 100-149 and extend trivia_list.
for link in hrefs_list[100:150]:
    browser.visit(link)
    # Used only as a short wait for the page to render; the result is ignored.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> of the entry-content section is taken as the trivia.
        # NOTE(review): assumes every stadium page has the same layout - confirm.
        trivia = trivias[2].text
    except Exception as e:
        # Record a missing value instead of skipping the page, so trivia_list
        # stays aligned with hrefs_list when it becomes a DataFrame column.
        print(f'{link}: {e}')
        trivia = None
    trivia_list.append(trivia)

In [17]:
# Number of trivia entries collected so far.
len(trivia_list)

150

In [18]:
# Scrape the trivia paragraph for stadium pages 150-199 and extend trivia_list.
for link in hrefs_list[150:200]:
    browser.visit(link)
    # Used only as a short wait for the page to render; the result is ignored.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> of the entry-content section is taken as the trivia.
        # NOTE(review): assumes every stadium page has the same layout - confirm.
        trivia = trivias[2].text
    except Exception as e:
        # Record a missing value instead of skipping the page, so trivia_list
        # stays aligned with hrefs_list when it becomes a DataFrame column.
        print(f'{link}: {e}')
        trivia = None
    trivia_list.append(trivia)

In [19]:
# Number of trivia entries collected so far.
len(trivia_list)

200

In [20]:
# Scrape the trivia paragraph for the remaining stadium pages (index 200 onward).
for link in hrefs_list[200:]:
    browser.visit(link)
    # Used only as a short wait for the page to render; the result is ignored.
    browser.is_element_present_by_css('div.site', wait_time=1)
    trivia_html = browser.html
    try:
        trivia_soup = soup(trivia_html, 'html.parser')
        trivias = trivia_soup.find('section', class_='entry-content').find_all('p')
        # The third <p> of the entry-content section is taken as the trivia.
        # NOTE(review): assumes every stadium page has the same layout - confirm.
        trivia = trivias[2].text
    except Exception as e:
        # Record a missing value instead of skipping the page, so trivia_list
        # stays aligned with hrefs_list when it becomes a DataFrame column.
        print(f'{link}: {e}')
        trivia = None
    trivia_list.append(trivia)

# This is the last cell that drives the browser; close it so the Chrome
# process is not left running after the notebook finishes.
browser.quit()

In [21]:
# Number of trivia entries collected so far.
len(trivia_list)

236

In [22]:
# Attach the scraped trivia as a new column; trivia_list must have one entry
# per row of stadium_links_df, otherwise pandas raises a length error.
stadium_links_df = stadium_links_df.assign(Trivia=trivia_list)
stadium_links_df.head()

Unnamed: 0,Stadium,Trivia Page link,Trivia
0,Oakwell Stadium,https://www.stadiumguide.com/oakwell/,Oakwell Stadium was built in 1888. It got exte...
1,Villa Park,https://www.stadiumguide.com/villapark/,"In the late 19th century, Villa Park was part ..."
2,St Andrew's Stadium,https://www.stadiumguide.com/standrews/,St. Andrew’s Stadium opened on the 26th of Dec...
3,Ewood Park,https://www.stadiumguide.com/ewoodpark/,Ewood Park developed quickly in the early 20th...
4,Bloomfield Road,https://www.stadiumguide.com/bloomfieldroad/,Football was first played at the site of Bloom...


In [23]:
# Load the stadium table that the scraped links and trivia are merged into.
stadium_df = pd.read_csv(filepath_or_buffer='Data/Stadiums_All.csv')

In [26]:
# Left join on the stadium name: every row of stadium_df is kept, and rows with
# no matching scraped stadium get missing values in the link and trivia columns.
stadium_final_df = stadium_df.merge(
    stadium_links_df,
    how='left',
    on="Stadium",
)
stadium_final_df.sample(100)


Unnamed: 0,ID,City,Club,Stadium,Cap,Country,Trivia Page link,Trivia
117,184,Cologne,1. FC Koln,RheinEnergieStadion,50000,Germany,https://www.stadiumguide.com/rheinenergie/,RheinEnergieStadion was built to replace Colog...
116,183,Brunswick,Eintracht Braunschweig,Eintracht-Stadion,23325,Germany,https://www.stadiumguide.com/eintracht-stadion/,Eintracht-Stadion got built in 1923 when Eintr...
165,231,Frosinone,Frosinone,Stadio Benito Stirpe,16227,Italy,https://www.stadiumguide.com/stadio-benito-sti...,First plans for a new stadium for the city of ...
125,192,Frankfurt,Eintracht Frankfurt,Deutsche Bank Park,51500,Germany,https://www.stadiumguide.com/commerzbankarena/,"Deutsche Bank Park, then called Commerzbank-Ar..."
183,249,Sassuolo,MAPEI Stadium - Citta del Tricolore,,20084,Italy,,
...,...,...,...,...,...,...,...,...
170,236,Livorno,Livorno,Stadio Armando Picchi,19234,Italy,https://www.stadiumguide.com/armandopicchi/,Construction of Stadio Armando Picchi started ...
50,117,Reading,Reading,Madejski Stadium,24161,England,https://www.stadiumguide.com/madejski/,Madejski Stadium replaced Reading’s previous g...
44,111,Nottingham,Nottingham Forest,City Ground,30567,England,https://www.stadiumguide.com/cityground/,Nottingham Forest moved to the City Ground in ...
174,240,Monza,Monza,Stadio Brianteo,7499,Italy,https://www.stadiumguide.com/stadio-brianteo/,"However, from approval of the plans in 1980 an..."


In [27]:
# Write the merged table to disk without the DataFrame index column.
stadium_final_df.to_csv(path_or_buf='Data/stadiums_final.csv', index=False)