In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

In [2]:
# Download the Wikipedia page that lists the World Cup penalty shoot-outs and
# parse the response body into a BeautifulSoup tree with the lxml parser.
url = "https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_penalty_shoot-outs"
r = requests.get(url, timeout=30)  # bound the wait so the cell cannot hang forever
r.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
soup = bs(r.content, "lxml")

In [3]:
# Collect every <table> element found in the parsed page.
t = soup.find_all('table')

In [4]:
# Pick the second table on the page (index 1); the cells below read the
# shoot-out data from it.
# NOTE(review): the positional index depends on the page layout at download
# time -- confirm it still points at the shoot-out table if the page changes.
table = t[1]

In [5]:
# Every <tr> of the selected table, header rows included.
rows = table.find_all('tr')

In [6]:
# Accumulator mapping a column name to its list of per-game values; it is
# filled cell by cell below and finally handed to pd.DataFrame.
d = {}

In [7]:
# Collect the shoot-out winner of every game: the first <td> of each data row.
# Rows without any <td> (the header rows) are skipped.
d['winners'] = []
for row in rows:
    cells = row.find_all('td')
    if cells:
        # Raw text looks like '\xa0West Germany\n'. strip() removes all leading
        # and trailing whitespace (non-breaking spaces included) however many
        # characters there are, whereas a fixed [1:-1] slice leaves stray
        # characters behind when a cell has more than one leading space.
        d['winners'].append(cells[0].text.strip())

print(d['winners'])

['West Germany', 'France', 'West Germany', 'Belgium', 'Republic of Ireland', 'Argentina', 'Argentina', 'West Germany', 'Bulgaria', 'Sweden', 'Brazil', 'Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia']


In [8]:
# Collect the shoot-out loser of every game: the third <td> of each data row.
# Rows without any <td> (the header rows) are skipped.
# The raw text looks like '\xa0West Germany\n', so one character is trimmed
# from each end; an entry with extra leading whitespace keeps the surplus.
d['losers'] = [
    tds[2].text[1:-1]
    for tds in (tr.find_all('td') for tr in rows)
    if tds
]

print(d['losers'])

['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', '\xa0Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia']


In [9]:
# Strip leftover whitespace from every loser name (this turns the stray
# '\xa0Switzerland' entry into 'Switzerland'). Cleaning all entries instead of
# looking one hard-coded value up with .index() keeps the cell safe to re-run:
# the lookup raised ValueError once the entry had already been fixed.
d['losers'] = [name.strip() for name in d['losers']]
print(d['losers'])

['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', 'Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia']


In [10]:
# Number the games 1..N in table order, one ID per entry in d['winners'].
d['gameID'] = list(range(1, len(d['winners'])+1))

In [11]:
# A game only reaches penalties after a draw, so a single number describes the
# goals each side scored beforehand. It is taken as the first character of the
# second <td> and kept as a string. Rows without <td> cells are skipped.
d['prePenaltiesGoalsEach'] = [
    tds[1].text[0]
    for tds in (tr.find_all('td') for tr in rows)
    if tds
]

print(d['prePenaltiesGoalsEach'])

['3', '1', '0', '1', '0', '0', '1', '1', '1', '2', '0', '2', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1', '2']


In [12]:
# Dump everything collected so far as a quick sanity check.
print(d)

{'winners': ['West Germany', 'France', 'West Germany', 'Belgium', 'Republic of Ireland', 'Argentina', 'Argentina', 'West Germany', 'Bulgaria', 'Sweden', 'Brazil', 'Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia'], 'losers': ['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', 'Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia'], 'gameID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'prePenaltiesGoalsEach': ['3', '1', '0', '1', '0', '0', '1', '1', '1', '2', '0', '2', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1',

In [25]:
def penKickerAndResult(cell, splitToKeep):
    """Turn the fragments of one team's penalty cell into (kicker, result) pairs.

    Parameters
    ----------
    cell : list of str
        HTML fragments of a table cell. The first fragment is ignored; every
        following fragment is parsed as one penalty kick.
    splitToKeep : int
        Index of the piece to keep after splitting a fragment on 'title="',
        i.e. which title attribute in the fragment holds the kicker's name.

    Returns
    -------
    list of (str, int)
        One tuple per kick: the kicker's name and 1 when the fragment contains
        the text 'Penalty scored', otherwise 0.
    """
    def parseKick(fragment):
        titleValue = fragment.split('title="')[splitToKeep].partition('"')[0]
        # Some titles read like "Willian (footballer, born 1988)"; keep only
        # the part in front of the " (" disambiguation suffix.
        name = titleValue.partition(' (')[0]
        scored = 1 if 'Penalty scored' in fragment else 0
        return (name, scored)

    return [parseKick(fragment) for fragment in cell[1:]]

In [26]:
# For every game, parse the winning side's kicks (column index 7) into a list
# of (kicker, result) tuples.
d['winnerPenalties'] = []
for tr in rows:
    tds = tr.find_all('td')
    if not tds:
        # header rows have no <td> cells, so indexing/splitting them would fail
        continue
    fragments = str(tds[7]).split('<span class="nowrap">')
    d['winnerPenalties'].append(penKickerAndResult(fragments, 1))

d['winnerPenalties']

[[('Manfred Kaltz', 1),
  ('Paul Breitner', 1),
  ('Uli Stielike', 0),
  ('Pierre Littbarski', 1),
  ('Karl-Heinz Rummenigge', 1),
  ('Horst Hrubesch', 1)],
 [('Yannick Stopyra', 1),
  ('Manuel Amoros', 1),
  ('Bruno Bellone', 1),
  ('Michel Platini', 0),
  ('Luis Fernández', 1)],
 [('Klaus Allofs', 1),
  ('Andreas Brehme', 1),
  ('Lothar Matthäus', 1),
  ('Pierre Littbarski', 1)],
 [('Nico Claesen', 1),
  ('Enzo Scifo', 1),
  ('Hugo Broos', 1),
  ('Patrick Vervoort', 1),
  ('Leo Van der Elst', 1)],
 [('Kevin Sheedy', 1),
  ('Ray Houghton', 1),
  ('Andy Townsend', 1),
  ('Tony Cascarino', 1),
  ("David O'Leary", 1)],
 [('José Serrizuela', 1),
  ('Jorge Burruchaga', 1),
  ('Diego Maradona', 0),
  ('Pedro Troglio', 0),
  ('Gustavo Dezotti', 1)],
 [('José Serrizuela', 1),
  ('Jorge Burruchaga', 1),
  ('Julio Olarticoechea', 1),
  ('Diego Maradona', 1)],
 [('Andreas Brehme', 1),
  ('Lothar Matthäus', 1),
  ('Karl-Heinz Riedle', 1),
  ('Olaf Thon', 1)],
 [('Krasimir Balakov', 0),
  ('Boncho

In [27]:
# For every game, parse the losing side's kicks (column index 8) into a list
# of (kicker, result) tuples. Here the kicker's name sits in the second
# title attribute of each fragment, hence splitToKeep=2.
d['loserPenalties'] = []
for tr in rows:
    tds = tr.find_all('td')
    if not tds:
        # header rows have no <td> cells, so indexing/splitting them would fail
        continue
    fragments = str(tds[8]).split('<span class="nowrap">')
    d['loserPenalties'].append(penKickerAndResult(fragments, 2))

d['loserPenalties']

[[('Alain Giresse', 1),
  ('Manuel Amoros', 1),
  ('Dominique Rocheteau', 1),
  ('Didier Six', 0),
  ('Michel Platini', 1),
  ('Maxime Bossis', 0)],
 [('Sócrates', 0),
  ('Alemão', 1),
  ('Zico', 1),
  ('Branco', 1),
  ('Júlio César', 0)],
 [('Manuel Negrete Arias', 1), ('Fernando Quirarte', 0), ('Raúl Servín', 0)],
 [('Juan Antonio Señor', 1),
  ('Eloy Olaya', 0),
  ('Chendo', 1),
  ('Emilio Butragueño', 1),
  ('Víctor Muñoz', 1)],
 [('Gheorghe Hagi', 1),
  ('Dănuț Lupu', 1),
  ('Iosif Rotariu', 1),
  ('Ioan Lupescu', 1),
  ('Daniel Timofte', 0)],
 [('Dragan Stojković', 0),
  ('Robert Prosinečki', 1),
  ('Dejan Savićević', 1),
  ('Dragoljub Brnović', 0),
  ('Faruk Hadžibegić', 0)],
 [('Franco Baresi', 1),
  ('Roberto Baggio', 1),
  ('Luigi De Agostini', 1),
  ('Roberto Donadoni', 0),
  ('Aldo Serena', 0)],
 [('Gary Lineker', 1),
  ('Peter Beardsley', 1),
  ('David Platt', 1),
  ('Stuart Pearce', 0),
  ('Chris Waddle', 0)],
 [('Alberto García Aspe', 0),
  ('Marcelino Bernal', 0),
  ('J

In [28]:
# Flag whether the winning side kicked first. The page marks the first kicker
# with a grey background, so the test is simply whether that colour's style
# text appears in the winner's penalty cell (column index 7).
# Rows without <td> cells (the header rows) are skipped.
firstKickerStyle = 'background-color: #C3C3C3'

d['winnerTookFirstPenalty'] = [
    int(firstKickerStyle in str(tds[7]))
    for tds in (tr.find_all('td') for tr in rows)
    if tds
]

print(d['winnerTookFirstPenalty'])

[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


In [29]:
# Peek at the raw HTML of column index 10 for one sample row (rows[3]) to see
# how a merged year/host cell is encoded, e.g. <td rowspan="3">...</td>.
# (The previous for-loop ignored its loop variable and broke out after the
# first pass, so a direct lookup prints exactly the same thing.)
sampleCells = rows[3].find_all('td')
print(str(sampleCells[10]))

<td rowspan="3"><a href="/wiki/1986_FIFA_World_Cup" title="1986 FIFA World Cup">1986, Mexico</a>
</td>


In [30]:
# Build d['year'] and d['hostCountry'] from column index 10, whose text looks
# like '1986, Mexico\n'. One such cell may span several rows (rowspan), in
# which case its value is repeated for every game it covers and those rows are
# skipped, because they do not carry a year/host cell of their own.
d['year'] = []
d['hostCountry'] = []

i = 2  # skip rows 0 and 1 as they're headers
while i < len(rows):
    cell = rows[i].find_all('td')[10]
    yearAndLocationSplit = cell.text.strip().split(', ')
    year = int(yearAndLocationSplit[0])
    host = yearAndLocationSplit[1]

    # Read the rowspan attribute directly instead of slicing the tag's HTML on
    # double quotes, which only worked while rowspan was the first attribute.
    # A missing attribute means the cell covers a single row; max() guards
    # against a non-positive value stalling the loop.
    span = max(1, int(cell.get('rowspan', 1)))

    d['year'].extend([year] * span)
    d['hostCountry'].extend([host] * span)
    i += span

print(d['year'])
print(d['hostCountry'])

[1982, 1986, 1986, 1986, 1990, 1990, 1990, 1990, 1994, 1994, 1994, 1998, 1998, 1998, 2002, 2002, 2006, 2006, 2006, 2006, 2010, 2010, 2014, 2014, 2014, 2014, 2018, 2018, 2018, 2018]
['Spain', 'Mexico', 'Mexico', 'Mexico', 'Italy', 'Italy', 'Italy', 'Italy', 'USA', 'USA', 'USA', 'France', 'France', 'France', 'Korea/Japan', 'Korea/Japan', 'Germany', 'Germany', 'Germany', 'Germany', 'South Africa', 'South Africa', 'Brazil', 'Brazil', 'Brazil', 'Brazil', 'Russia', 'Russia', 'Russia', 'Russia']


In [31]:
# Build d['stage']: one stage label per game. The loop walks the data rows,
# works out which <td> holds the stage for the current row, and repeats the
# label for every row that a merged (rowspan) stage cell covers.
d['stage'] = []

i = 2 #skip rows 0 and 1 as they're headers
loopAgain = True
while loopAgain:
    # The <td> at index 10 is inspected first; its content decides where the
    # stage cell is for this row.
    previousColCell = rows[i].find_all('td')[10]
    
    if '#' in str(previousColCell):
        # Index 10 is itself used as the stage cell.
        # NOTE(review): presumably rows covered by a year/host rowspan lack
        # that cell, which shifts the stage cell to index 10, and stage links
        # contain a '#fragment' -- confirm against the page HTML.
        cell = previousColCell
        stageVar = previousColCell.text #overwritten below before it is used
    elif ('style="display:none;"' in str(previousColCell) or 
          'style="white-space:nowrap"' in str(previousColCell)):
        #work around for dealing with game8 where there's a merged
        #cell for stage, but it doesn't start on the same row as the 
        #merged cell for the year/location.
        
        #leave cell as value from last loop
        #NOTE(review): cell is not assigned in this branch, so on a first
        #iteration it would be whatever an earlier notebook cell left behind.
        pass
    else:
        # Otherwise the stage cell is the <td> right after index 10.
        cell = rows[i].find_all('td')[11]
        
    stageVar = cell.text[2:-1] #[2:-1] as text starts as '\xa0 Semi-finals\n'
    d['stage'].append(stageVar)
    
    i += 1
    
    #deal with cells that define how many rows they span
    if "rowspan" in str(cell):
        # Takes the first double-quoted value in the cell's HTML as the span,
        # i.e. this relies on rowspan being the tag's first attribute.
        extraloops = int(str(cell).split('"')[1]) - 1 #-1 as already done one loop above
        for j in range(extraloops):
            d['stage'].append(stageVar)
            i += 1
    
    # Stop once every row has been consumed.
    if i >= len(rows):
        loopAgain = False
        
d['stage']

['Semi-finals',
 'Quarter-finals',
 'Quarter-finals',
 'Quarter-finals',
 'Second round',
 'Quarter-finals',
 'Quarter-finals',
 'Quarter-finals',
 'Second round',
 'Quarter-finals',
 'Final',
 'Second round',
 'Quarter-finals',
 'Semi-finals',
 'Second round',
 'Quarter-finals',
 'Second round',
 'Quarter-finals',
 'Quarter-finals',
 'Final',
 'Second round',
 'Quarter-finals',
 'Second round',
 'Second round',
 'Quarter-finals',
 'Semi-finals',
 'Second round',
 'Second round',
 'Second round',
 'Quarter-finals']

In [65]:
# Assemble the collected columns into one DataFrame (one row per game); every
# list in d must have the same length for this to succeed.
df = pd.DataFrame(d)
# Display the frame as the cell's output.
df

Unnamed: 0,gameID,hostCountry,loserPenalties,losers,prePenaltiesGoalsEach,stage,winnerPenalties,winnerTookFirstPenalty,winners,year
0,1,Spain,"[(Alain Giresse, 1), (Manuel Amoros, 1), (Domi...",France,3,Semi-finals,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",0,West Germany,1982
1,2,Mexico,"[(Sócrates, 0), (Alemão, 1), (Zico, 1), (Branc...",Brazil,1,Quarter-finals,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",0,France,1986
2,3,Mexico,"[(Manuel Negrete Arias, 1), (Fernando Quirarte...",Mexico,0,Quarter-finals,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",1,West Germany,1986
3,4,Mexico,"[(Juan Antonio Señor, 1), (Eloy Olaya, 0), (Ch...",Spain,1,Quarter-finals,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",0,Belgium,1986
4,5,Italy,"[(Gheorghe Hagi, 1), (Dănuț Lupu, 1), (Iosif R...",Romania,0,Second round,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",0,Republic of Ireland,1990
5,6,Italy,"[(Dragan Stojković, 0), (Robert Prosinečki, 1)...",Yugoslavia,0,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",1,Argentina,1990
6,7,Italy,"[(Franco Baresi, 1), (Roberto Baggio, 1), (Lui...",Italy,1,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",0,Argentina,1990
7,8,Italy,"[(Gary Lineker, 1), (Peter Beardsley, 1), (Dav...",England,1,Quarter-finals,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",0,West Germany,1990
8,9,USA,"[(Alberto García Aspe, 0), (Marcelino Bernal, ...",Mexico,1,Second round,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",0,Bulgaria,1994
9,10,USA,"[(Florin Răducioiu, 1), (Gheorghe Hagi, 1), (I...",Romania,2,Quarter-finals,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",1,Sweden,1994
