In [157]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

In [158]:
# Download the Wikipedia page listing every World Cup penalty shoot-out and parse it.
url = "https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_penalty_shoot-outs"
# The timeout keeps this cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = bs(r.content, "lxml")

In [159]:
# Collect every <table> element found in the parsed page.
t = soup.find_all('table')

In [160]:
# Work with the second table on the page (index 1).
# NOTE(review): positional lookup - presumably the shoot-out list; it will silently pick a
# different table if the page layout changes, so verify when re-running.
table = t[1]

In [161]:
# Every <tr> of the chosen table; the loops further down skip rows that contain no <td> cells.
rows = table.find_all('tr')

In [162]:
# Accumulator dict: each later cell adds one key holding a list with one entry per game.
d = {}

In [163]:
# The winner's name is the text of the first <td> in each data row.
# The raw text looks like '\xa0West Germany\n'. strip() removes all surrounding whitespace
# (non-breaking spaces included) however many characters there are, whereas a fixed [1:-1]
# slice leaves junk behind whenever a cell has more than one leading character.
d['winners'] = []
for row in rows:
    cells = row.find_all('td')
    if cells:  # rows without any <td> (the header rows) are skipped
        d['winners'].append(cells[0].text.strip())

print(d['winners'])

['West Germany', 'France', 'West Germany', 'Belgium', 'Republic of Ireland', 'Argentina', 'Argentina', 'West Germany', 'Bulgaria', 'Sweden', 'Brazil', 'Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia']


In [164]:
# The loser's name is the text of the third <td> in each data row; rows with no <td> are skipped.
# The [1:-1] slice drops exactly one character from each end of the raw cell text
# (e.g. '\xa0West Germany\n' -> 'West Germany').
d['losers'] = [
    tds[2].text[1:-1]
    for tds in (tr.find_all('td') for tr in rows)
    if tds
]

print(d['losers'])

['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', '\xa0Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia']


In [165]:
# A fixed-width slice leaves a stray non-breaking space on some names (e.g. '\xa0Switzerland').
# Strip every name instead of patching one hard-coded entry: this catches any other affected
# rows too, and the cell can be re-run safely (looking up the dirty value with .index() raises
# ValueError once it has already been cleaned).
d['losers'] = [name.strip() for name in d['losers']]
print(d['losers'])

['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', 'Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia']


In [166]:
# Sequential 1-based id for each game, one per entry in d['winners'] and in the same order.
d['gameID'] = [position for position, _ in enumerate(d['winners'], start=1)]

In [167]:
# A game only reaches penalties after a draw, so a single number describes both teams' goals.
# Only the first character of the score cell's text (second <td>) is kept, as a string.
d['finalScoreGoalsEach'] = []
for tr in rows:
    tds = tr.find_all('td')
    if not tds:
        continue  # no <td> cells in this row, nothing to read
    score_text = tds[1].text
    d['finalScoreGoalsEach'].append(score_text[0])

print(d['finalScoreGoalsEach'])

['3', '1', '0', '1', '0', '0', '1', '1', '1', '2', '0', '2', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1', '2']


In [168]:
# Dump everything collected so far (winners, losers, gameID, finalScoreGoalsEach) as a sanity check.
print(d)

{'winners': ['West Germany', 'France', 'West Germany', 'Belgium', 'Republic of Ireland', 'Argentina', 'Argentina', 'West Germany', 'Bulgaria', 'Sweden', 'Brazil', 'Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia'], 'losers': ['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', 'Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia'], 'gameID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'finalScoreGoalsEach': ['3', '1', '0', '1', '0', '0', '1', '1', '1', '2', '0', '2', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1', '

In [169]:
def penKickers(row, splitToKeep):
    """Extract one name per HTML fragment from the ``title="..."`` attributes.

    Args:
        row: sequence of HTML strings. The first element is ignored; every
            following element is treated as one penalty kick.
        splitToKeep: index of the piece to keep after splitting a fragment on
            ``title="`` (i.e. which title attribute in the fragment holds the name).

    Returns:
        List of names, one per fragment after the first, in order.

    Raises:
        IndexError: if a fragment has fewer ``title="`` occurrences than
            ``splitToKeep`` requires.
    """
    names = []
    for fragment in row[1:]:
        after_title = fragment.split('title="')[splitToKeep]
        # The attribute value runs up to the next double quote.
        title_value = after_title.partition('"')[0]
        # Titles such as "Willian (footballer, born 1988)" carry a disambiguation
        # suffix; keep only the part before " (".
        names.append(title_value.partition(' (')[0])
    return names

def penResults(row):
    """Flag each penalty fragment as scored (1) or not (0).

    Args:
        row: sequence of HTML strings. The first element is ignored; every
            following element is treated as one penalty kick.

    Returns:
        List of ints, one per fragment after the first: 1 when the fragment
        contains the text 'Penalty scored', otherwise 0.
    """
    return [1 if 'Penalty scored' in fragment else 0 for fragment in row[1:]]

In [170]:
# For every game, record who took the winning side's penalties and whether each one was scored.
# Each record is built in full and then appended, so its position in the list does not rely on
# a hard-coded `i-2` offset (which is only right when exactly two header rows precede the data).
d['winnerPenalties'] = []
for row in rows:
    cells = row.find_all('td')
    if cells:  # header rows have no <td> cells and would fail on the indexing below
        # cells[7] is read as the winners' penalties column; splitting on the nowrap <span>
        # yields one fragment per kick (plus a leading fragment the helpers ignore).
        convertedRow = str(cells[7]).split('<span class="nowrap">')
        d['winnerPenalties'].append({
            'kickers': penKickers(convertedRow, 1),
            'results': penResults(convertedRow),
        })

d['winnerPenalties']

[{'kickers': ['Manfred Kaltz',
   'Paul Breitner',
   'Uli Stielike',
   'Pierre Littbarski',
   'Karl-Heinz Rummenigge',
   'Horst Hrubesch'],
  'results': [1, 1, 0, 1, 1, 1]},
 {'kickers': ['Yannick Stopyra',
   'Manuel Amoros',
   'Bruno Bellone',
   'Michel Platini',
   'Luis Fernández'],
  'results': [1, 1, 1, 0, 1]},
 {'kickers': ['Klaus Allofs',
   'Andreas Brehme',
   'Lothar Matthäus',
   'Pierre Littbarski'],
  'results': [1, 1, 1, 1]},
 {'kickers': ['Nico Claesen',
   'Enzo Scifo',
   'Hugo Broos',
   'Patrick Vervoort',
   'Leo Van der Elst'],
  'results': [1, 1, 1, 1, 1]},
 {'kickers': ['Kevin Sheedy',
   'Ray Houghton',
   'Andy Townsend',
   'Tony Cascarino',
   "David O'Leary"],
  'results': [1, 1, 1, 1, 1]},
 {'kickers': ['José Serrizuela',
   'Jorge Burruchaga',
   'Diego Maradona',
   'Pedro Troglio',
   'Gustavo Dezotti'],
  'results': [1, 1, 0, 0, 1]},
 {'kickers': ['José Serrizuela',
   'Jorge Burruchaga',
   'Julio Olarticoechea',
   'Diego Maradona'],
  'results

In [171]:
# For every game, record who took the losing side's penalties and whether each one was scored.
# Each record is built in full and then appended, so its position in the list does not rely on
# a hard-coded `i-2` offset (which is only right when exactly two header rows precede the data).
d['loserPenalties'] = []
for row in rows:
    cells = row.find_all('td')
    if cells:  # header rows have no <td> cells and would fail on the indexing below
        # cells[8] is read as the losers' penalties column; splitting on the nowrap <span>
        # yields one fragment per kick (plus a leading fragment the helpers ignore).
        convertedRow = str(cells[8]).split('<span class="nowrap">')
        d['loserPenalties'].append({
            # In this column the kicker's name is taken from the second title attribute.
            'kickers': penKickers(convertedRow, 2),
            'results': penResults(convertedRow),
        })

d['loserPenalties']

[{'kickers': ['Alain Giresse',
   'Manuel Amoros',
   'Dominique Rocheteau',
   'Didier Six',
   'Michel Platini',
   'Maxime Bossis'],
  'results': [1, 1, 1, 0, 1, 0]},
 {'kickers': ['Sócrates', 'Alemão', 'Zico', 'Branco', 'Júlio César'],
  'results': [0, 1, 1, 1, 0]},
 {'kickers': ['Manuel Negrete Arias', 'Fernando Quirarte', 'Raúl Servín'],
  'results': [1, 0, 0]},
 {'kickers': ['Juan Antonio Señor',
   'Eloy Olaya',
   'Chendo',
   'Emilio Butragueño',
   'Víctor Muñoz'],
  'results': [1, 0, 1, 1, 1]},
 {'kickers': ['Gheorghe Hagi',
   'Dănuț Lupu',
   'Iosif Rotariu',
   'Ioan Lupescu',
   'Daniel Timofte'],
  'results': [1, 1, 1, 1, 0]},
 {'kickers': ['Dragan Stojković',
   'Robert Prosinečki',
   'Dejan Savićević',
   'Dragoljub Brnović',
   'Faruk Hadžibegić'],
  'results': [0, 1, 1, 0, 0]},
 {'kickers': ['Franco Baresi',
   'Roberto Baggio',
   'Luigi De Agostini',
   'Roberto Donadoni',
   'Aldo Serena'],
  'results': [1, 1, 1, 0, 0]},
 {'kickers': ['Gary Lineker',
   'Peter 

In [193]:
# Flag, per game, whether the grey-background style string appears anywhere in the winners'
# penalties cell (cells[7]): 1 if it does, 0 otherwise.
# NOTE(review): per the original author's note, the page marks the side that kicked first by
# giving its first player a grey background - confirm against the live page markup.

d['winnerTookFirstPenalty'] = []
for row in rows:
    cells = row.find_all('td')
    if not cells:
        continue  # header rows have no <td> cells
    has_grey_marker = 'background-color: #C3C3C3' in str(cells[7])
    d['winnerTookFirstPenalty'].append(int(has_grey_marker))

print(d['winnerTookFirstPenalty'])

[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


In [223]:
# Scratch check: show the raw HTML of the year/host cell (index 10) for row 3, to see how a
# cell spanning several rows is marked up. The previous `for ... break` wrapper never used its
# loop variable and always read rows[3], so it is replaced by the direct lookup.
cells = rows[3].find_all('td')
print(str(cells[10]))

<td rowspan="3"><a href="/wiki/1986_FIFA_World_Cup" title="1986 FIFA World Cup">1986, Mexico</a>
</td>


In [245]:
# Build the tournament year and host country for every game.
# The year/host cell (index 10) appears only in the first row of each tournament and carries a
# rowspan covering that tournament's remaining games, so its value is repeated `span` times and
# the row index jumps past the rows it covers.
d['year'] = []
d['hostCountry'] = []

i = 2  # skip rows 0 and 1 as they're headers
while i < len(rows):
    cell = rows[i].find_all('td')[10]
    # Cell text looks like '1986, Mexico\n'; strip() drops the surrounding whitespace.
    yearAndLocationSplit = cell.text.strip().split(', ')
    year = int(yearAndLocationSplit[0])
    host = yearAndLocationSplit[1]

    # Read rowspan from the tag's own attributes (default 1 when absent) rather than by
    # splitting the tag's HTML on quotes, which depended on rowspan being the first attribute.
    span = int(cell.get('rowspan', 1))

    d['year'].extend([year] * span)
    d['hostCountry'].extend([host] * span)
    i += span

print(d['year'])
print(d['hostCountry'])

[1982, 1986, 1986, 1986, 1990, 1990, 1990, 1990, 1994, 1994, 1994, 1998, 1998, 1998, 2002, 2002, 2006, 2006, 2006, 2006, 2010, 2010, 2014, 2014, 2014, 2014, 2018, 2018, 2018, 2018]
['Spain', 'Mexico', 'Mexico', 'Mexico', 'Italy', 'Italy', 'Italy', 'Italy', 'USA', 'USA', 'USA', 'France', 'France', 'France', 'Korea/Japan', 'Korea/Japan', 'Germany', 'Germany', 'Germany', 'Germany', 'South Africa', 'South Africa', 'Brazil', 'Brazil', 'Brazil', 'Brazil', 'Russia', 'Russia', 'Russia', 'Russia']
