In [98]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np

In [2]:
#download the Wikipedia page and parse it with the lxml parser
url = "https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_penalty_shoot-outs"
r = requests.get(url, timeout=30) #without a timeout a stalled connection would hang the kernel indefinitely
r.raise_for_status() #fail loudly on 4xx/5xx instead of parsing an error page
soup = bs(r.content, "lxml")

In [3]:
#collect every <table> element found in the parsed page
t = soup.find_all('table')

In [4]:
#take the second table on the page
#NOTE(review): presumably the shoot-out results table; the index depends on page layout - confirm
table = t[1]

In [5]:
#every <tr> of the chosen table; rows without <td> cells are filtered out by the loops below
rows = table.find_all('tr')

In [74]:
#column name -> list of per-game values; later passed to pd.DataFrame
d = {}

In [75]:
#Winning team of each game, read from the first <td> of every data row.
d['winners'] = []
for row in rows:
    cells = row.find_all('td')
    if cells: #rows without <td> cells (headers) are skipped
        #raw text looks like '\xa0West Germany\n'; strip() removes any amount of
        #surrounding whitespace (including non-breaking spaces) instead of
        #assuming exactly one character on each side
        d['winners'].append(cells[0].text.strip())

print(d['winners'])

['West Germany', 'France', 'West Germany', 'Belgium', 'Republic of Ireland', 'Argentina', 'Argentina', 'West Germany', 'Bulgaria', 'Sweden', 'Brazil', 'Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia']


In [76]:
#Losing team of each game, read from the third <td> of every data row.
#The raw text looks like '\xa0West Germany\n', so one character is trimmed
#from each end; entries with extra leading characters keep them here and
#are cleaned up afterwards.
cellsPerRow = (row.find_all('td') for row in rows)
d['losers'] = [cells[2].text[1:-1] for cells in cellsPerRow if cells]

print(d['losers'])

['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', '\xa0Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia']


In [77]:
#Strip leftover whitespace (e.g. the non-breaking space in '\xa0Switzerland')
#from every loser name. Unlike looking up one hard-coded value, this does not
#raise when the cell is re-run or when the stray entry is absent.
d['losers'] = [team.strip() for team in d['losers']]
print(d['losers'])

['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', 'Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia']


In [78]:
#sequential 1-based id per game, in the same order as d['winners']
d['gameID'] = list(range(1, len(d['winners'])+1))

In [79]:
#A game only reaches penalties after a draw, so a single number per game
#describes the goals scored by each side. The first character of the
#second <td> is kept (as a string).
tdLists = (row.find_all('td') for row in rows)
d['prePenaltiesGoalsEach'] = [cells[1].text[0] for cells in tdLists if cells]

print(d['prePenaltiesGoalsEach'])

['3', '1', '0', '1', '0', '0', '1', '1', '1', '2', '0', '2', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1', '2']


In [80]:
#dump everything collected so far as a quick sanity check
print(d)

{'winners': ['West Germany', 'France', 'West Germany', 'Belgium', 'Republic of Ireland', 'Argentina', 'Argentina', 'West Germany', 'Bulgaria', 'Sweden', 'Brazil', 'Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia'], 'losers': ['France', 'Brazil', 'Mexico', 'Spain', 'Romania', 'Yugoslavia', 'Italy', 'England', 'Mexico', 'Romania', 'Italy', 'England', 'Italy', 'Netherlands', 'Republic of Ireland', 'Spain', 'Switzerland', 'Argentina', 'England', 'France', 'Japan', 'Ghana', 'Chile', 'Greece', 'Costa Rica', 'Netherlands', 'Spain', 'Denmark', 'Colombia', 'Russia'], 'gameID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'prePenaltiesGoalsEach': ['3', '1', '0', '1', '0', '0', '1', '1', '1', '2', '0', '2', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1',

In [81]:
def penKickerAndResult(cell, splitToKeep):
    """Extract (kicker, result) pairs from the HTML fragments of one penalty cell.

    cell: list of strings; the first element is ignored and every later
        element is treated as the markup of one penalty kick.
    splitToKeep: index of the piece to keep after splitting a fragment on
        'title="'; that piece is expected to start with the kicker's name.

    Returns a list of (name, result) tuples in kick order, where result is 1
    when the fragment contains 'Penalty scored' and 0 otherwise.
    """
    outcomes = []
    for fragment in cell[1:]:
        titleValue = fragment.split('title="')[splitToKeep].split('"')[0]

        #names such as "Willian (footballer, born 1988)" lose the bracketed suffix
        name = titleValue.split(' (')[0]

        scored = 1 if 'Penalty scored' in fragment else 0
        outcomes.append((name, scored))

    return outcomes

In [82]:
#Penalty takers and outcomes for the winning side, taken from the 8th <td>
#of each data row. Rows without <td> cells (the header rows) are skipped
#because there is nothing to split.
d['winnerPenalties'] = [
    penKickerAndResult(str(cells[7]).split('<span class="nowrap">'), 1)
    for cells in (row.find_all('td') for row in rows)
    if cells
]

d['winnerPenalties']

[[('Manfred Kaltz', 1),
  ('Paul Breitner', 1),
  ('Uli Stielike', 0),
  ('Pierre Littbarski', 1),
  ('Karl-Heinz Rummenigge', 1),
  ('Horst Hrubesch', 1)],
 [('Yannick Stopyra', 1),
  ('Manuel Amoros', 1),
  ('Bruno Bellone', 1),
  ('Michel Platini', 0),
  ('Luis Fernández', 1)],
 [('Klaus Allofs', 1),
  ('Andreas Brehme', 1),
  ('Lothar Matthäus', 1),
  ('Pierre Littbarski', 1)],
 [('Nico Claesen', 1),
  ('Enzo Scifo', 1),
  ('Hugo Broos', 1),
  ('Patrick Vervoort', 1),
  ('Leo Van der Elst', 1)],
 [('Kevin Sheedy', 1),
  ('Ray Houghton', 1),
  ('Andy Townsend', 1),
  ('Tony Cascarino', 1),
  ("David O'Leary", 1)],
 [('José Serrizuela', 1),
  ('Jorge Burruchaga', 1),
  ('Diego Maradona', 0),
  ('Pedro Troglio', 0),
  ('Gustavo Dezotti', 1)],
 [('José Serrizuela', 1),
  ('Jorge Burruchaga', 1),
  ('Julio Olarticoechea', 1),
  ('Diego Maradona', 1)],
 [('Andreas Brehme', 1),
  ('Lothar Matthäus', 1),
  ('Karl-Heinz Riedle', 1),
  ('Olaf Thon', 1)],
 [('Krasimir Balakov', 0),
  ('Boncho

In [83]:
#Penalty takers and outcomes for the losing side, taken from the 9th <td>
#of each data row. Rows without <td> cells (the header rows) are skipped
#because there is nothing to split.
d['loserPenalties'] = [
    penKickerAndResult(str(cells[8]).split('<span class="nowrap">'), 2)
    for cells in (row.find_all('td') for row in rows)
    if cells
]

d['loserPenalties']

[[('Alain Giresse', 1),
  ('Manuel Amoros', 1),
  ('Dominique Rocheteau', 1),
  ('Didier Six', 0),
  ('Michel Platini', 1),
  ('Maxime Bossis', 0)],
 [('Sócrates', 0),
  ('Alemão', 1),
  ('Zico', 1),
  ('Branco', 1),
  ('Júlio César', 0)],
 [('Manuel Negrete Arias', 1), ('Fernando Quirarte', 0), ('Raúl Servín', 0)],
 [('Juan Antonio Señor', 1),
  ('Eloy Olaya', 0),
  ('Chendo', 1),
  ('Emilio Butragueño', 1),
  ('Víctor Muñoz', 1)],
 [('Gheorghe Hagi', 1),
  ('Dănuț Lupu', 1),
  ('Iosif Rotariu', 1),
  ('Ioan Lupescu', 1),
  ('Daniel Timofte', 0)],
 [('Dragan Stojković', 0),
  ('Robert Prosinečki', 1),
  ('Dejan Savićević', 1),
  ('Dragoljub Brnović', 0),
  ('Faruk Hadžibegić', 0)],
 [('Franco Baresi', 1),
  ('Roberto Baggio', 1),
  ('Luigi De Agostini', 1),
  ('Roberto Donadoni', 0),
  ('Aldo Serena', 0)],
 [('Gary Lineker', 1),
  ('Peter Beardsley', 1),
  ('David Platt', 1),
  ('Stuart Pearce', 0),
  ('Chris Waddle', 0)],
 [('Alberto García Aspe', 0),
  ('Marcelino Bernal', 0),
  ('J

In [84]:
#The team that kicked first has its first player highlighted with a grey
#background, so the flag is simply whether that colour's style string
#appears in the winners' penalty cell (8th <td>).

d['winnerTookFirstPenalty'] = []
for row in rows:
    cells = row.find_all('td')
    if not cells: #header rows have no <td> cells to inspect
        continue
    d['winnerTookFirstPenalty'].append('background-color: #C3C3C3' in str(cells[7]))

print(d['winnerTookFirstPenalty'])

[False, False, True, False, False, True, False, False, False, True, False, True, True, True, False, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False]


In [85]:
#peek at the raw HTML of the 11th <td> (year/location) of one data row;
#the previous loop always stopped on its first pass, so it is dropped
cells = rows[3].find_all('td')
print(str(cells[10]))

<td rowspan="3"><a href="/wiki/1986_FIFA_World_Cup" title="1986 FIFA World Cup">1986, Mexico</a>
</td>


In [86]:
#Year and host country of each game, read from the 11th <td>, whose text
#looks like '1986, Mexico\n'. A cell may span several table rows; its
#values are then repeated once per spanned row and those rows are skipped.
d['year'] = []
d['hostCountry'] = []

i = 2 #skip rows 0 and 1 as they're headers
while i < len(rows):
    cell = rows[i].find_all('td')[10]
    yearAndLocationSplit = cell.text.strip().split(', ')

    #read the rowspan attribute directly rather than splitting the tag's
    #string on quotes, which picks up whichever attribute happens to come first;
    #never advance by less than one row so the loop always terminates
    rowsCovered = max(1, int(cell.get('rowspan', 1)))

    d['year'].extend([int(yearAndLocationSplit[0])] * rowsCovered)
    d['hostCountry'].extend([yearAndLocationSplit[1]] * rowsCovered)

    i += rowsCovered

print(d['year'])
print(d['hostCountry'])

[1982, 1986, 1986, 1986, 1990, 1990, 1990, 1990, 1994, 1994, 1994, 1998, 1998, 1998, 2002, 2002, 2006, 2006, 2006, 2006, 2010, 2010, 2014, 2014, 2014, 2014, 2018, 2018, 2018, 2018]
['Spain', 'Mexico', 'Mexico', 'Mexico', 'Italy', 'Italy', 'Italy', 'Italy', 'USA', 'USA', 'USA', 'France', 'France', 'France', 'Korea/Japan', 'Korea/Japan', 'Germany', 'Germany', 'Germany', 'Germany', 'South Africa', 'South Africa', 'Brazil', 'Brazil', 'Brazil', 'Brazil', 'Russia', 'Russia', 'Russia', 'Russia']


In [87]:
#Tournament stage of each game. The stage cell is looked up relative to the
#11th <td> of the row and may span several rows, in which case its value
#is repeated for every spanned row.
d['stage'] = []

i = 2 #skip rows 0 and 1 as they're headers
loopAgain = True
while loopAgain:
    previousColCell = rows[i].find_all('td')[10]
    
    #NOTE(review): presumably a '#' (anchor link) means the stage cell itself
    #sits at index 10 because the year/location cell is covered by a rowspan
    #from an earlier row - confirm against the page markup
    if '#' in str(previousColCell):
        cell = previousColCell
        #NOTE(review): this value is overwritten unconditionally a few lines below
        stageVar = previousColCell.text
    elif ('style="display:none;"' in str(previousColCell) or 
          'style="white-space:nowrap"' in str(previousColCell)):
        #work around for dealing with game8 where there's a merged
        #cell for stage, but it doesn't start on the same row as the 
        #merged cell for the year/location.
        
        #leave cell as value from last loop
        #NOTE(review): if this branch were hit on the very first iteration,
        #`cell` would be whatever an earlier notebook cell left in that name
        pass
    else:
        cell = rows[i].find_all('td')[11]
        
    stageVar = cell.text[2:-1] #[2:-1] as text starts as '\xa0 Semi-finals\n'
    d['stage'].append(stageVar)
    
    i += 1
    
    #deal with cells that define how many rows they span
    #NOTE(review): the span is taken from the first double-quoted value in the
    #tag's string, so this assumes rowspan is the first attribute
    if "rowspan" in str(cell):
        extraloops = int(str(cell).split('"')[1]) - 1 #-1 as already done one loop above
        for j in range(extraloops):
            d['stage'].append(stageVar)
            i += 1
    
    #stop once every table row has been consumed
    if i >= len(rows):
        loopAgain = False
        
d['stage']

['Semi-finals',
 'Quarter-finals',
 'Quarter-finals',
 'Quarter-finals',
 'Second round',
 'Quarter-finals',
 'Quarter-finals',
 'Quarter-finals',
 'Second round',
 'Quarter-finals',
 'Final',
 'Second round',
 'Quarter-finals',
 'Semi-finals',
 'Second round',
 'Quarter-finals',
 'Second round',
 'Quarter-finals',
 'Quarter-finals',
 'Final',
 'Second round',
 'Quarter-finals',
 'Second round',
 'Second round',
 'Quarter-finals',
 'Semi-finals',
 'Second round',
 'Second round',
 'Second round',
 'Quarter-finals']

In [138]:
#one row per game, one column per key collected in d
df = pd.DataFrame(d)
df

Unnamed: 0,gameID,hostCountry,loserPenalties,losers,prePenaltiesGoalsEach,stage,winnerPenalties,winnerTookFirstPenalty,winners,year
0,1,Spain,"[(Alain Giresse, 1), (Manuel Amoros, 1), (Domi...",France,3,Semi-finals,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",False,West Germany,1982
1,2,Mexico,"[(Sócrates, 0), (Alemão, 1), (Zico, 1), (Branc...",Brazil,1,Quarter-finals,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",False,France,1986
2,3,Mexico,"[(Manuel Negrete Arias, 1), (Fernando Quirarte...",Mexico,0,Quarter-finals,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",True,West Germany,1986
3,4,Mexico,"[(Juan Antonio Señor, 1), (Eloy Olaya, 0), (Ch...",Spain,1,Quarter-finals,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",False,Belgium,1986
4,5,Italy,"[(Gheorghe Hagi, 1), (Dănuț Lupu, 1), (Iosif R...",Romania,0,Second round,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",False,Republic of Ireland,1990
5,6,Italy,"[(Dragan Stojković, 0), (Robert Prosinečki, 1)...",Yugoslavia,0,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",True,Argentina,1990
6,7,Italy,"[(Franco Baresi, 1), (Roberto Baggio, 1), (Lui...",Italy,1,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",False,Argentina,1990
7,8,Italy,"[(Gary Lineker, 1), (Peter Beardsley, 1), (Dav...",England,1,Quarter-finals,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",False,West Germany,1990
8,9,USA,"[(Alberto García Aspe, 0), (Marcelino Bernal, ...",Mexico,1,Second round,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",False,Bulgaria,1994
9,10,USA,"[(Florin Răducioiu, 1), (Gheorghe Hagi, 1), (I...",Romania,2,Quarter-finals,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",True,Sweden,1994


In [139]:
#inspect the column names before reshaping
df.columns

Index(['gameID', 'hostCountry', 'loserPenalties', 'losers',
       'prePenaltiesGoalsEach', 'stage', 'winnerPenalties',
       'winnerTookFirstPenalty', 'winners', 'year'],
      dtype='object')

In [140]:
#start from every column name; the team columns are removed in the next step
nonWinLoserCols = df.columns.tolist()
nonWinLoserCols

['gameID',
 'hostCountry',
 'loserPenalties',
 'losers',
 'prePenaltiesGoalsEach',
 'stage',
 'winnerPenalties',
 'winnerTookFirstPenalty',
 'winners',
 'year']

In [141]:
#keep every column except the two team columns; filtering (rather than
#list.remove) means re-running this cell does not raise ValueError
nonWinLoserCols = [col for col in nonWinLoserCols if col not in ('winners', 'losers')]
nonWinLoserCols

['gameID',
 'hostCountry',
 'loserPenalties',
 'prePenaltiesGoalsEach',
 'stage',
 'winnerPenalties',
 'winnerTookFirstPenalty',
 'year']

In [142]:
#reshape to one row per (game, team): the former 'winners'/'losers' column
#name goes into 'won' and the team name into 'country'
df = pd.melt(
    df,
    id_vars=nonWinLoserCols,
    value_vars=['winners', 'losers'],
    var_name='won',
    value_name='country',
)
df

Unnamed: 0,gameID,hostCountry,loserPenalties,prePenaltiesGoalsEach,stage,winnerPenalties,winnerTookFirstPenalty,year,won,country
0,1,Spain,"[(Alain Giresse, 1), (Manuel Amoros, 1), (Domi...",3,Semi-finals,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",False,1982,winners,West Germany
1,2,Mexico,"[(Sócrates, 0), (Alemão, 1), (Zico, 1), (Branc...",1,Quarter-finals,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",False,1986,winners,France
2,3,Mexico,"[(Manuel Negrete Arias, 1), (Fernando Quirarte...",0,Quarter-finals,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",True,1986,winners,West Germany
3,4,Mexico,"[(Juan Antonio Señor, 1), (Eloy Olaya, 0), (Ch...",1,Quarter-finals,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",False,1986,winners,Belgium
4,5,Italy,"[(Gheorghe Hagi, 1), (Dănuț Lupu, 1), (Iosif R...",0,Second round,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",False,1990,winners,Republic of Ireland
5,6,Italy,"[(Dragan Stojković, 0), (Robert Prosinečki, 1)...",0,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",True,1990,winners,Argentina
6,7,Italy,"[(Franco Baresi, 1), (Roberto Baggio, 1), (Lui...",1,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",False,1990,winners,Argentina
7,8,Italy,"[(Gary Lineker, 1), (Peter Beardsley, 1), (Dav...",1,Quarter-finals,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",False,1990,winners,West Germany
8,9,USA,"[(Alberto García Aspe, 0), (Marcelino Bernal, ...",1,Second round,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",False,1994,winners,Bulgaria
9,10,USA,"[(Florin Răducioiu, 1), (Gheorghe Hagi, 1), (I...",2,Quarter-finals,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",True,1994,winners,Sweden


In [143]:
#turn the 'winners'/'losers' label into a boolean flag.
#The dtype guard makes the cell safe to re-run: once the column is already
#boolean, comparing it with 'winners' again would turn every row False.
if df['won'].dtype != bool:
    df['won'] = df['won'] == 'winners'
df

Unnamed: 0,gameID,hostCountry,loserPenalties,prePenaltiesGoalsEach,stage,winnerPenalties,winnerTookFirstPenalty,year,won,country
0,1,Spain,"[(Alain Giresse, 1), (Manuel Amoros, 1), (Domi...",3,Semi-finals,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",False,1982,True,West Germany
1,2,Mexico,"[(Sócrates, 0), (Alemão, 1), (Zico, 1), (Branc...",1,Quarter-finals,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",False,1986,True,France
2,3,Mexico,"[(Manuel Negrete Arias, 1), (Fernando Quirarte...",0,Quarter-finals,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",True,1986,True,West Germany
3,4,Mexico,"[(Juan Antonio Señor, 1), (Eloy Olaya, 0), (Ch...",1,Quarter-finals,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",False,1986,True,Belgium
4,5,Italy,"[(Gheorghe Hagi, 1), (Dănuț Lupu, 1), (Iosif R...",0,Second round,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",False,1990,True,Republic of Ireland
5,6,Italy,"[(Dragan Stojković, 0), (Robert Prosinečki, 1)...",0,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",True,1990,True,Argentina
6,7,Italy,"[(Franco Baresi, 1), (Roberto Baggio, 1), (Lui...",1,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",False,1990,True,Argentina
7,8,Italy,"[(Gary Lineker, 1), (Peter Beardsley, 1), (Dav...",1,Quarter-finals,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",False,1990,True,West Germany
8,9,USA,"[(Alberto García Aspe, 0), (Marcelino Bernal, ...",1,Second round,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",False,1994,True,Bulgaria
9,10,USA,"[(Florin Răducioiu, 1), (Gheorghe Hagi, 1), (I...",2,Quarter-finals,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",True,1994,True,Sweden


In [144]:
#placeholder column filled with NaN; it is populated by an apply() in a later cell
df['penKicksAndResults'] = np.nan
df

Unnamed: 0,gameID,hostCountry,loserPenalties,prePenaltiesGoalsEach,stage,winnerPenalties,winnerTookFirstPenalty,year,won,country,penKicksAndResults
0,1,Spain,"[(Alain Giresse, 1), (Manuel Amoros, 1), (Domi...",3,Semi-finals,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",False,1982,True,West Germany,
1,2,Mexico,"[(Sócrates, 0), (Alemão, 1), (Zico, 1), (Branc...",1,Quarter-finals,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",False,1986,True,France,
2,3,Mexico,"[(Manuel Negrete Arias, 1), (Fernando Quirarte...",0,Quarter-finals,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",True,1986,True,West Germany,
3,4,Mexico,"[(Juan Antonio Señor, 1), (Eloy Olaya, 0), (Ch...",1,Quarter-finals,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",False,1986,True,Belgium,
4,5,Italy,"[(Gheorghe Hagi, 1), (Dănuț Lupu, 1), (Iosif R...",0,Second round,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",False,1990,True,Republic of Ireland,
5,6,Italy,"[(Dragan Stojković, 0), (Robert Prosinečki, 1)...",0,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",True,1990,True,Argentina,
6,7,Italy,"[(Franco Baresi, 1), (Roberto Baggio, 1), (Lui...",1,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",False,1990,True,Argentina,
7,8,Italy,"[(Gary Lineker, 1), (Peter Beardsley, 1), (Dav...",1,Quarter-finals,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",False,1990,True,West Germany,
8,9,USA,"[(Alberto García Aspe, 0), (Marcelino Bernal, ...",1,Second round,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",False,1994,True,Bulgaria,
9,10,USA,"[(Florin Răducioiu, 1), (Gheorghe Hagi, 1), (I...",2,Quarter-finals,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",True,1994,True,Sweden,


In [145]:
def rowPenaltyDetails(row):
    """Return the penalty list belonging to the team described by this row.

    row: object exposing `won`, `winnerPenalties` and `loserPenalties`
        attributes (e.g. a DataFrame row passed by apply(axis=1)).

    Returns row.winnerPenalties when row.won is truthy, otherwise
    row.loserPenalties.
    """
    return row.winnerPenalties if row.won else row.loserPenalties

In [146]:
#pick the winner's or loser's penalty list for each row.
#Item assignment is used because attribute-style assignment only writes to
#a column when that column already exists.
df['penKicksAndResults'] = df.apply(rowPenaltyDetails, axis = 1)
df

Unnamed: 0,gameID,hostCountry,loserPenalties,prePenaltiesGoalsEach,stage,winnerPenalties,winnerTookFirstPenalty,year,won,country,penKicksAndResults
0,1,Spain,"[(Alain Giresse, 1), (Manuel Amoros, 1), (Domi...",3,Semi-finals,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",False,1982,True,West Germany,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ..."
1,2,Mexico,"[(Sócrates, 0), (Alemão, 1), (Zico, 1), (Branc...",1,Quarter-finals,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",False,1986,True,France,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br..."
2,3,Mexico,"[(Manuel Negrete Arias, 1), (Fernando Quirarte...",0,Quarter-finals,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",True,1986,True,West Germany,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth..."
3,4,Mexico,"[(Juan Antonio Señor, 1), (Eloy Olaya, 0), (Ch...",1,Quarter-finals,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",False,1986,True,Belgium,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro..."
4,5,Italy,"[(Gheorghe Hagi, 1), (Dănuț Lupu, 1), (Iosif R...",0,Second round,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",False,1990,True,Republic of Ireland,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T..."
5,6,Italy,"[(Dragan Stojković, 0), (Robert Prosinečki, 1)...",0,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",True,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ..."
6,7,Italy,"[(Franco Baresi, 1), (Roberto Baggio, 1), (Lui...",1,Quarter-finals,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",False,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ..."
7,8,Italy,"[(Gary Lineker, 1), (Peter Beardsley, 1), (Dav...",1,Quarter-finals,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",False,1990,True,West Germany,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K..."
8,9,USA,"[(Alberto García Aspe, 0), (Marcelino Bernal, ...",1,Second round,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",False,1994,True,Bulgaria,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (..."
9,10,USA,"[(Florin Răducioiu, 1), (Gheorghe Hagi, 1), (I...",2,Quarter-finals,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",True,1994,True,Sweden,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma..."


In [147]:
#the per-side lists are now redundant with penKicksAndResults;
#errors='ignore' keeps the cell re-runnable once the columns are gone
df.drop(['winnerPenalties', 'loserPenalties'], axis = 1, inplace = True, errors = 'ignore')

In [148]:
#display the frame after dropping the per-side penalty lists
df

Unnamed: 0,gameID,hostCountry,prePenaltiesGoalsEach,stage,winnerTookFirstPenalty,year,won,country,penKicksAndResults
0,1,Spain,3,Semi-finals,False,1982,True,West Germany,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ..."
1,2,Mexico,1,Quarter-finals,False,1986,True,France,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br..."
2,3,Mexico,0,Quarter-finals,True,1986,True,West Germany,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth..."
3,4,Mexico,1,Quarter-finals,False,1986,True,Belgium,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro..."
4,5,Italy,0,Second round,False,1990,True,Republic of Ireland,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T..."
5,6,Italy,0,Quarter-finals,True,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ..."
6,7,Italy,1,Quarter-finals,False,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ..."
7,8,Italy,1,Quarter-finals,False,1990,True,West Germany,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K..."
8,9,USA,1,Second round,False,1994,True,Bulgaria,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (..."
9,10,USA,2,Quarter-finals,True,1994,True,Sweden,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma..."


In [149]:
#Create columns to separate the penalty kicker/result pairs out into.
#Finding the longest shoot-out tells us how many columns are needed to fit
#every row; afterwards the table will be melted and the NaN rows left by
#shorter games can be deleted.

maxRoundsOfPenalties = max(len(kicks) for kicks in df['penKicksAndResults'])
maxRoundsOfPenalties

6

In [155]:
#add one NaN-filled column per penalty round, labelled 1..maxRoundsOfPenalties
for roundNumber in range(1, maxRoundsOfPenalties + 1):
    df[roundNumber] = np.nan

In [156]:
#display the frame with the empty per-round columns
df

Unnamed: 0,gameID,hostCountry,prePenaltiesGoalsEach,stage,winnerTookFirstPenalty,year,won,country,penKicksAndResults,1,2,3,4,5,6
0,1,Spain,3,Semi-finals,False,1982,True,West Germany,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...",,,,,,
1,2,Mexico,1,Quarter-finals,False,1986,True,France,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...",,,,,,
2,3,Mexico,0,Quarter-finals,True,1986,True,West Germany,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...",,,,,,
3,4,Mexico,1,Quarter-finals,False,1986,True,Belgium,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...",,,,,,
4,5,Italy,0,Second round,False,1990,True,Republic of Ireland,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...",,,,,,
5,6,Italy,0,Quarter-finals,True,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",,,,,,
6,7,Italy,1,Quarter-finals,False,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...",,,,,,
7,8,Italy,1,Quarter-finals,False,1990,True,West Germany,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...",,,,,,
8,9,USA,1,Second round,False,1994,True,Bulgaria,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...",,,,,,
9,10,USA,2,Quarter-finals,True,1994,True,Sweden,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...",,,,,,


In [203]:
def splitOutPenKickersAndResults(row):
    """Spread a row's penalty list over a fixed number of slots.

    row: object exposing a `penKicksAndResults` sequence.

    Returns a pandas Series of length maxRoundsOfPenalties (module-level)
    holding the kicks in order, padded with NaN for rounds that were not
    played. A list longer than maxRoundsOfPenalties raises IndexError.
    """
    slots = [np.nan] * maxRoundsOfPenalties
    for position, kick in enumerate(row.penKicksAndResults):
        slots[position] = kick
    #returned as a Series: a plain list triggers an error about
    #'Must have equal len keys and value...' when assigned to the columns
    return pd.Series(slots)

In [204]:
#labels of the per-round columns: 1..maxRoundsOfPenalties
penColsList = list(range(1, maxRoundsOfPenalties + 1))
penColsList

[1, 2, 3, 4, 5, 6]

In [205]:
#fill the per-round columns with each row's (kicker, result) tuples
#NOTE(review): the Series returned by the helper is labelled 0..n-1 while the
#target columns are 1..n - confirm the assignment is positional in the pandas version in use
df[penColsList] = df.apply(splitOutPenKickersAndResults, axis=1)

In [206]:
#display the frame with the per-round columns populated
df

Unnamed: 0,gameID,hostCountry,prePenaltiesGoalsEach,stage,winnerTookFirstPenalty,year,won,country,penKicksAndResults,1,2,3,4,5,6
0,1,Spain,3,Semi-finals,False,1982,True,West Germany,"[(Manfred Kaltz, 1), (Paul Breitner, 1), (Uli ...","(Manfred Kaltz, 1)","(Paul Breitner, 1)","(Uli Stielike, 0)","(Pierre Littbarski, 1)","(Karl-Heinz Rummenigge, 1)","(Horst Hrubesch, 1)"
1,2,Mexico,1,Quarter-finals,False,1986,True,France,"[(Yannick Stopyra, 1), (Manuel Amoros, 1), (Br...","(Yannick Stopyra, 1)","(Manuel Amoros, 1)","(Bruno Bellone, 1)","(Michel Platini, 0)","(Luis Fernández, 1)",
2,3,Mexico,0,Quarter-finals,True,1986,True,West Germany,"[(Klaus Allofs, 1), (Andreas Brehme, 1), (Loth...","(Klaus Allofs, 1)","(Andreas Brehme, 1)","(Lothar Matthäus, 1)","(Pierre Littbarski, 1)",,
3,4,Mexico,1,Quarter-finals,False,1986,True,Belgium,"[(Nico Claesen, 1), (Enzo Scifo, 1), (Hugo Bro...","(Nico Claesen, 1)","(Enzo Scifo, 1)","(Hugo Broos, 1)","(Patrick Vervoort, 1)","(Leo Van der Elst, 1)",
4,5,Italy,0,Second round,False,1990,True,Republic of Ireland,"[(Kevin Sheedy, 1), (Ray Houghton, 1), (Andy T...","(Kevin Sheedy, 1)","(Ray Houghton, 1)","(Andy Townsend, 1)","(Tony Cascarino, 1)","(David O'Leary, 1)",
5,6,Italy,0,Quarter-finals,True,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...","(José Serrizuela, 1)","(Jorge Burruchaga, 1)","(Diego Maradona, 0)","(Pedro Troglio, 0)","(Gustavo Dezotti, 1)",
6,7,Italy,1,Quarter-finals,False,1990,True,Argentina,"[(José Serrizuela, 1), (Jorge Burruchaga, 1), ...","(José Serrizuela, 1)","(Jorge Burruchaga, 1)","(Julio Olarticoechea, 1)","(Diego Maradona, 1)",,
7,8,Italy,1,Quarter-finals,False,1990,True,West Germany,"[(Andreas Brehme, 1), (Lothar Matthäus, 1), (K...","(Andreas Brehme, 1)","(Lothar Matthäus, 1)","(Karl-Heinz Riedle, 1)","(Olaf Thon, 1)",,
8,9,USA,1,Second round,False,1994,True,Bulgaria,"[(Krasimir Balakov, 0), (Boncho Genchev, 1), (...","(Krasimir Balakov, 0)","(Boncho Genchev, 1)","(Daniel Borimirov, 1)","(Yordan Letchkov, 1)",,
9,10,USA,2,Quarter-finals,True,1994,True,Sweden,"[(Håkan Mild, 0), (Kennet Andersson, 1), (Toma...","(Håkan Mild, 0)","(Kennet Andersson, 1)","(Tomas Brolin, 1)","(Klas Ingesson, 1)","(Roland Nilsson, 1)","(Henrik Larsson, 1)"


In [220]:
#the combined list is now spread over the per-round columns;
#errors='ignore' keeps the cell re-runnable once the column is gone
df.drop('penKicksAndResults', axis = 1, inplace = True, errors = 'ignore')
df

Unnamed: 0,gameID,hostCountry,prePenaltiesGoalsEach,stage,winnerTookFirstPenalty,year,won,country,1,2,3,4,5,6
0,1,Spain,3,Semi-finals,False,1982,True,West Germany,"(Manfred Kaltz, 1)","(Paul Breitner, 1)","(Uli Stielike, 0)","(Pierre Littbarski, 1)","(Karl-Heinz Rummenigge, 1)","(Horst Hrubesch, 1)"
1,2,Mexico,1,Quarter-finals,False,1986,True,France,"(Yannick Stopyra, 1)","(Manuel Amoros, 1)","(Bruno Bellone, 1)","(Michel Platini, 0)","(Luis Fernández, 1)",
2,3,Mexico,0,Quarter-finals,True,1986,True,West Germany,"(Klaus Allofs, 1)","(Andreas Brehme, 1)","(Lothar Matthäus, 1)","(Pierre Littbarski, 1)",,
3,4,Mexico,1,Quarter-finals,False,1986,True,Belgium,"(Nico Claesen, 1)","(Enzo Scifo, 1)","(Hugo Broos, 1)","(Patrick Vervoort, 1)","(Leo Van der Elst, 1)",
4,5,Italy,0,Second round,False,1990,True,Republic of Ireland,"(Kevin Sheedy, 1)","(Ray Houghton, 1)","(Andy Townsend, 1)","(Tony Cascarino, 1)","(David O'Leary, 1)",
5,6,Italy,0,Quarter-finals,True,1990,True,Argentina,"(José Serrizuela, 1)","(Jorge Burruchaga, 1)","(Diego Maradona, 0)","(Pedro Troglio, 0)","(Gustavo Dezotti, 1)",
6,7,Italy,1,Quarter-finals,False,1990,True,Argentina,"(José Serrizuela, 1)","(Jorge Burruchaga, 1)","(Julio Olarticoechea, 1)","(Diego Maradona, 1)",,
7,8,Italy,1,Quarter-finals,False,1990,True,West Germany,"(Andreas Brehme, 1)","(Lothar Matthäus, 1)","(Karl-Heinz Riedle, 1)","(Olaf Thon, 1)",,
8,9,USA,1,Second round,False,1994,True,Bulgaria,"(Krasimir Balakov, 0)","(Boncho Genchev, 1)","(Daniel Borimirov, 1)","(Yordan Letchkov, 1)",,
9,10,USA,2,Quarter-finals,True,1994,True,Sweden,"(Håkan Mild, 0)","(Kennet Andersson, 1)","(Tomas Brolin, 1)","(Klas Ingesson, 1)","(Roland Nilsson, 1)","(Henrik Larsson, 1)"
