# In [521]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np


def penKickerAndResult(cell, splitToKeep):
    """Return a ``(kicker, scored)`` tuple for every penalty fragment in `cell`.

    `cell` is a sequence of HTML strings; the first element is skipped and
    every following element is treated as one penalty kick.  `splitToKeep`
    selects which ``title="..."`` attribute inside a fragment holds the
    kicker's name (index into the pieces produced by splitting on
    ``title="``).
    """
    outcomes = []
    for fragment in cell[1:]:
        titleValue = fragment.split('title="')[splitToKeep].split('"')[0]

        #titles can look like "Willian (footballer, born 1988)", so keep only
        #the part in front of the first " ("
        name = titleValue.split(' (')[0]

        #a kick counts as scored when the fragment carries the scored marker
        outcomes.append((name, 'Penalty scored' in fragment))

    return outcomes


def remove_xa0_from_start(initalList):
    """Return a copy of `initalList` with one leading '\\xa0' removed per item.

    Only a single non-breaking space is stripped from the front of each
    string; items that do not start with '\\xa0' (including empty strings)
    are returned unchanged.  The input list is not modified.

    The parameter keeps its historical spelling so the signature stays the
    same; the body previously iterated over a differently spelled,
    undefined name and raised NameError on every call.
    """
    #startswith() is used instead of item[0] so empty strings don't raise
    return [item[1:] if item.startswith('\xa0') else item for item in initalList]


#Callers in this file refer to the helper as remove_xa0_from_list, so expose
#that name as well.
remove_xa0_from_list = remove_xa0_from_start


def rowPenaltyDetails(row):
    """Pick the penalty list that belongs to the team described by `row`.

    Returns ``row.winner_penalties`` when ``row.won`` is truthy and
    ``row.loser_penalties`` otherwise.
    """
    return row.winner_penalties if row.won else row.loser_penalties
    

def splitOutPenKickersAndResults(row, maxRounds=None):
    """Spread a row's ``pen_kicks_and_results`` list over a pandas Series.

    Parameters:
        row: object with a ``pen_kicks_and_results`` attribute holding a
            sequence of per-kick values.
        maxRounds: optional length to pad the result to with NaN.  When
            omitted the Series is exactly as long as the row's own list;
            ``DataFrame.apply(..., axis=1)`` then aligns the per-row Series
            and fills the gaps of shorter rows with NaN.

    Returns:
        An object-dtype Series with one entry per kick (plus NaN padding
        when `maxRounds` is larger than the number of kicks).  Entries are
        never truncated.

    The function used to read a ``maxRoundsOfPenalties`` name that is not
    defined at module level, which raised NameError.
    """
    kicks = list(row.pen_kicks_and_results)
    if maxRounds is not None:
        #a negative difference multiplies to an empty list, i.e. no padding
        kicks += [np.nan] * (maxRounds - len(kicks))

    #a Series (not a list) is returned so that DataFrame.apply expands the
    #values into columns; dtype=object keeps each tuple as a single element
    return pd.Series(kicks, dtype=object)



def _cellRowspan(cell):
    """Return how many table rows `cell` covers (never less than 1)."""
    #a missing attribute means an ordinary one-row cell; the clamp stops a
    #degenerate rowspan="0" from stalling the row cursor in the callers
    return max(int(cell.get('rowspan', 1)), 1)


def _parseMatchColumns(rows):
    """Read the columns that sit directly in each data row of the table."""
    d = {
        'winners': [],
        'losers': [],
        'pre_penalties_score': [],
        'winner_penalties': [],
        'loser_penalties': [],
        'winner_took_first_penalty': [],
    }
    for row in rows:
        cells = row.find_all('td')
        if not cells:
            continue #rows without <td> cells carry no match data

        #raw text looks like '\xa0West Germany\n' (sometimes with more than
        #one leading '\xa0'); strip() removes all of that padding
        d['winners'].append(cells[0].text.strip())
        d['losers'].append(cells[2].text.strip())
        d['pre_penalties_score'].append(cells[1].text[0])

        convertedCellWinners = str(cells[7]).split('<span class="nowrap">')
        convertedCellLosers = str(cells[8]).split('<span class="nowrap">')
        d['winner_penalties'].append(penKickerAndResult(convertedCellWinners, 1))
        d['loser_penalties'].append(penKickerAndResult(convertedCellLosers, 2))

        #the team that kicked first has its first kicker highlighted with a
        #grey background, so look for that colour in the winners' cell
        d['winner_took_first_penalty'].append(
            'background-color: #C3C3C3' in str(cells[7]))
    return d


def _parseYearAndHost(rows):
    """Return (years, hosts) with one entry per match, expanding merged cells."""
    years = []
    hosts = []
    i = 2 #skip rows 0 and 1 as they're headers
    while i < len(rows):
        cell = rows[i].find_all('td')[10]
        yearAndLocationSplit = cell.text.strip().split(', ')
        year = int(yearAndLocationSplit[0])
        host = yearAndLocationSplit[1]

        #a merged cell stands for every row it spans, so repeat its value
        #that many times and jump past the rows it covers
        span = _cellRowspan(cell)
        years.extend([year] * span)
        hosts.extend([host] * span)
        i += span
    return years, hosts


def _parseStages(rows):
    """Return the tournament stage of every match, expanding merged cells."""
    stages = []
    cell = None
    i = 2 #skip rows 0 and 1 as they're headers
    while i < len(rows):
        tds = rows[i].find_all('td')
        previousColCell = tds[10]
        markup = str(previousColCell)

        if '#' in markup:
            cell = previousColCell
        elif ('style="display:none;"' in markup or
              'style="white-space:nowrap"' in markup):
            #work around for a merged stage cell that doesn't start on the
            #same row as the merged year/location cell: keep using the
            #stage cell found in the previous iteration
            if cell is None:
                raise ValueError('first data row has no stage cell to carry over')
        else:
            cell = tds[11]

        stageVar = cell.text.strip() #text comes in as ' Semi-finals\n'
        span = _cellRowspan(cell)
        stages.extend([stageVar] * span)
        i += span
    return stages


def _meltToOnePenaltyPerRow(df):
    """Reshape the one-row-per-match frame into one row per penalty kick."""
    #winner and loser are the columns melted on, everything else is an id col
    nonWinLoserCols = [col for col in df.columns if col not in ('winners', 'losers')]
    df = df.melt(id_vars=nonWinLoserCols, value_vars=['winners', 'losers'],
                 var_name='won', value_name='team')
    df['won'] = df['won'] == 'winners'

    #move relevant values from winner or loser penalties into one new column
    df['pen_kicks_and_results'] = df.apply(rowPenaltyDetails, axis=1)
    df.drop(['winner_penalties', 'loser_penalties'], axis=1, inplace=True)

    #one column per penalty round, sized for the longest shoot-out; shorter
    #shoot-outs get NaN in the rounds they never reached
    kicksPerRow = df['pen_kicks_and_results']
    maxRoundsOfPenalties = max(len(kicks) for kicks in kicksPerRow)
    penColsList = list(range(1, maxRoundsOfPenalties + 1))
    for roundNumber in penColsList:
        df[roundNumber] = pd.Series(
            [kicks[roundNumber - 1] if len(kicks) >= roundNumber else np.nan
             for kicks in kicksPerRow],
            index=df.index, dtype=object)

    #source column no longer needed
    df.drop('pen_kicks_and_results', axis=1, inplace=True)

    #melt on the penalty round columns (a list keeps the id column order stable)
    nonPenCols = [col for col in df.columns if col not in penColsList]
    df = df.melt(id_vars=nonPenCols, value_vars=penColsList,
                 var_name='penalty_round', value_name='kicker_and_result')

    #drop rows that didn't have a penalty in a round
    df.dropna(subset=['kicker_and_result'], inplace=True)

    #separate the kicker and whether they scored into two new columns
    df['penalty_kicker'] = [kick[0] for kick in df['kicker_and_result']]
    df['penalty_scored'] = [kick[1] for kick in df['kicker_and_result']]
    df.drop('kicker_and_result', axis=1, inplace=True)

    #rearrange columns for tidiness sake
    newColOrder = ['game_ID',
                   'year',
                   'host_country',
                   'stage',
                   'team',
                   'pre_penalties_score',
                   'won',
                   'winner_took_first_penalty',
                   'penalty_round',
                   'penalty_scored',
                   'penalty_kicker']
    return df[newColOrder]


def extractPenaltyData(url):
    """Scrape a penalty shoot-out table into one row per penalty kick.

    Downloads `url`, parses the second <table> on the page and returns a
    DataFrame with the columns game_ID, year, host_country, stage, team,
    pre_penalties_score, won, winner_took_first_penalty, penalty_round,
    penalty_scored and penalty_kicker.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the table contains no match rows.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.content, "lxml")
    table = soup.find_all('table')[1]
    rows = table.find_all('tr')

    #dictionary of column lists that is handed to pandas at the end
    d = _parseMatchColumns(rows)
    if not d['winners']:
        raise ValueError('no shoot-out rows found in the table at ' + url)

    #IDs for each game so both teams of a match can be paired after melting
    d['game_ID'] = list(range(1, len(d['winners']) + 1))

    #year/host_country and stage live in merged cells that span several rows
    d['year'], d['host_country'] = _parseYearAndHost(rows)
    d['stage'] = _parseStages(rows)

    return _meltToOnePenaltyPerRow(pd.DataFrame(d))

# In [522]:
# Scrape the Wikipedia list of World Cup shoot-outs into one row per penalty kick.
penaltiesDf = extractPenaltyData('https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_penalty_shoot-outs')
# Write the result as CSV without the DataFrame index. Only a bare filename is
# given, so the file is saved to the folder of this notebook.
penaltiesDf.to_csv('world_cup_penalty_data.csv', index=False)