## Display Features

In [2]:
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings

from xml.etree.ElementTree import fromstring, ElementTree
import xml.etree.ElementTree as ET

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise the pandas display limits so wide/long frames are printed untruncated.
# NOTE(review): with max_rows this high, a cell that ends in a bare Series or
# DataFrame prints every row; prefer .head()/.sample() when displaying data.
pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

# Load Dataset

In [4]:
# Open the SQLite database file; `football` is the connection used by the
# queries in the following cells.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook cannot run on another machine without editing it.
football = sqlite3.connect(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\database.sqlite"
)

# List of Tables

In [9]:
# Load every table listed in sqlite_master into a dict of DataFrames keyed by
# table name.
football_db = {}

# Read all table names up front so the catalogue cursor is fully consumed
# before further queries are issued on the same connection.
result = football.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = [row[0] for row in result.fetchall()]

for table_name in table_names:
    # Identifiers cannot be bound as SQL parameters, so quote the name
    # (doubling any embedded double quote) instead of concatenating it raw.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'
    query = "SELECT * FROM " + quoted_name + ";"
    football_db[table_name] = pd.read_sql_query(query, football)

football_db.keys()

dict_keys(['sqlite_sequence', 'Player_Attributes', 'Player', 'Match', 'League', 'Country', 'Team', 'Team_Attributes'])

# Country

In [8]:
country = football_db["Country"].copy()

In [9]:
country.rename(columns={"id": "LeagueID", "name": "CountryName"}, inplace=True)

In [10]:
country

Unnamed: 0,LeagueID,CountryName
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


In [11]:
country.shape

(11, 2)

# League

In [12]:
league = football_db["League"].copy()

In [13]:
del league["id"]

In [14]:
league.rename(columns={"country_id": "LeagueID", "name": "LeagueName"}, inplace=True)

In [15]:
league

Unnamed: 0,LeagueID,LeagueName
0,1,Belgium Jupiler League
1,1729,England Premier League
2,4769,France Ligue 1
3,7809,Germany 1. Bundesliga
4,10257,Italy Serie A
5,13274,Netherlands Eredivisie
6,15722,Poland Ekstraklasa
7,17642,Portugal Liga ZON Sagres
8,19694,Scotland Premier League
9,21518,Spain LIGA BBVA


In [16]:
league.shape

(11, 2)

# Player

In [21]:
player = football_db["Player"].copy()

In [22]:
player["birthday"] = pd.DatetimeIndex(player["birthday"]).year

In [23]:
player.drop(columns=["id", "player_fifa_api_id"], inplace=True)

In [24]:
player.rename(
    columns={
        "player_api_id": "PlayerID",
        "player_name": "PlayerName",
        "birthday": "Birthday",
        "height": "Height",
        "weight": "Weight",
    },
    inplace=True,
)

In [25]:
# BMI = weight / height**2, computed after rescaling both columns and rounded
# to two decimals.
# NOTE(review): the divisors suggest Weight is in pounds and Height in
# centimetres (converted to kg and m here) - confirm against the data source.
weight_kg = player["Weight"] / 2.205
height_m = player["Height"] / 100.0
player["BMI"] = (weight_kg / height_m**2).round(2)

In [26]:
player.head()

Unnamed: 0,PlayerID,PlayerName,Birthday,Height,Weight,BMI
0,505942,Aaron Appindangoye,1992,182.88,187,25.36
1,155782,Aaron Cresswell,1989,170.18,146,22.86
2,162549,Aaron Doran,1991,170.18,163,25.52
3,30572,Aaron Galindo,1982,182.88,198,26.85
4,23780,Aaron Hughes,1979,182.88,154,20.88


In [27]:
player.shape

(11060, 6)

In [28]:
player[player.duplicated(keep=False)].any()

PlayerID      False
PlayerName    False
Birthday      False
Height        False
Weight        False
BMI           False
dtype: bool

In [29]:
# Look up a single player by ID. The stray bare literal that preceded this
# expression was a no-op and has been removed.
player[player["PlayerID"] == 103869]

Unnamed: 0,PlayerID,PlayerName,Birthday,Height,Weight,BMI
10958,103869,Younousse Sankhare,1989,182.88,168,22.78


# Team

In [25]:
team = football_db["Team"].copy()

In [26]:
team.drop(
    columns=[
        "id",
        "team_fifa_api_id",
    ],
    inplace=True,
)

In [27]:
team.rename(
    columns={
        "team_api_id": "TeamID",
        "team_long_name": "TeamLongName",
        "team_short_name": "TeamShortName",
    },
    inplace=True,
)

In [28]:
team.head()

Unnamed: 0,TeamID,TeamLongName,TeamShortName
0,9987,KRC Genk,GEN
1,9993,Beerschot AC,BAC
2,10000,SV Zulte-Waregem,ZUL
3,9994,Sporting Lokeren,LOK
4,9984,KSV Cercle Brugge,CEB


In [29]:
team.shape

(299, 3)

In [30]:
team[team.duplicated(keep=False)].any()

TeamID           False
TeamLongName     False
TeamShortName    False
dtype: bool

# Player Attributes

In [381]:
player_attributes = football_db["Player_Attributes"].copy()

In [382]:
player_attributes.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [383]:
player_attributes.drop(
    columns=[
        "id",
        "player_fifa_api_id",
        "gk_diving",
        "gk_handling",
        "gk_kicking",
        "gk_positioning",
        "gk_reflexes",
    ],
    inplace=True,
)

In [384]:
# Rename the player-attribute columns from snake_case to the CamelCase names
# that the later cells use (e.g. "PlayerID" is the merge key with `player`).
player_attributes.rename(
    columns={
        "player_api_id": "PlayerID",
        "date": "Date",
        "overall_rating": "OverallRating",
        "potential": "Potential",
        "preferred_foot": "PreferredFoot",
        # NOTE(review): "AttachWorkRate" looks like a typo for "AttackWorkRate";
        # left unchanged because later cells select the column by this name.
        "attacking_work_rate": "AttachWorkRate",
        "defensive_work_rate": "DefenceWorkRate",
        "crossing": "Crossing",
        "finishing": "Finishing",
        "heading_accuracy": "HeadingAccuracy",
        "short_passing": "ShortPassing",
        "volleys": "Volleys",
        "dribbling": "Dribbling",
        "curve": "Curve",
        "free_kick_accuracy": "FreeKickAccuracy",
        "long_passing": "LongPassing",
        "ball_control": "BallControl",
        "acceleration": "Acceleration",
        "sprint_speed": "SprintSpeed",
        "agility": "Agility",
        "reactions": "Reactions",
        "balance": "Balance",
        "shot_power": "ShotPower",
        "jumping": "Jumping",
        "stamina": "Stamina",
        "strength": "Strength",
        "long_shots": "LongShots",
        "aggression": "Aggression",
        "interceptions": "Interceptions",
        "positioning": "Positioning",
        "vision": "Vision",
        "penalties": "Penalties",
        "marking": "Marking",
        "standing_tackle": "StandingTackle",
        "sliding_tackle": "SlidingTackle",
    },
    inplace=True,
)

In [385]:
player_attributes["Date"] = pd.DatetimeIndex(player_attributes["Date"]).year

In [386]:
null_instances = player_attributes[player_attributes["OverallRating"].isnull()].index

In [387]:
player_attributes.drop(null_instances, inplace=True)

In [388]:
null_instances = player_attributes[player_attributes["Volleys"].isnull()].index

In [389]:
player_attributes.drop(null_instances, inplace=True)

In [390]:
players = pd.merge(player, player_attributes, on="PlayerID", how="inner")

In [391]:
players.head()

Unnamed: 0,PlayerID,PlayerName,Birthday,Height,Weight,BMI,Date,OverallRating,Potential,PreferredFoot,AttachWorkRate,DefenceWorkRate,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FreeKickAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Marking,StandingTackle,SlidingTackle
0,505942,Aaron Appindangoye,1992,182.88,187,25.36,2016,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
1,505942,Aaron Appindangoye,1992,182.88,187,25.36,2015,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
2,505942,Aaron Appindangoye,1992,182.88,187,25.36,2015,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0
3,505942,Aaron Appindangoye,1992,182.88,187,25.36,2015,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0
4,505942,Aaron Appindangoye,1992,182.88,187,25.36,2007,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0


In [392]:
def break_ties(col: pd.Series):
    """Return the most frequent value of ``col`` as a single scalar.

    Intended as a group-wise aggregation function: it always yields one value
    per group, never a Series.

    Parameters
    ----------
    col : pd.Series
        Values of one column (for example one group of a ``groupby``).

    Returns
    -------
    scalar
        The first entry of ``col.mode()``. When several values are tied for
        most frequent, the first one reported by pandas is used. If the column
        has no mode (it is empty or contains only missing values), ``np.nan``
        is returned.
    """
    most_common = pd.Series(col).mode()

    # mode() ignores missing values, so an all-null column has no mode at all.
    if most_common.empty:
        return np.nan

    # Return a scalar in every case, not only when there is a tie.
    return most_common.iloc[0]

In [393]:
# Collapse the categorical attributes to one value per (PlayerID, Date) group,
# letting break_ties choose the representative value of each group.
categorical_columns = ["PreferredFoot", "AttachWorkRate", "DefenceWorkRate"]
players_by_year = players.groupby(["PlayerID", "Date"])
players_categorical = players_by_year[categorical_columns].agg(break_ties).reset_index()

In [394]:
# Average every numeric column within each (PlayerID, Date) group.
numeric_means = players.groupby(["PlayerID", "Date"]).mean(numeric_only=True)
players_numerical = numeric_means.reset_index()

In [395]:
players = pd.merge(
    players_categorical,
    players_numerical,
    on=["PlayerID", "Date"],
    how="inner",
)

In [397]:
players.to_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\players.csv",
    index=True,
    header=True,
    encoding="utf-8",
)

In [398]:
players = pd.read_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\players.csv",
    index_col=0,
)

In [399]:
players.head()

Unnamed: 0,PlayerID,Date,PreferredFoot,AttachWorkRate,DefenceWorkRate,Birthday,Height,Weight,BMI,OverallRating,Potential,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FreeKickAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Marking,StandingTackle,SlidingTackle
0,2625,2007,right,high,medium,1981.0,175.26,154.0,22.74,63.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,48.5,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
1,2625,2008,right,high,medium,1981.0,175.26,154.0,22.74,60.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
2,2625,2010,right,high,medium,1981.0,175.26,154.0,22.74,60.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,50.0,49.0,71.0,56.0,78.0,56.0,59.0,72.0,71.0,50.0,56.0,69.0,64.0,66.0,63.0
3,2625,2011,right,high,medium,1981.0,175.26,154.0,22.74,58.5,60.5,52.0,47.0,46.0,58.0,37.0,56.0,49.0,50.0,66.0,58.0,66.0,63.0,58.0,49.0,61.5,68.0,55.0,77.0,55.0,58.0,71.0,65.0,49.0,55.0,66.0,63.0,63.0,57.0
4,2625,2012,right,medium,medium,1981.0,175.26,154.0,22.74,58.0,58.0,52.0,47.0,46.0,53.0,37.0,56.0,49.0,50.0,66.0,58.0,67.0,63.0,67.0,49.0,76.0,68.0,55.0,77.0,55.5,58.0,71.0,64.0,49.0,55.0,66.0,63.0,63.0,52.0


In [400]:
players.shape

(71615, 39)

# Team Attributes

In [148]:
team_attributes = football_db["Team_Attributes"].copy()

In [149]:
team_attributes.shape

(1458, 25)

In [150]:
team_attributes.drop(
    columns=[
        "id",
        "team_fifa_api_id",
        "buildUpPlayDribbling",
        "buildUpPlaySpeedClass",
        "buildUpPlayDribblingClass",
        "buildUpPlayPassingClass",
        "buildUpPlayPositioningClass",
        "chanceCreationPassingClass",
        "chanceCreationCrossingClass",
        "chanceCreationShootingClass",
        "chanceCreationPositioningClass",
        "defencePressureClass",
        "defenceAggressionClass",
        "defenceTeamWidthClass",
        "defenceDefenderLineClass",
    ],
    inplace=True,
)

In [151]:
team_attributes.rename(
    columns={
        "team_api_id": "TeamID",
        "date": "Date",
        "buildUpPlaySpeed": "BuildUpPlaySpeed",
        "buildUpPlayPassing": "BuildUpPlayPassing",
        "chanceCreationPassing": "ChanceCreationPassing",
        "chanceCreationCrossing": "ChanceCreationCrossing",
        "chanceCreationShooting": "ChanceCreationShooting",
        "defencePressure": "DefencePressure",
        "defenceAggression": "DefenceAggression",
        "defenceTeamWidth": "DefenceTeamWidth",
    },
    inplace=True,
)

In [152]:
team_attributes["Date"] = pd.DatetimeIndex(team_attributes["Date"]).year

In [153]:
teams = pd.merge(team, team_attributes, on="TeamID", how="inner")

In [154]:
teams.head()

Unnamed: 0,TeamID,TeamLongName,TeamShortName,Date,BuildUpPlaySpeed,BuildUpPlayPassing,ChanceCreationPassing,ChanceCreationCrossing,ChanceCreationShooting,DefencePressure,DefenceAggression,DefenceTeamWidth
0,9987,KRC Genk,GEN,2010,45,45,50,35,60,70,65,70
1,9987,KRC Genk,GEN,2011,66,52,65,66,51,48,47,54
2,9987,KRC Genk,GEN,2012,53,55,55,48,56,47,45,55
3,9987,KRC Genk,GEN,2013,58,38,67,48,56,47,45,55
4,9987,KRC Genk,GEN,2014,58,38,67,48,56,47,45,55


In [155]:
teams.to_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\teams.csv",
    index=True,
    header=True,
    encoding="utf-8",
)

In [156]:
teams = pd.read_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\teams.csv",
    index_col=0,
)

In [157]:
teams.head()

Unnamed: 0,TeamID,TeamLongName,TeamShortName,Date,BuildUpPlaySpeed,BuildUpPlayPassing,ChanceCreationPassing,ChanceCreationCrossing,ChanceCreationShooting,DefencePressure,DefenceAggression,DefenceTeamWidth
0,9987,KRC Genk,GEN,2010,45,45,50,35,60,70,65,70
1,9987,KRC Genk,GEN,2011,66,52,65,66,51,48,47,54
2,9987,KRC Genk,GEN,2012,53,55,55,48,56,47,45,55
3,9987,KRC Genk,GEN,2013,58,38,67,48,56,47,45,55
4,9987,KRC Genk,GEN,2014,58,38,67,48,56,47,45,55


In [401]:
teams.shape

(1458, 12)

# Match

In [5]:
def extract_xml(row, col_name, xml_key, away_home):
    """Count the ``xml_key`` elements in an XML cell that name one team.

    Parameters
    ----------
    row : mapping or pd.Series
        One match record. Must provide ``row[col_name]`` and
        ``row[away_home + "_team_api_id"]``.
    col_name : str
        Column holding the XML document (or an already-extracted integer).
    xml_key : str
        Tag name searched for anywhere in the document.
    away_home : str
        Prefix of the team-id column, e.g. ``"home"`` or ``"away"``.

    Returns
    -------
    int or float
        * the cell value unchanged when it is already an integer;
        * ``np.nan`` when the cell is missing (``None`` or NaN);
        * otherwise the number of ``xml_key`` elements whose text equals
          ``str(team_id)``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the cell holds text that is not well-formed XML.
    """
    element = row[col_name]
    team_id = row[away_home + "_team_api_id"]

    # Already a count (e.g. the column was processed before): pass it through.
    if isinstance(element, (int, np.integer)):
        return element

    # Missing cells can be None or a float NaN depending on how the frame was
    # built; neither can be parsed as XML.
    if element is None or (isinstance(element, float) and np.isnan(element)):
        return np.nan

    root = fromstring(element)
    team_text = str(team_id)
    return sum(1 for child in root.iter(xml_key) if child.text == team_text)

In [6]:
def extract_possession_xml(row, col_name, xml_key):
    """Average the integer values of all ``xml_key`` elements in an XML cell.

    Parameters
    ----------
    row : mapping or pd.Series
        One match record. Must provide ``row[col_name]``.
    col_name : str
        Column holding the XML document (or an already-extracted integer).
    xml_key : str
        Tag name whose text values are averaged, e.g. ``"homepos"``.

    Returns
    -------
    int or float
        * the cell value unchanged when it is already an integer;
        * ``np.nan`` when the cell is missing (``None`` or NaN) or contains no
          ``xml_key`` element with a value;
        * otherwise the arithmetic mean of the element values.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the cell holds text that is not well-formed XML.
    ValueError
        If a non-empty ``xml_key`` element does not contain an integer.
    """
    element = row[col_name]

    # Already a number (e.g. the column was processed before): pass it through.
    if isinstance(element, (int, np.integer)):
        return element

    # Missing cells can be None or a float NaN depending on how the frame was
    # built; neither can be parsed as XML.
    if element is None or (isinstance(element, float) and np.isnan(element)):
        return np.nan

    root = fromstring(element)
    # Elements without text (e.g. <homepos/>) carry no reading; skip them
    # instead of failing on int(None).
    readings = [
        int(child.text)
        for child in root.iter(xml_key)
        if child.text and child.text.strip()
    ]

    if not readings:
        return np.nan
    return sum(readings) / len(readings)

In [10]:
match = football_db["Match"].copy()

In [11]:
match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.38,3.3,2.75,2.4,3.3,2.55,2.6,3.1,2.3,2.5,3.2,2.5,,,,2.5,3.25,2.4,2.63,3.3,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.44,3.75,7.5,1.4,4.0,6.8,1.4,3.9,6.0,1.44,3.6,6.5,,,,1.44,3.75,6.0,1.44,4.0,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.5,1.65,5.0,3.5,1.6,4.0,3.3,1.7,4.0,3.4,1.72,,,,4.2,3.4,1.7,4.5,3.5,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [19]:
match.isnull().sum()

id                      0
country_id              0
league_id               0
season                  0
stage                   0
date                    0
match_api_id            0
home_team_api_id        0
away_team_api_id        0
home_team_goal          0
away_team_goal          0
home_player_X1       1821
home_player_X2       1821
home_player_X3       1832
home_player_X4       1832
home_player_X5       1832
home_player_X6       1832
home_player_X7       1832
home_player_X8       1832
home_player_X9       1832
home_player_X10      1832
home_player_X11      1832
away_player_X1       1832
away_player_X2       1832
away_player_X3       1832
away_player_X4       1832
away_player_X5       1832
away_player_X6       1832
away_player_X7       1832
away_player_X8       1832
away_player_X9       1833
away_player_X10      1833
away_player_X11      1839
home_player_Y1       1821
home_player_Y2       1821
home_player_Y3       1832
home_player_Y4       1832
home_player_Y5       1832
home_player_

In [30]:
match['home_player_X3'] 

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
30       NaN
31       NaN
32       NaN
33       NaN
34       NaN
35       NaN
36       NaN
37       NaN
38       NaN
39       NaN
40       NaN
41       NaN
42       NaN
43       NaN
44       NaN
45       NaN
46       NaN
47       NaN
48       NaN
49       NaN
50       NaN
51       NaN
52       NaN
53       NaN
54       NaN
55       NaN
56       NaN
57       NaN
58       NaN
59       NaN
60       NaN
61       NaN
62       NaN
63       NaN
64       NaN
65       NaN
66       NaN
67       NaN
68       NaN
69       NaN
70       NaN
71       NaN
72       NaN
73       NaN
74       NaN
75       NaN
76       NaN

In [346]:
match["TotalGoals"] = np.nan

In [347]:
match["TotalGoals"] = match["home_team_goal"] + match["away_team_goal"]

In [12]:
match["GoalHome"] = np.nan
match["GoalAway"] = np.nan

In [13]:
match["GoalHome"]= match.apply(
    lambda x: extract_xml(x, "goal", "team", "home"), axis=1
)
match["GoalAway"] = match.apply(
    lambda x: extract_xml(x, "goal", "team", "away"), axis=1
)

In [348]:
match["ShotonHome"] = np.nan
match["ShotonAway"] = np.nan

In [349]:
match["ShotonHome"] = match.apply(
    lambda x: extract_xml(x, "shoton", "team", "home"), axis=1
)
match["ShotonAway"] = match.apply(
    lambda x: extract_xml(x, "shoton", "team", "away"), axis=1
)

In [350]:
match["ShotoffHome"] = np.nan
match["ShotoffAway"] = np.nan

In [351]:
match["ShotoffHome"] = match.apply(
    lambda x: extract_xml(x, "shotoff", "team", "home"), axis=1
)
match["ShotoffAway"] = match.apply(
    lambda x: extract_xml(x, "shotoff", "team", "away"), axis=1
)

In [352]:
match["FoulCommitHome"] = np.nan
match["FoulCommitAway"] = np.nan

In [353]:
match["FoulCommitHome"] = match.apply(
    lambda x: extract_xml(x, "foulcommit", "team", "home"), axis=1
)
match["FoulCommitAway"] = match.apply(
    lambda x: extract_xml(x, "foulcommit", "team", "away"), axis=1
)

In [354]:
match["CardHome"] = np.nan
match["CardAway"] = np.nan

In [355]:
match["CardHome"] = match.apply(
    lambda x: extract_xml(x, "card", "team", "home"), axis=1
)
match["CardAway"] = match.apply(
    lambda x: extract_xml(x, "card", "team", "away"), axis=1
)

In [356]:
match["CrossHome"] = np.nan
match["CrossAway"] = np.nan

In [357]:
match["CrossHome"] = match.apply(
    lambda x: extract_xml(x, "cross", "team", "home"), axis=1
)
match["CrossAway"] = match.apply(
    lambda x: extract_xml(x, "cross", "team", "away"), axis=1
)

In [358]:
match["CornerHome"] = np.nan
match["CornerAway"] = np.nan

In [359]:
match["CornerHome"] = match.apply(
    lambda x: extract_xml(x, "corner", "team", "home"), axis=1
)
match["CornerAway"] = match.apply(
    lambda x: extract_xml(x, "corner", "team", "away"), axis=1
)

In [360]:
match["PossessionHome"] = np.nan
match["PossessionAway"] = np.nan

In [361]:
match["PossessionHome"] = match.apply(
    lambda x: extract_possession_xml(x, "possession", "homepos"), axis=1
)
match["PossessionAway"] = match.apply(
    lambda x: extract_possession_xml(x, "possession", "awaypos"), axis=1
)

In [362]:
match.drop(
    columns=["id", "stage", "country_id", "season"],
    inplace=True,
)

In [363]:
# Rename the match columns to the CamelCase names used in later cells.
match_column_names = {
    "league_id": "LeagueID",
    "match_api_id": "MatchID",
    "home_team_api_id": "HomeTeamID",
    "away_team_api_id": "AwayTeamID",
    "date": "Date",
    "home_team_goal": "HomeTeamGoal",
    "away_team_goal": "AwayTeamGoal",
}

# The 22 line-up columns follow one pattern:
# "<side>_player_<n>" -> "<Side>Player<n>" for n = 1..11.
for side in ("home", "away"):
    for slot in range(1, 12):
        match_column_names[f"{side}_player_{slot}"] = f"{side.capitalize()}Player{slot}"

match.rename(columns=match_column_names, inplace=True)

In [14]:
match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA,GoalHome,GoalAway
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2,,
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6,,
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.38,3.3,2.75,2.4,3.3,2.55,2.6,3.1,2.3,2.5,3.2,2.5,,,,2.5,3.25,2.4,2.63,3.3,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75,,
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.44,3.75,7.5,1.4,4.0,6.8,1.4,3.9,6.0,1.44,3.6,6.5,,,,1.44,3.75,6.0,1.44,4.0,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5,,
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.5,1.65,5.0,3.5,1.6,4.0,3.3,1.7,4.0,3.4,1.72,,,,4.2,3.4,1.7,4.5,3.5,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67,,


In [16]:
match['GoalHome']

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         NaN
8         NaN
9         NaN
10        NaN
11        NaN
12        NaN
13        NaN
14        NaN
15        NaN
16        NaN
17        NaN
18        NaN
19        NaN
20        NaN
21        NaN
22        NaN
23        NaN
24        NaN
25        NaN
26        NaN
27        NaN
28        NaN
29        NaN
30        NaN
31        NaN
32        NaN
33        NaN
34        NaN
35        NaN
36        NaN
37        NaN
38        NaN
39        NaN
40        NaN
41        NaN
42        NaN
43        NaN
44        NaN
45        NaN
46        NaN
47        NaN
48        NaN
49        NaN
50        NaN
51        NaN
52        NaN
53        NaN
54        NaN
55        NaN
56        NaN
57        NaN
58        NaN
59        NaN
60        NaN
61        NaN
62        NaN
63        NaN
64        NaN
65        NaN
66        NaN
67        NaN
68        NaN
69        NaN
70        NaN
71    

In [18]:
first_row = match.iloc[15592]
first_row

id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                15593
country_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [365]:
# Drop the home/away_player_X<n> and home/away_player_Y<n> columns by name.
# Selecting them by label (rather than by position) keeps the cell correct
# whatever derived columns were added earlier, and makes a re-run a no-op
# instead of silently removing a different set of columns.
player_xy_columns = match.filter(regex=r"^(home|away)_player_[XY]\d+$").columns
match.drop(columns=player_xy_columns, inplace=True)

In [366]:
match.head()

Unnamed: 0,LeagueID,Date,MatchID,HomeTeamID,AwayTeamID,HomeTeamGoal,AwayTeamGoal,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA,TotalGoals,ShotonHome,ShotonAway,ShotoffHome,ShotoffAway,FoulCommitHome,FoulCommitAway,CardHome,CardAway,CrossHome,CrossAway,CornerHome,CornerAway,PossessionHome,PossessionAway
0,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2,2,,,,,,,,,,,,,,
1,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6,0,,,,,,,,,,,,,,
2,1,2008-08-16 00:00:00,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.38,3.3,2.75,2.4,3.3,2.55,2.6,3.1,2.3,2.5,3.2,2.5,,,,2.5,3.25,2.4,2.63,3.3,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75,3,,,,,,,,,,,,,,
3,1,2008-08-17 00:00:00,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.44,3.75,7.5,1.4,4.0,6.8,1.4,3.9,6.0,1.44,3.6,6.5,,,,1.44,3.75,6.0,1.44,4.0,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5,5,,,,,,,,,,,,,,
4,1,2008-08-16 00:00:00,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.5,1.65,5.0,3.5,1.6,4.0,3.3,1.7,4.0,3.4,1.72,,,,4.2,3.4,1.7,4.5,3.5,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67,4,,,,,,,,,,,,,,


In [367]:
# Drop the raw XML event columns and the <prefix>H/D/A column triples by name.
# Dropping by label (rather than by position) does not depend on column order;
# a second run raises KeyError instead of silently removing other columns.
raw_event_columns = [
    "goal",
    "shoton",
    "shotoff",
    "foulcommit",
    "card",
    "cross",
    "corner",
    "possession",
]
# NOTE(review): these look like per-bookmaker home/draw/away betting odds -
# confirm against the dataset documentation.
triple_prefixes = ["B365", "BW", "IW", "LB", "PS", "WH", "SJ", "VC", "GB", "BS"]
triple_columns = [prefix + suffix for prefix in triple_prefixes for suffix in "HDA"]
match.drop(columns=raw_event_columns + triple_columns, inplace=True)

In [368]:
match.head()

Unnamed: 0,LeagueID,Date,MatchID,HomeTeamID,AwayTeamID,HomeTeamGoal,AwayTeamGoal,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,TotalGoals,ShotonHome,ShotonAway,ShotoffHome,ShotoffAway,FoulCommitHome,FoulCommitAway,CardHome,CardAway,CrossHome,CrossAway,CornerHome,CornerAway,PossessionHome,PossessionAway
0,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,
1,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,
2,1,2008-08-16 00:00:00,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,3,,,,,,,,,,,,,,
3,1,2008-08-17 00:00:00,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,5,,,,,,,,,,,,,,
4,1,2008-08-16 00:00:00,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,4,,,,,,,,,,,,,,


In [369]:
match["Date"] = pd.DatetimeIndex(match["Date"]).year

In [370]:
matches = pd.merge(league, match, on="LeagueID", how="inner")

In [371]:
matches.head()

Unnamed: 0,LeagueID,LeagueName,Date,MatchID,HomeTeamID,AwayTeamID,HomeTeamGoal,AwayTeamGoal,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,TotalGoals,ShotonHome,ShotonAway,ShotoffHome,ShotoffAway,FoulCommitHome,FoulCommitAway,CardHome,CardAway,CrossHome,CrossAway,CornerHome,CornerAway,PossessionHome,PossessionAway
0,1,Belgium Jupiler League,2008,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,
1,1,Belgium Jupiler League,2008,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,
2,1,Belgium Jupiler League,2008,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,3,,,,,,,,,,,,,,
3,1,Belgium Jupiler League,2008,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,5,,,,,,,,,,,,,,
4,1,Belgium Jupiler League,2008,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,4,,,,,,,,,,,,,,


In [None]:
total_goals = matches.pop('TotalGoals')
matches.insert(8, 'Total', total_goals)

In [376]:
matches.head()

Unnamed: 0,LeagueID,LeagueName,Date,MatchID,HomeTeamID,AwayTeamID,HomeTeamGoal,AwayTeamGoal,Total,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,ShotonHome,ShotonAway,ShotoffHome,ShotoffAway,FoulCommitHome,FoulCommitAway,CardHome,CardAway,CrossHome,CrossAway,CornerHome,CornerAway,PossessionHome,PossessionAway
0,1,Belgium Jupiler League,2008,492473,9987,9993,1,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,Belgium Jupiler League,2008,492474,10000,9994,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,Belgium Jupiler League,2008,492475,9984,8635,0,3,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,Belgium Jupiler League,2008,492476,9991,9998,5,0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1,Belgium Jupiler League,2008,492477,7947,9985,1,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [377]:
matches.shape

(25979, 45)

In [378]:
matches.to_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\matches.csv",
    index=True,
    header=True,
    encoding="utf-8",
)

In [379]:
matches = pd.read_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\matches.csv",
    index_col=0,
)

In [380]:
matches.head()

Unnamed: 0,LeagueID,LeagueName,Date,MatchID,HomeTeamID,AwayTeamID,HomeTeamGoal,AwayTeamGoal,Total,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,ShotonHome,ShotonAway,ShotoffHome,ShotoffAway,FoulCommitHome,FoulCommitAway,CardHome,CardAway,CrossHome,CrossAway,CornerHome,CornerAway,PossessionHome,PossessionAway
0,1,Belgium Jupiler League,2008,492473,9987,9993,1,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,Belgium Jupiler League,2008,492474,10000,9994,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,Belgium Jupiler League,2008,492475,9984,8635,0,3,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,Belgium Jupiler League,2008,492476,9991,9998,5,0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1,Belgium Jupiler League,2008,492477,7947,9985,1,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [402]:
matches.shape

(25979, 45)