# Data Overview and Cleaning Analysis of the European Football Leagues

# Libraries

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
import xml.etree.ElementTree as ET
import eli5 as eli

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from xml.etree.ElementTree import fromstring, ElementTree
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.svm import SVC
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# The Raw Data

For this report, I used the Ultimate 25k+ Matches Football Database - European dataset from Kaggle. This data set is comprised of seven (7) tables.
1. Country
2. Leauge
3. Player
4. Plater Attribute
5. Team
6. Team Attribute
7. Match

# Data Overview and Cleaning

In this section, I provide and overview of the data and clean the data to be used at later parts of this analysis.

In [17]:
football = sqlite3.connect(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\database.sqlite"
)

In [18]:
football_db = {}

result = football.execute("SELECT name FROM sqlite_master WHERE type='table';")

for table in result:
    query = "SELECT * from " + str(table[0]) + ";"
    football_db[table[0]] = pd.read_sql_query(query, football)

In this section, I provide and overview of the data and clean the data to be used at later parts of this analysis.

## Country

The section below contains information about the Country table and how I cleaned this table for further analysis.

In [7]:
country = football_db["Country"].copy()

In [8]:
country.head()

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy


In [9]:
country.rename(columns={"id": "CountryID", "name": "CountryName"}, inplace=True)

In [10]:
country

Unnamed: 0,CountryID,CountryName
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


In [11]:
country.shape

(11, 2)

### Section Summary
> * Eleven (11) countries are represented in this data set. 
> * I changed the headers to conform with Standard 2 from the Standards Section mentioned above.  

## League

The section below contains information about the League table and how I cleaned this table for further analysis.

In [12]:
league = football_db["League"].copy()

In [13]:
league.head()

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A


In [14]:
del league["id"]
league = league.rename(columns={"country_id": "LeagueID", "name": "LeagueName"})

In [15]:
league.head()

Unnamed: 0,LeagueID,LeagueName
0,1,Belgium Jupiler League
1,1729,England Premier League
2,4769,France Ligue 1
3,7809,Germany 1. Bundesliga
4,10257,Italy Serie A


In [16]:
league.shape

(11, 2)

### Section Summary
> * Eleven (11) leagues are represented in this data set, one from each country specified in the Country table. 
> * Conforming with the Standard 4 of the Standards Section mentioned above, I dropped the "id" column. 
> * I changed the headers to conform with Standard 2 from the Standards Section mentioned above.  

## Player

The section below contains information about the Player table and how I cleaned this table for further analysis.

In [17]:
player = football_db["Player"].copy()

In [18]:
player.head()

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154


In [19]:
player[player.duplicated()].sum()

id                    0.0
player_api_id         0.0
player_name           0.0
player_fifa_api_id    0.0
birthday              0.0
height                0.0
weight                0.0
dtype: float64

In [20]:
player.isna().sum()

id                    0
player_api_id         0
player_name           0
player_fifa_api_id    0
birthday              0
height                0
weight                0
dtype: int64

In [21]:
player.drop(columns=["id", "player_fifa_api_id"], inplace=True)
player = player.set_index("player_api_id")
player = player.rename_axis("player_api_id").reset_index()

In [22]:
player.rename(
    columns={
        "player_api_id": "PlayerID",
        "player_name": "PlayerName",
        "birthday": "Birthday",
        "height": "Height",
        "weight": "Weight",
    },
    inplace=True,
)

In [23]:
player["Birthday"] = pd.DatetimeIndex(player["Birthday"]).year

In [24]:
player.head()

Unnamed: 0,PlayerID,PlayerName,Birthday,Height,Weight
0,505942,Aaron Appindangoye,1992,182.88,187
1,155782,Aaron Cresswell,1989,170.18,146
2,162549,Aaron Doran,1991,170.18,163
3,30572,Aaron Galindo,1982,182.88,198
4,23780,Aaron Hughes,1979,182.88,154


In [25]:
player.shape

(11060, 5)

### Section Summary
> * This table contains no duplicates.
> * This table contains no  missing data.
> * Conforming with Standard 2 from the Standards Section mentioned above, I eliminated these columns from the Player table: "id", "player_fifa_api_id". 
> * I changed the headers to conform with Standard 2 from the Standards Section mentioned above.  
> * I modified the player's birthday to show only the year.

## Player Attributes

The section below contains information about the Player Attributes table and how I cleaned this table for further analysis.

In [26]:
player_attributes = football_db["Player_Attributes"].copy()

In [27]:
player_attributes.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [28]:
player_attributes.shape

(183978, 42)

In [29]:
player_attributes[player_attributes.duplicated()].sum()

id                     0.0
player_fifa_api_id     0.0
player_api_id          0.0
date                   0.0
overall_rating         0.0
potential              0.0
preferred_foot         0.0
attacking_work_rate    0.0
defensive_work_rate    0.0
crossing               0.0
finishing              0.0
heading_accuracy       0.0
short_passing          0.0
volleys                0.0
dribbling              0.0
curve                  0.0
free_kick_accuracy     0.0
long_passing           0.0
ball_control           0.0
acceleration           0.0
sprint_speed           0.0
agility                0.0
reactions              0.0
balance                0.0
shot_power             0.0
jumping                0.0
stamina                0.0
strength               0.0
long_shots             0.0
aggression             0.0
interceptions          0.0
positioning            0.0
vision                 0.0
penalties              0.0
marking                0.0
standing_tackle        0.0
sliding_tackle         0.0
g

In [30]:
player_attributes.isna().sum()

id                        0
player_fifa_api_id        0
player_api_id             0
date                      0
overall_rating          836
potential               836
preferred_foot          836
attacking_work_rate    3230
defensive_work_rate     836
crossing                836
finishing               836
heading_accuracy        836
short_passing           836
volleys                2713
dribbling               836
curve                  2713
free_kick_accuracy      836
long_passing            836
ball_control            836
acceleration            836
sprint_speed            836
agility                2713
reactions               836
balance                2713
shot_power              836
jumping                2713
stamina                 836
strength                836
long_shots              836
aggression              836
interceptions           836
positioning             836
vision                 2713
penalties               836
marking                 836
standing_tackle     

In [31]:
player_attributes.shape

(183978, 42)

In [32]:
null_instances = player_attributes[
    player_attributes["attacking_work_rate"].isnull()
].index
player_attributes.drop(null_instances, inplace=True)

In [33]:
null_instances = player_attributes[player_attributes["volleys"].isnull()].index
player_attributes.drop(null_instances, inplace=True)

In [34]:
player_attributes.shape

(180354, 42)

In [35]:
player_attributes.isna().sum()

id                     0
player_fifa_api_id     0
player_api_id          0
date                   0
overall_rating         0
potential              0
preferred_foot         0
attacking_work_rate    0
defensive_work_rate    0
crossing               0
finishing              0
heading_accuracy       0
short_passing          0
volleys                0
dribbling              0
curve                  0
free_kick_accuracy     0
long_passing           0
ball_control           0
acceleration           0
sprint_speed           0
agility                0
reactions              0
balance                0
shot_power             0
jumping                0
stamina                0
strength               0
long_shots             0
aggression             0
interceptions          0
positioning            0
vision                 0
penalties              0
marking                0
standing_tackle        0
sliding_tackle         0
gk_diving              0
gk_handling            0
gk_kicking             0


In [36]:
player_attributes.shape

(180354, 42)

In [37]:
player_attributes.drop(
    columns=[
        "id",
        "player_fifa_api_id",
        "gk_diving",
        "gk_handling",
        "gk_kicking",
        "gk_positioning",
        "gk_reflexes",
    ],
    inplace=True,
)

In [38]:
def break_ties(col: str):
    '''
    Determines the mode of a column and inserts the mode where there are missing values in the column.
    '''     
    most_common = pd.Series.mode(col)

    if len(most_common) > 1:
        return most_common[0]
    else:
        return most_common

In [39]:
players_categorical = (
    player_attributes.groupby(["player_api_id", "date"])[
        ["preferred_foot", "attacking_work_rate", "defensive_work_rate"]
    ]
    .agg(lambda x: break_ties(x))
    .reset_index()
)

In [40]:
players_numerical = (
    player_attributes.groupby(["player_api_id", "date"])
    .agg("mean", numeric_only=True)
    .reset_index()
)

In [41]:
player_attributes = pd.merge(
    players_categorical,
    players_numerical,
    on=["player_api_id", "date"],
    how="inner",
)

In [42]:
player_attributes.attacking_work_rate.unique()

array(['high', 'medium', 'None', 'low', 'norm', 'stoc', 'le', 'y'],
      dtype=object)

In [43]:
player_attributes.defensive_work_rate.unique()

array(['medium', 'low', '7', 'high', 'o', 'ormal', '4', 'tocky', '2', '9',
       '6', '3', 'ean', '1', '5', 'es', '0', '8'], dtype=object)

In [44]:
player_attributes["attacking_work_rate"].replace(
    ["norm", "stoc", "le", "y"], "None", inplace=True
)
player_attributes["defensive_work_rate"].replace(
    [
        "7",
        "o",
        "ormal",
        "4",
        "tocky",
        "2",
        "9",
        "6",
        "3",
        "ean",
        "1",
        "5",
        "es",
        "0",
        "8",
    ],
    "None",
    inplace=True,
)

In [45]:
player_attributes.rename(
    columns={
        "player_api_id": "PlayerID",
        "date": "Date",
        "overall_rating": "OverallRating",
        "potential": "Potential",
        "preferred_foot": "PreferredFoot",
        "attacking_work_rate": "AttackWorkRate",
        "defensive_work_rate": "DefenceWorkRate",
        "crossing": "Crossing",
        "finishing": "Finishing",
        "heading_accuracy": "HeadingAccuracy",
        "short_passing": "ShortPassing",
        "volleys": "Volleys",
        "dribbling": "Dribbling",
        "curve": "Curve",
        "free_kick_accuracy": "FreeKickAccuracy",
        "long_passing": "LongPassing",
        "ball_control": "BallControl",
        "acceleration": "Acceleration",
        "sprint_speed": "SprintSpeed",
        "agility": "Agility",
        "reactions": "Reactions",
        "balance": "Balance",
        "shot_power": "ShotPower",
        "jumping": "Jumping",
        "stamina": "Stamina",
        "strength": "Strength",
        "long_shots": "LongShots",
        "aggression": "Aggression",
        "interceptions": "Interceptions",
        "positioning": "Positioning",
        "vision": "Vision",
        "penalties": "Penalties",
        "marking": "Marking",
        "standing_tackle": "StandingTackle",
        "sliding_tackle": "SlidingTackle",
    },
    inplace=True,
)

In [46]:
player_attributes["Date"] = pd.DatetimeIndex(player_attributes["Date"]).year

In [47]:
player_attributes.head()

Unnamed: 0,PlayerID,Date,PreferredFoot,AttackWorkRate,DefenceWorkRate,OverallRating,Potential,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FreeKickAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Marking,StandingTackle,SlidingTackle
0,2625,2007,right,high,medium,63.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,46.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
1,2625,2007,right,high,medium,63.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
2,2625,2008,right,high,medium,60.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
3,2625,2010,right,high,medium,60.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,50.0,49.0,71.0,56.0,78.0,56.0,59.0,72.0,71.0,50.0,56.0,69.0,64.0,66.0,63.0
4,2625,2011,right,high,medium,59.0,63.0,52.0,47.0,46.0,63.0,37.0,56.0,49.0,50.0,66.0,58.0,66.0,63.0,58.0,49.0,48.0,68.0,55.0,77.0,55.0,58.0,71.0,70.0,49.0,55.0,66.0,63.0,63.0,62.0


In [48]:
player_attributes.shape

(180354, 35)

### Attribute Rating

An attribute is an ability that a player possesses. Most of the attributes are rated in the range of 0 to 99. The average rating of these attributes is reflected in the overall_rating field. The higher the value of these attributes, the better the player's ability. We can classify the value of an attribute with the following quality labels. Some of the attributes have a categorical label such as, "High," "Medium," or "low."<BR>
	- **Excellent:** 90 - 99 <BR>
	- **Very Good:** 80 - 89 <BR>
	- **Good:** 70 - 79 <BR>
	- **Fair:** 50 - 69 <BR>
	- **Poor:** 40 - 49 <BR>
	- **Very Poor:** 0 - 39

### Player Attribute Description

**OverallRating:** Overall rating of the player's skills based on the average of the other attributes. <BR>
**Potential:** Estimate of the maximum overall rating of a player his professional career.<BR>
**PreferredFoot:** Which leg the player tends to play with.<BR>
**AttackWorkRate:** Level of involvement or participation of a player in attacking movements. These attributes can have the values high, medium and low.<BR> 
**DefenseWorkRate:** Level of involvement or participation of a player in defensive movements. These attributes can have the values high, medium and low.<BR>     
**Card:** Penalty cards are used by referees to indicate that a player has committed an foul. 
**Crossing:** Quality and precision of a player to make a pass towards the center of the field, near the opponent's goal. <BR>
**Finishing:** Ability to score a goal.<BR> 
**HeadingAccuracy:** Accuracy in passing or clearing the ball using their head.<BR> 
**ShortPassing:** Ability to make short passes <BR> 
**Volleys:** Ability to take shots or passes while the ball is in the air. <BR> 
**Dribbling:** Ability to maintain possession of the ball against an opponent. <BR> 
**Curve:** Ability to perform passes or shots with a curve effect.<BR> 
**FreeKicAccuracy:** Accuracy or skill of the player in free kicks.<BR> 
**LongPassing:** Ability to make long passes<BR> 
**BallControl:** Level of control of the player over the ball.<BR> 
**Acceleration:** Ability to increase the speed of the player on the field.<BR> 
**SprintSpeed:** Speed of the player<BR> 
**Agility:** How fast a player can handle the ball.<BR> 
**Reactions:** Response time of the player to events that happen around him.<BR> 
**Balance:** Balance level.<BR> 
**ShotPower:** Power of a player's shots.<BR> 
**Jumping:** Quality of jumping.<BR> 
**Stamina:** Ability to maintain physical effort.<BR> 
**Strength:** Pphysical strength of the player.<BR> 
**LongShots:** Ability to take long shots.<BR> 
**Aggression:** Frequency of pushing, pulling and blocking members of the opposing team.<BR> 
**Interceptions:** Ability to intercept the ball during passes from the opposing team.<BR> 
**Positioning:** Ability of to take the correct position on the field.<BR> 
**Vision:** Mental ability to remember the positions of his teammates.<BR> 
**Penalties:** Accuracy to make penalty shots.<BR> 
**Marking:** Ability to defend against a play by the opposing team.<BR> 
**StandingTackle:** Ability to make standing tackles -- extend the leg to move the ball away from the opponent.<BR> 
**SlidingTackle:** Ability make sliding tackles -- extend the leg to move the ball away from the opponent   

### Section Summary
> * This table originally contained over 180,000 entries.
> * The table contains no duplicates
> * The table contains a large number missing values.
> * I changed the date column to show only the year and combined the information for each player at different times in one year into one row of data for the year.
> * I eliminated the rows with missing values in the "AttackWorkRate" and "Volleys" columns, which eliminated most of the missing values in the other columns as well. For the remaining missing values, I used the strategy outlined below. 
    * I filled the missing values in the categorical columns with the mode of that column. 
    * I filled the missing values in the numerical columns with the mean of that column. 
> * The AttackWorkRate and DefenseWorkRate columns contained a large number of incomprehensible enteries. I replaced those entries with "None".
> * I modified the names of all columns according to Standard 2 in the Standards section of this report. 
> * After modifications, the table now containes 70,941 entries.

## Team

The section below contains information about the Team table and how I cleaned this table for further analysis.

In [49]:
team = football_db["Team"].copy()

In [50]:
team.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [51]:
team.drop(columns=["id", "team_fifa_api_id"], inplace=True)

In [52]:
team.rename(
    columns={
        "team_api_id": "TeamID",
        "team_long_name": "TeamLongName",
        "team_short_name": "TeamShortName",
    },
    inplace=True,
)

In [53]:
team[team.duplicated()].sum()

TeamID           0.0
TeamLongName     0.0
TeamShortName    0.0
dtype: float64

In [54]:
team.isnull().sum()

TeamID           0
TeamLongName     0
TeamShortName    0
dtype: int64

In [55]:
team.head()

Unnamed: 0,TeamID,TeamLongName,TeamShortName
0,9987,KRC Genk,GEN
1,9993,Beerschot AC,BAC
2,10000,SV Zulte-Waregem,ZUL
3,9994,Sporting Lokeren,LOK
4,9984,KSV Cercle Brugge,CEB


In [56]:
team.shape

(299, 3)

### Section Summary
> * There are 299 teams in this table.
> * This table contains no duplicates.
> * This table contains a large number of missing values.
> * Conforming to Standard 4 of the Standards Section, I eliminated the "id", "team_fifa_api_id" columns. 
> * I changed the header of each of the other columns to conform with Standard 2 of the Standards section of this report. 

## Team Attributes

The section below contains information about the Team Attributes table and how I cleaned this table for further analysis.

In [57]:
team_attributes = football_db["Team_Attributes"].copy()

In [58]:
team_attributes.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,Organised,60,Normal,65,Normal,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,Organised,54,Normal,63,Normal,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,Organised,54,Normal,63,Normal,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,Organised,70,Risky,70,Lots,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,Organised,53,Normal,48,Normal,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [59]:
team_attributes.shape

(1458, 25)

In [60]:
team_attributes[team_attributes.duplicated()].sum()

id                                0.0
team_fifa_api_id                  0.0
team_api_id                       0.0
date                              0.0
buildUpPlaySpeed                  0.0
buildUpPlaySpeedClass             0.0
buildUpPlayDribbling              0.0
buildUpPlayDribblingClass         0.0
buildUpPlayPassing                0.0
buildUpPlayPassingClass           0.0
buildUpPlayPositioningClass       0.0
chanceCreationPassing             0.0
chanceCreationPassingClass        0.0
chanceCreationCrossing            0.0
chanceCreationCrossingClass       0.0
chanceCreationShooting            0.0
chanceCreationShootingClass       0.0
chanceCreationPositioningClass    0.0
defencePressure                   0.0
defencePressureClass              0.0
defenceAggression                 0.0
defenceAggressionClass            0.0
defenceTeamWidth                  0.0
defenceTeamWidthClass             0.0
defenceDefenderLineClass          0.0
dtype: float64

In [61]:
team_attributes.isnull().sum()

id                                  0
team_fifa_api_id                    0
team_api_id                         0
date                                0
buildUpPlaySpeed                    0
buildUpPlaySpeedClass               0
buildUpPlayDribbling              969
buildUpPlayDribblingClass           0
buildUpPlayPassing                  0
buildUpPlayPassingClass             0
buildUpPlayPositioningClass         0
chanceCreationPassing               0
chanceCreationPassingClass          0
chanceCreationCrossing              0
chanceCreationCrossingClass         0
chanceCreationShooting              0
chanceCreationShootingClass         0
chanceCreationPositioningClass      0
defencePressure                     0
defencePressureClass                0
defenceAggression                   0
defenceAggressionClass              0
defenceTeamWidth                    0
defenceTeamWidthClass               0
defenceDefenderLineClass            0
dtype: int64

In [62]:
team_attributes.drop(
    columns=[
        "id",
        "team_fifa_api_id",
        "buildUpPlayDribbling",
        "buildUpPlaySpeedClass",
        "buildUpPlayDribblingClass",
        "buildUpPlayPassingClass",
        "buildUpPlayPositioningClass",
        "chanceCreationPassingClass",
        "chanceCreationCrossingClass",
        "chanceCreationShootingClass",
        "chanceCreationPositioningClass",
        "defencePressureClass",
        "defenceAggressionClass",
        "defenceTeamWidthClass",
        "defenceDefenderLineClass",
    ],
    inplace=True,
)

In [63]:
team_attributes.rename(
    columns={
        "team_api_id": "TeamID",
        "date": "Date",
        "buildUpPlaySpeed": "BuildUpPlaySpeed",
        "buildUpPlayPassing": "BuildUpPlayPassing",
        "chanceCreationPassing": "ChanceCreationPassing",
        "chanceCreationCrossing": "ChanceCreationCrossing",
        "chanceCreationShooting": "ChanceCreationShooting",
        "defencePressure": "DefencePressure",
        "defenceAggression": "DefenceAggression",
        "defenceTeamWidth": "DefenceTeamWidth",
    },
    inplace=True,
)

In [64]:
team_attributes["Date"] = pd.DatetimeIndex(team_attributes["Date"]).year

In [65]:
team_attributes.isnull().sum()

TeamID                    0
Date                      0
BuildUpPlaySpeed          0
BuildUpPlayPassing        0
ChanceCreationPassing     0
ChanceCreationCrossing    0
ChanceCreationShooting    0
DefencePressure           0
DefenceAggression         0
DefenceTeamWidth          0
dtype: int64

In [66]:
team_attributes.head()

Unnamed: 0,TeamID,Date,BuildUpPlaySpeed,BuildUpPlayPassing,ChanceCreationPassing,ChanceCreationCrossing,ChanceCreationShooting,DefencePressure,DefenceAggression,DefenceTeamWidth
0,9930,2010,60,50,60,65,55,50,55,45
1,9930,2014,52,56,54,63,64,47,44,54
2,9930,2015,47,54,54,63,64,47,44,54
3,8485,2010,70,70,70,70,70,60,70,70
4,8485,2011,47,52,53,48,52,47,47,52


In [67]:
team_attributes.shape

(1458, 10)

## Team Attribute Descriptions

**BuildUpPlaySpeed:** Speed at which the team can stage an attack. <BR>
**BuildUpPlayPassing:** Ability to pass the ball to team members.<BR>
**ChanceCreationPassing:** Passes made by players that lead to a scoring opportunity for their team.<BR>
**ChanceCreationCrossing:** Passes made by players toward the center of the pitch, with the intention of creating a goal-scoring opportunity for their team.<BR> 
**ChanceCreationShooting:** Ability of a player to create a scoring opportunity for their team by taking a shot on goal or setting up a teammate for a shot.<BR>     
**DefencePressure:** Putting pressure on the opposing team's players when they are in possession of the ball.<BR>
**DefenceAggression:** A tactical approach where the defenders of a team play in a highly assertive manner to win back possession of the ball or stop the opposition's attack. <BR>
**DefenceTeamWidth:** The positioning of the defensive players in a way that maximizes the width of the team's defensive shape.<BR> 

### Section Summary
> * There were 1458 rows and 26 columns in this table.
> * This table contained no duplicates.
> * One column, "buildUpPlayDribbling" contained 969 missing values, which is more than 75% of the rows. According to Standard 3, in the Standards section of this report, I eliminated this column. 
> * I eliminated the "id," and the "team_fifa_api_id," columns to conform to Standard 4 of the Standard Section of this report. > * I kept the "team_api_id" column as identifier.
> * I dropped all "class" columns, such as "buildUpPlaySpeedClass," and "buildUpPlayPassingClass," with catagorical values, since each of them has a corresponding column with numerical values. 
> * I changed the header of each of the other columns to conform standard 2 in the Standard Section of this report. 
> * The cleaned table contains 1458 rows and 10 columns.

## Match

The section below contains information about the Match table and how I cleaned this table for further analysis.

In [68]:
match = football_db["Match"].copy()

In [69]:
match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.38,3.3,2.75,2.4,3.3,2.55,2.6,3.1,2.3,2.5,3.2,2.5,,,,2.5,3.25,2.4,2.63,3.3,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.44,3.75,7.5,1.4,4.0,6.8,1.4,3.9,6.0,1.44,3.6,6.5,,,,1.44,3.75,6.0,1.44,4.0,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.5,1.65,5.0,3.5,1.6,4.0,3.3,1.7,4.0,3.4,1.72,,,,4.2,3.4,1.7,4.5,3.5,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [70]:
match.shape

(25979, 115)

In [71]:
match[match.duplicated()].sum()

id                  0.0
country_id          0.0
league_id           0.0
season              0.0
stage               0.0
date                0.0
match_api_id        0.0
home_team_api_id    0.0
away_team_api_id    0.0
home_team_goal      0.0
away_team_goal      0.0
home_player_X1      0.0
home_player_X2      0.0
home_player_X3      0.0
home_player_X4      0.0
home_player_X5      0.0
home_player_X6      0.0
home_player_X7      0.0
home_player_X8      0.0
home_player_X9      0.0
home_player_X10     0.0
home_player_X11     0.0
away_player_X1      0.0
away_player_X2      0.0
away_player_X3      0.0
away_player_X4      0.0
away_player_X5      0.0
away_player_X6      0.0
away_player_X7      0.0
away_player_X8      0.0
away_player_X9      0.0
away_player_X10     0.0
away_player_X11     0.0
home_player_Y1      0.0
home_player_Y2      0.0
home_player_Y3      0.0
home_player_Y4      0.0
home_player_Y5      0.0
home_player_Y6      0.0
home_player_Y7      0.0
home_player_Y8      0.0
home_player_Y9  

In [72]:
team_attributes.isnull().sum()

TeamID                    0
Date                      0
BuildUpPlaySpeed          0
BuildUpPlayPassing        0
ChanceCreationPassing     0
ChanceCreationCrossing    0
ChanceCreationShooting    0
DefencePressure           0
DefenceAggression         0
DefenceTeamWidth          0
dtype: int64

In [73]:
def extract_xml(row, col_name, xml_key, away_home):
    """
    Changes the xml entry to text entry
    """
    count = 0

    element = row[col_name]
    team_id = row[away_home + "_team_api_id"]

    if type(element) == int:
        return element

    elif element != None:
        tree = ElementTree(fromstring(element))
        root = tree.getroot()

        for child in root.iter(xml_key):
            if str(team_id) == child.text:
                count += 1
        return count
    else:
        return np.nan

In [74]:
def extract_possession_xml(row, col_name, xml_key):
    """
    Changes the xml entry for HomePossession and AwayPossession to text entry
    """
    count = 0
    sum_pos = 0

    element = row[col_name]

    if type(element) == int:
        return element

    elif element != None:
        tree = ElementTree(fromstring(element))
        root = tree.getroot()
        for child in root.iter(xml_key):
            count += 1
            sum_pos += int(child.text)

        if count == 0:
            return np.nan
        else:
            return sum_pos / count
    else:
        return np.nan

In [75]:
match["HomeShoton"] = np.nan
match["AwayShoton"] = np.nan

In [76]:
match["HomeShoton"] = match.apply(
    lambda x: extract_xml(x, "shoton", "team", "home"), axis=1
)
match["AwayShoton"] = match.apply(
    lambda x: extract_xml(x, "shoton", "team", "away"), axis=1
)

In [77]:
match["HomeShotoff"] = np.nan
match["AwayShotoff"] = np.nan

In [78]:
match["HomeShotoff"] = match.apply(
    lambda x: extract_xml(x, "shotoff", "team", "home"), axis=1
)
match["AwayShotoff"] = match.apply(
    lambda x: extract_xml(x, "shotoff", "team", "away"), axis=1
)

In [79]:
match["HomeFoulCommit"] = np.nan
match["AwayFoulCommit"] = np.nan

In [80]:
match["HomeFoulCommit"] = match.apply(
    lambda x: extract_xml(x, "foulcommit", "team", "home"), axis=1
)
match["AwayFoulCommit"] = match.apply(
    lambda x: extract_xml(x, "foulcommit", "team", "away"), axis=1
)

In [81]:
match["HomeCard"] = np.nan
match["AwayCard"] = np.nan

In [82]:
match["HomeCard"] = match.apply(
    lambda x: extract_xml(x, "card", "team", "home"), axis=1
)
match["AwayCard"] = match.apply(
    lambda x: extract_xml(x, "card", "team", "away"), axis=1
)

In [83]:
match["HomeCross"] = np.nan
match["AwayCross"] = np.nan

In [84]:
match["HomeCross"] = match.apply(
    lambda x: extract_xml(x, "cross", "team", "home"), axis=1
)
match["AwayCross"] = match.apply(
    lambda x: extract_xml(x, "cross", "team", "away"), axis=1
)

In [85]:
match["HomeCorner"] = np.nan
match["AwayCorner"] = np.nan

In [86]:
match["HomeCorner"] = match.apply(
    lambda x: extract_xml(x, "corner", "team", "home"), axis=1
)
match["AwayCorner"] = match.apply(
    lambda x: extract_xml(x, "corner", "team", "away"), axis=1
)

In [87]:
match["HomePossession"] = np.nan
match["AwayPossession"] = np.nan

In [88]:
match["HomePossession"] = match.apply(
    lambda x: extract_possession_xml(x, "possession", "homepos"), axis=1
)
match["AwayPossession"] = match.apply(
    lambda x: extract_possession_xml(x, "possession", "awaypos"), axis=1
)

In [89]:
match.drop(
    columns=["id", "stage", "country_id", "season"],
    inplace=True,
)

In [90]:
match.rename(
    columns={
        "league_id": "LeagueID",
        "match_api_id": "MatchID",
        "home_team_api_id": "HomeTeamID",
        "away_team_api_id": "AwayTeamID",
        "date": "Date",
        "home_team_goal": "HomeGoal",
        "away_team_goal": "AwayGoal",
        "home_player_1": "HomePlayer1",
        "home_player_2": "HomePlayer2",
        "home_player_3": "HomePlayer3",
        "home_player_4": "HomePlayer4",
        "home_player_5": "HomePlayer5",
        "home_player_6": "HomePlayer6",
        "home_player_7": "HomePlayer7",
        "home_player_8": "HomePlayer8",
        "home_player_9": "HomePlayer9",
        "home_player_10": "HomePlayer10",
        "home_player_11": "HomePlayer11",
        "away_player_1": "AwayPlayer1",
        "away_player_2": "AwayPlayer2",
        "away_player_3": "AwayPlayer3",
        "away_player_4": "AwayPlayer4",
        "away_player_5": "AwayPlayer5",
        "away_player_6": "AwayPlayer6",
        "away_player_7": "AwayPlayer7",
        "away_player_8": "AwayPlayer8",
        "away_player_9": "AwayPlayer9",
        "away_player_10": "AwayPlayer10",
        "away_player_11": "AwayPlayer11",
    },
    inplace=True,
)

In [91]:
match.drop(match.iloc[:, 7:51], inplace=True, axis=1)

In [92]:
match.drop(match.iloc[:, 29:67], inplace=True, axis=1)

In [93]:
match["Date"] = pd.DatetimeIndex(match["Date"]).year

In [94]:
match.head()

Unnamed: 0,LeagueID,Date,MatchID,HomeTeamID,AwayTeamID,HomeGoal,AwayGoal,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,HomeShoton,AwayShoton,HomeShotoff,AwayShotoff,HomeFoulCommit,AwayFoulCommit,HomeCard,AwayCard,HomeCross,AwayCross,HomeCorner,AwayCorner,HomePossession,AwayPossession
0,1,2008,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,2008,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,2008,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,2008,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1,2008,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [95]:
match.shape

(25979, 43)

### Section Summary
> * This table contained 25,979 rows and 115 columns. 
> * This table contained no duplicates.
> * This table contained no columns with missing data.
> * Some of the data were in XML format that I extracted.
> * In accordance with Standard 4 in the Standards section of this report, I dropped all columns that showed position of the players such as,"home_player_X4".
> * After cleaning the table now contains 25,979 rows and 43 columns. 

# Creation of New Datasets

## Players

The section below contains information about how I merged the Player Table and the Player Attributes table to create a new dataframe with information about the players.

In [96]:
player.head()

Unnamed: 0,PlayerID,PlayerName,Birthday,Height,Weight
0,505942,Aaron Appindangoye,1992,182.88,187
1,155782,Aaron Cresswell,1989,170.18,146
2,162549,Aaron Doran,1991,170.18,163
3,30572,Aaron Galindo,1982,182.88,198
4,23780,Aaron Hughes,1979,182.88,154


In [97]:
player_attributes.head()

Unnamed: 0,PlayerID,Date,PreferredFoot,AttackWorkRate,DefenceWorkRate,OverallRating,Potential,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FreeKickAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Marking,StandingTackle,SlidingTackle
0,2625,2007,right,high,medium,63.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,46.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
1,2625,2007,right,high,medium,63.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
2,2625,2008,right,high,medium,60.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,52.0,49.0,61.0,56.0,78.0,56.0,59.0,72.0,52.0,55.0,56.0,46.0,64.0,66.0,63.0
3,2625,2010,right,high,medium,60.0,64.0,48.0,48.0,47.0,64.0,38.0,57.0,50.0,51.0,67.0,57.0,67.0,64.0,59.0,50.0,49.0,71.0,56.0,78.0,56.0,59.0,72.0,71.0,50.0,56.0,69.0,64.0,66.0,63.0
4,2625,2011,right,high,medium,59.0,63.0,52.0,47.0,46.0,63.0,37.0,56.0,49.0,50.0,66.0,58.0,66.0,63.0,58.0,49.0,48.0,68.0,55.0,77.0,55.0,58.0,71.0,70.0,49.0,55.0,66.0,63.0,63.0,62.0


In [98]:
players = pd.merge(player, player_attributes, on="PlayerID", how="inner")

In [99]:
players.head()

Unnamed: 0,PlayerID,PlayerName,Birthday,Height,Weight,Date,PreferredFoot,AttackWorkRate,DefenceWorkRate,OverallRating,Potential,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FreeKickAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Marking,StandingTackle,SlidingTackle
0,505942,Aaron Appindangoye,1992,182.88,187,2007,right,medium,medium,61.0,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0
1,505942,Aaron Appindangoye,1992,182.88,187,2015,right,medium,medium,61.0,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0
2,505942,Aaron Appindangoye,1992,182.88,187,2015,right,medium,medium,62.0,66.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0
3,505942,Aaron Appindangoye,1992,182.88,187,2015,right,medium,medium,67.0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
4,505942,Aaron Appindangoye,1992,182.88,187,2016,right,medium,medium,67.0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0


In [100]:
players.to_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\players.csv",
    index=True,
)

### Section Summary
> * I merged the Players table with the Players Attributes table to make a new dataframe with information about the players.
> * I called this new dataframe players.
> * I saved the players dataframe as a csv file and used it in the Exploratory Data Analysis (EDA) section of this report. 

## Teams

The section below contains information about how I merged the Team table and the Team Attributes table to create a new dataframe with information about the teams.

In [101]:
teams = pd.merge(team, team_attributes, on="TeamID", how="inner")

In [102]:
teams.to_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\teams.csv",
    index=True,
)

### Section Summary
> * I merged the Team and Team Attributes dataframes to make one dataframe with information about the teams. 
> * I called this new dataframe teams.
> * I saved the teams dataframe as a csv file and used it in the Exploratory Data Analysis (EDA) section of this report.

## Matches

The section below contains information about how I merged the League, Team, and Match table, in addtion to the players dataframe to create a new dataframe with information about the matches. This will be the dataframe that I will use in the Machince Learning section of this report.

In [103]:
matches = pd.merge(league, match, on="LeagueID", how="inner")

In [104]:
df_inner_Home = pd.merge(
    matches, teams, left_on=["HomeTeamID", "Date"], right_on=["TeamID", "Date"]
)

In [105]:
df_inner_Home.rename(
    columns={
        "TeamID": "HomeTeamID",
        "TeamLongName": "HomeTeamLongName",
        "TeamShortName": "HomeTeamShortName",
        "BuildUpPlaySpeed": "HomeBuildUpPlaySpeed",
        "BuildUpPlayPassing": "HomeBuildUpPlayPassing",
        "ChanceCreationPassing": "HomeChanceCreationPassing",
        "ChanceCreationCrossing": "HomeChanceCreationCrossing",
        "ChanceCreationShooting": "HomeChanceCreationShooting",
        "DefencePressure": "HomeDefencePressure",
        "DefenceAggression": "HomeDefenceAggression",
        "DefenceTeamWidth": "HomeDefenceTeamWidth",
    },
    inplace=True,
)

In [106]:
df_inner_Away = pd.merge(
    df_inner_Home, teams, left_on=["AwayTeamID", "Date"], right_on=["TeamID", "Date"]
)

In [107]:
df_inner_Away.rename(
    columns={
        "TeamID": "AwayTeamID",
        "TeamLongName": "AwayTeamLongName",
        "TeamShortName": "AwayTeamShortName",
        "BuildUpPlaySpeed": "AwayBuildUpPlaySpeed",
        "BuildUpPlayPassing": "AwayBuildUpPlayPassing",
        "ChanceCreationPassing": "AwayChanceCreationPassing",
        "ChanceCreationCrossing": "AwayChanceCreationCrossing",
        "ChanceCreationShooting": "AwayChanceCreationShooting",
        "DefencePressure": "AwayDefencePressure",
        "DefenceAggression": "AwayDefenceAggression",
        "DefenceTeamWidth": "AwayDefenceTeamWidth",
    },
    inplace=True,
)

In [108]:
matches = df_inner_Away

In [109]:
matches.head()

Unnamed: 0,LeagueID,LeagueName,Date,MatchID,HomeTeamID,AwayTeamID,HomeGoal,AwayGoal,HomePlayer1,HomePlayer2,HomePlayer3,HomePlayer4,HomePlayer5,HomePlayer6,HomePlayer7,HomePlayer8,HomePlayer9,HomePlayer10,HomePlayer11,AwayPlayer1,AwayPlayer2,AwayPlayer3,AwayPlayer4,AwayPlayer5,AwayPlayer6,AwayPlayer7,AwayPlayer8,AwayPlayer9,AwayPlayer10,AwayPlayer11,HomeShoton,AwayShoton,HomeShotoff,AwayShotoff,HomeFoulCommit,AwayFoulCommit,HomeCard,AwayCard,HomeCross,AwayCross,HomeCorner,AwayCorner,HomePossession,AwayPossession,HomeTeamID.1,HomeTeamLongName,HomeTeamShortName,HomeBuildUpPlaySpeed,HomeBuildUpPlayPassing,HomeChanceCreationPassing,HomeChanceCreationCrossing,HomeChanceCreationShooting,HomeDefencePressure,HomeDefenceAggression,HomeDefenceTeamWidth,AwayTeamID.1,AwayTeamLongName,AwayTeamShortName,AwayBuildUpPlaySpeed,AwayBuildUpPlayPassing,AwayChanceCreationPassing,AwayChanceCreationCrossing,AwayChanceCreationShooting,AwayDefencePressure,AwayDefenceAggression,AwayDefenceTeamWidth
0,1,Belgium Jupiler League,2010,665626,8635,8342,3,2,38391.0,33620.0,38388.0,38389.0,38253.0,69713.0,38393.0,148302.0,38378.0,38383.0,181276.0,37990.0,36832.0,21812.0,38336.0,39578.0,37979.0,11736.0,42594.0,38366.0,163670.0,75405.0,,,,,,,,,,,,,,,8635,RSC Anderlecht,AND,50,35,70,50,60,70,50,70,8342,Club Brugge KV,CLB,35,40,45,50,45,60,70,70
1,1,Belgium Jupiler League,2010,838635,8635,8342,2,2,38391.0,38389.0,38253.0,69713.0,94553.0,38378.0,69653.0,148302.0,12692.0,46552.0,181276.0,31226.0,36832.0,27508.0,39878.0,47411.0,39578.0,38366.0,52280.0,35412.0,163670.0,75405.0,,,,,,,,,,,,,,,8635,RSC Anderlecht,AND,50,35,70,50,60,70,50,70,8342,Club Brugge KV,CLB,35,40,45,50,45,60,70,70
2,1,Belgium Jupiler League,2010,838544,9986,8342,0,5,104388.0,39977.0,93054.0,46666.0,93344.0,39145.0,26613.0,45469.0,40520.0,,38732.0,31226.0,36832.0,39578.0,42594.0,47411.0,35412.0,51841.0,38336.0,52280.0,38366.0,163670.0,,,,,,,,,,,,,,,9986,Sporting Charleroi,CHA,40,50,45,43,60,70,70,70,8342,Club Brugge KV,CLB,35,40,45,50,45,60,70,70
3,1,Belgium Jupiler League,2010,838714,8203,8342,0,1,39573.0,67950.0,39389.0,80184.0,67941.0,37112.0,38969.0,178291.0,67957.0,148286.0,148315.0,37990.0,36832.0,27508.0,39878.0,42594.0,39578.0,38336.0,38366.0,163670.0,52280.0,38440.0,,,,,,,,,,,,,,,8203,KV Mechelen,MEC,65,60,50,40,50,60,70,60,8342,Club Brugge KV,CLB,35,40,45,50,45,60,70,70
4,1,Belgium Jupiler League,2010,665737,9997,8342,1,1,37868.0,33595.0,37866.0,3329.0,149150.0,38354.0,178486.0,36836.0,45490.0,68120.0,5016.0,37990.0,36832.0,21812.0,11736.0,42594.0,39578.0,37979.0,38366.0,163670.0,27423.0,38440.0,,,,,,,,,,,,,,,9997,Sint-Truidense VV,STT,50,50,50,50,50,50,50,50,8342,Club Brugge KV,CLB,35,40,45,50,45,60,70,70


In [128]:
matches = matches.fillna(0)

In [130]:
object_cols = matches.select_dtypes(include="float64").columns

In [131]:
matches[object_cols] = matches[object_cols].astype("int64")

In [134]:
def overall_function(player, date, player_id, rating_column, left_dataframe, right_dataframe):
    '''
    Inserts the overall ratings of the player into the matches table for the player.
    '''    

    df_merge = pd.merge(
        left_dataframe,
        right_dataframe,
        left_on=[date, player_id],
        right_on=[date, player],
        how="right",
    )
    df_merge[player] = df_merge[rating_column]
    del df_merge[rating_column]
    del df_merge[player_id]

    return df_merge

In [135]:
overall_rating = players[["Date", "PlayerID", "OverallRating"]]

In [136]:
player = "HomePlayer1"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = matches

df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [137]:
player = "HomePlayer2"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [138]:
player = "HomePlayer3"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [139]:
player = "HomePlayer4"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [140]:
player = "HomePlayer5"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [141]:
player = "HomePlayer6"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [142]:
player = "HomePlayer7"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [143]:
player = "HomePlayer8"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [144]:
player = "HomePlayer9"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [145]:
player = "HomePlayer10"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [146]:
player = "HomePlayer11"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [147]:
player = "AwayPlayer1"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [148]:
player = "AwayPlayer2"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [149]:
player = "AwayPlayer3"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [150]:
player = "AwayPlayer4"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [151]:
player = "AwayPlayer5"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [152]:
player = "AwayPlayer6"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [153]:
player = "AwayPlayer7"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [154]:
player = "AwayPlayer8"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [155]:
player = "AwayPlayer9"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [156]:
player = "AwayPlayer10"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [157]:
player = "AwayPlayer11"
date = "Date"
player_id = "PlayerID"
rating_column = "OverallRating"
left_dataframe = overall_rating
right_dataframe = df_merge


df_merge = overall_function(
    player, date, player_id, rating_column, left_dataframe, right_dataframe
)

In [158]:
matches = df_merge

In [159]:
matches = matches.copy(deep=True)

matches["Result"] = matches["HomeGoal"] - matches["AwayGoal"]
matches.loc[matches["Result"] < 0, "Result"] = -1
matches.loc[matches["Result"] == 0, "Result"] = 0
matches.loc[matches["Result"] > 0, "Result"] = 1

result = matches.pop("Result")
matches.insert(8, "Result", result)

In [160]:
def calculate_mean(df, column):
    '''
    Calculates the mean of a column and inserts the mean where there are missing values in the column.
    '''    
    df[column] = df[column].fillna(df[column].mean())

    return df

In [161]:
attribute_list = [
    "HomeShoton",
    "AwayShoton",
    "HomeShotoff",
    "AwayShotoff",
    "HomeFoulCommit",
    "AwayFoulCommit",
    "HomeCard",
    "AwayCard",
    "HomeCross",
    "AwayCross",
    "HomeCorner",
    "AwayCorner",
    "HomePossession",
    "AwayPossession",
]

In [162]:
player_list = [
    "HomePlayer1",
    "HomePlayer2",
    "HomePlayer3",
    "HomePlayer4",
    "HomePlayer5",
    "HomePlayer6",
    "HomePlayer7",
    "HomePlayer8",
    "HomePlayer9",
    "HomePlayer10",
    "HomePlayer11",
    "AwayPlayer1",
    "AwayPlayer2",
    "AwayPlayer3",
    "AwayPlayer4",
    "AwayPlayer5",
    "AwayPlayer6",
    "AwayPlayer7",
    "AwayPlayer8",
    "AwayPlayer9",
    "AwayPlayer10",
    "AwayPlayer11",
]

In [163]:
column = player_list

matches = round(calculate_mean(matches, column), 1)

In [164]:
column = attribute_list

matches = round(calculate_mean(matches, column), 0)

In [165]:
home_long = matches.pop("HomeTeamLongName")
matches.insert(5, "HomeTeamLongName", home_long)

In [166]:
home_short = matches.pop("HomeTeamShortName")
matches.insert(6, "HomeTeamShortName", home_short)

In [167]:
away_long = matches.pop("AwayTeamLongName")
matches.insert(8, "AwayTeamLongName", away_long)

In [168]:
away_short = matches.pop("AwayTeamShortName")
matches.insert(9, "AwayTeamShortName", away_short)

In [169]:
matches = matches.loc[:, ~matches.columns.duplicated()].copy()

In [170]:
matches.to_csv(
    "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\Football\\DataSets\\matches.csv",
    index=True,
)