In [2]:
import pandas as pd
import xml.etree.ElementTree as ET
import sqlite3

In [3]:
# Location of the SQLite file, relative to the notebook's working directory.
db_path = "../../../Databases/football_database.sqlite"

# Open the connection that the following cell hands to pandas as `con`.
# NOTE(review): no db.close() is visible in this part of the notebook - confirm it is closed later.
db = sqlite3.connect(db_path)

In [4]:
match_df = pd.read_sql("SELECT * from Match", con = db)

In [5]:
# Keep the two match identifiers plus the eight event columns
# (the ones parsed as XML further down).
xml_cols = match_df[
    [
        "id",
        "match_api_id",
        "goal",
        "shoton",
        "shotoff",
        "foulcommit",
        "card",
        "cross",
        "corner",
        "possession",
    ]
]

In [6]:
xml_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25979 entries, 0 to 25978
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            25979 non-null  int64 
 1   match_api_id  25979 non-null  int64 
 2   goal          14217 non-null  object
 3   shoton        14217 non-null  object
 4   shotoff       14217 non-null  object
 5   foulcommit    14217 non-null  object
 6   card          14217 non-null  object
 7   cross         14217 non-null  object
 8   corner        14217 non-null  object
 9   possession    14217 non-null  object
dtypes: int64(2), object(8)
memory usage: 2.0+ MB


In [7]:
# Drop every match that is missing at least one of the selected columns;
# the .info() output above reports 14217 non-null rows for each event column.
xml_cols = xml_cols.dropna(how = "any")

# Create Tables

In this section, the information is extracted from the columns that contain data in XML format and saved into separate tables.

#### Example:

In [8]:
example = xml_cols.iloc[12]["goal"]

In [9]:
example

'<goal><value><comment>p</comment><stats><penalties>1</penalties></stats><event_incident_typefk>20</event_incident_typefk><elapsed>53</elapsed><player1>35608</player1><sortorder>0</sortorder><team>8549</team><id>464942</id><n>298</n><type>goal</type><goal_type>p</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><elapsed_plus>4</elapsed_plus><event_incident_typefk>393</event_incident_typefk><elapsed>90</elapsed><player2>42183</player2><subtype>shot</subtype><player1>24393</player1><sortorder>4</sortorder><team>8549</team><id>465709</id><n>297</n><type>goal</type><goal_type>n</goal_type></value></goal>'

In [10]:
# Parse the sample string and print tag/text for every direct child of each event.
myroot = ET.fromstring(example)

for child in (node for event in myroot for node in event):
    print(child.tag, child.text)

comment p
stats None
event_incident_typefk 20
elapsed 53
player1 35608
sortorder 0
team 8549
id 464942
n 298
type goal
goal_type p
comment n
stats None
elapsed_plus 4
event_incident_typefk 393
elapsed 90
player2 42183
subtype shot
player1 24393
sortorder 4
team 8549
id 465709
n 297
type goal
goal_type n


### 1. Goal Table 

In [11]:
# Long-format accumulators: one entry per (match, goal, child tag).
match_ids, goal_number, attribute, values = [], [], [], []

for match_id, goal_xml in zip(xml_cols.match_api_id, xml_cols.goal):
    # Every direct child of the parsed root is one goal; number them from 1 per match.
    for number, event in enumerate(ET.fromstring(goal_xml), start=1):
        # Only the direct children of a goal are read. A child that merely wraps
        # further elements contributes its own .text (the example above prints
        # `stats None`), so nested values are not captured here.
        for child in event:
            match_ids.append(match_id)
            goal_number.append(number)
            attribute.append(child.tag)
            values.append(child.text)

# Assemble the long-format table.
goal_df = pd.DataFrame(
    dict(
        match_api_id=match_ids,
        goal_number=goal_number,
        attribute=attribute,
        value=values,
    )
)


In [14]:
goal_df.head(20)

Unnamed: 0,match_api_id,goal_number,attribute,value
0,489042,1,comment,n
1,489042,1,stats,
2,489042,1,event_incident_typefk,406
3,489042,1,elapsed,22
4,489042,1,player2,38807
5,489042,1,subtype,header
6,489042,1,player1,37799
7,489042,1,sortorder,5
8,489042,1,team,10261
9,489042,1,id,378998


In [15]:
len(goal_df)

489623

## get the longest set of attributes of a single goal

In [16]:
# Number of attribute rows recorded for every (match, goal) pair.
# A single groupby pass replaces re-filtering the whole frame once per match
# and once more per goal; the next cell only takes the maximum of this list.
goal_attr_lengths = (
    goal_df.groupby(["match_api_id", "goal_number"], sort=False).size().tolist()
)

In [17]:
max_goals = max(goal_attr_lengths)
print(max_goals)

15


### Find first 10 records with the max number of col values

In [18]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Locate the first events that carry exactly ``max_value`` attribute rows.

    NOTE(review): this notebook defines ``find_out_index`` again in later
    cells; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format frame with a ``match_api_id`` column, a per-match event
        number in ``column`` and one row per attribute of an event.
    column : str
        Name of the event-number column (e.g. ``"goal_number"``).
    max_value : int
        Row count an event must have to be selected.
    limit : int, default 10
        Positive number of hits after which the search stops.

    Returns
    -------
    tuple[list, list]
        ``(match_ids, indices)``: parallel lists holding, for each hit, the
        match id and the event's value in ``column``. Matches and events are
        visited in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby hands over each match's (then each event's) rows directly instead
    # of re-filtering the whole frame with a boolean mask per match.
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        for event_number, event_rows in match_rows.groupby(column, sort=False):
            if len(event_rows) != max_value:
                continue
            match_ids.append(match_id)
            # Store the real event number rather than its enumeration position:
            # callers filter on `column == value`, which would pick the wrong
            # rows if the numbering were not exactly 1..k.
            indices.append(event_number)
            if len(indices) >= limit:
                return match_ids, indices
    return match_ids, indices
            
        

In [19]:
match_ids, indices = find_out_index(goal_df, "goal_number", 15)

In [20]:
match_ids, indices

([1723982,
  1723991,
  1724072,
  1724075,
  1724092,
  1724109,
  1724118,
  1724144,
  1724144,
  1724159],
 [2, 2, 3, 4, 3, 3, 3, 2, 4, 4])

In [21]:
# Retrieve the attribute tags of each selected goal so they can be compared side by side.
compare_cols_list = [
    goal_df.loc[
        (goal_df.match_api_id == match_id) & (goal_df.goal_number == goal_number),
        "attribute",
    ].values
    for match_id, goal_number in zip(match_ids, indices)
]

In [22]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
1,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
2,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
3,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
4,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
5,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
6,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
7,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
8,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type
9,comment,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,id,n,type,goal_type


They look the same. So I use them for the dataframe.

In [23]:
# Distinct matches that have at least one goal row.
all_match_ids = goal_df.match_api_id.unique()

piv_df_list = []

# Iterate the matches with one groupby pass instead of masking the whole frame
# once per match; sort=False keeps them in order of first appearance.
# The pivot is still done per match so that the column order produced by the
# later concat stays the same.
for match_id, test_df in goal_df.groupby("match_api_id", sort=False):
    # Long -> wide: one row per goal, one column per attribute tag of this match.
    piv_df = test_df.pivot(index="goal_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [24]:
goal_attributes_df = pd.concat(piv_df_list)

In [25]:
# Rename the XML <id> column, then turn the goal_number index into the first
# column and fall back to a plain 0..n-1 index.
goal_attributes_df = (
    goal_attributes_df
    .rename(columns={"id": "goal_id"})
    .rename_axis(index="goal_number")
    .reset_index()
)

In [26]:
goal_attributes_df.nunique()

goal_number                 12
match_api_id             13224
comment                      7
elapsed                     90
event_incident_typefk       54
goal_type                    7
goal_id                  39980
n                          923
player1                   4355
player2                   3337
sortorder                   63
stats                        2
subtype                     17
team                       199
type                         1
elapsed_plus                12
del                          1
coordinates                  1
dtype: int64

In [27]:
len(goal_attributes_df)

39980

In [28]:
goal_attributes_df.head()

Unnamed: 0,goal_number,match_api_id,comment,elapsed,event_incident_typefk,goal_type,goal_id,n,player1,player2,sortorder,stats,subtype,team,type,elapsed_plus,del,coordinates
0,1,489042,n,22,406,n,378998,295,37799,38807,5,,header,10261,goal,,,
1,2,489042,n,24,393,n,379019,298,24148,24154,4,,shot,10260,goal,,,
2,1,489043,n,4,393,n,375546,231,26181,39297,2,,shot,9825,goal,,,
3,1,489044,n,83,407,n,378041,344,30853,30889,0,,distance,8650,goal,,,
4,1,489045,n,4,393,n,376060,244,23139,36394,2,,shot,8654,goal,,,


### Link Table from Match to Attributes

In [24]:
# i would need a df consisting of the match_id, the goals as the columns and the corresponding goal_ids

# find out the max goals per game
max_goals = goal_df.goal_number.max()
print(max_goals)

12


In [25]:
match_goal_df = goal_attributes_df.pivot(index = "match_api_id", columns='goal_number', values='goal_id')

In [26]:
match_goal_df

goal_number,1,2,3,4,5,6,7,8,9,10,11,12
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
489042,378998,379019,,,,,,,,,,
489043,375546,,,,,,,,,,,
489044,378041,,,,,,,,,,,
489045,376060,376165,376929,,,,,,,,,
489046,378837,378981,379030,379074,379095,379250,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5607618,5608164,5608321,5608543,5608972,,,,,,,
2060643,5607960,5609258,5609878,5610026,,,,,,,,
2060644,5605861,5606127,5606361,5606857,5606952,,,,,,,
2060645,5614388,5615046,5615083,,,,,,,,,


In [27]:
# Long-format accumulators: one entry per (match, shoton, child tag).
match_ids, shoton_number, attribute, values = [], [], [], []

for match_id, shoton_xml in zip(xml_cols.match_api_id, xml_cols.shoton):
    # Every direct child of the parsed root is one shoton event; number them from 1 per match.
    for number, event in enumerate(ET.fromstring(shoton_xml), start=1):
        # Only the direct children of an event are read (tag and its own .text).
        for child in event:
            match_ids.append(match_id)
            shoton_number.append(number)
            attribute.append(child.tag)
            values.append(child.text)

# Assemble the long-format table.
shoton_df = pd.DataFrame(
    dict(
        match_api_id=match_ids,
        shoton_number=shoton_number,
        attribute=attribute,
        value=values,
    )
)


In [28]:
# Get the longest set of attributes of a single shoton for the next table.

# Number of attribute rows recorded for every (match, shoton) pair.
# A single groupby pass replaces re-filtering the whole frame once per match
# and once more per shoton; the next cell only takes the maximum of this list.
shoton_attr_lengths = (
    shoton_df.groupby(["match_api_id", "shoton_number"], sort=False).size().tolist()
)

In [29]:
max_shoton = max(shoton_attr_lengths)
print(max_shoton)

12


In [30]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Locate the first events that carry exactly ``max_value`` attribute rows.

    NOTE(review): ``find_out_index`` is also defined in an earlier cell of this
    notebook; this redefinition shadows it once executed.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format frame with a ``match_api_id`` column, a per-match event
        number in ``column`` and one row per attribute of an event.
    column : str
        Name of the event-number column (e.g. ``"shoton_number"``).
    max_value : int
        Row count an event must have to be selected.
    limit : int, default 10
        Positive number of hits after which the search stops.

    Returns
    -------
    tuple[list, list]
        ``(match_ids, indices)``: parallel lists holding, for each hit, the
        match id and the event's value in ``column``. Matches and events are
        visited in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby hands over each match's (then each event's) rows directly instead
    # of re-filtering the whole frame with a boolean mask per match.
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        for event_number, event_rows in match_rows.groupby(column, sort=False):
            if len(event_rows) != max_value:
                continue
            match_ids.append(match_id)
            # Store the real event number rather than its enumeration position:
            # callers filter on `column == value`, which would pick the wrong
            # rows if the numbering were not exactly 1..k.
            indices.append(event_number)
            if len(indices) >= limit:
                return match_ids, indices
    return match_ids, indices
            
        

In [31]:
match_ids, indices = find_out_index(shoton_df, "shoton_number", max_shoton)

In [32]:
match_ids, indices

([1723982,
  1723984,
  1723985,
  1723985,
  1723986,
  1723987,
  1723988,
  1723989,
  1723989,
  1723989],
 [10, 9, 3, 4, 11, 12, 16, 4, 6, 7])

In [33]:
# Retrieve the attribute tags of each selected shoton so they can be compared side by side.
compare_cols_list = [
    shoton_df.loc[
        (shoton_df.match_api_id == match_id) & (shoton_df.shoton_number == shoton_number),
        "attribute",
    ].values
    for match_id, shoton_number in zip(match_ids, indices)
]

In [34]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
1,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
2,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
3,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
4,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
5,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
6,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
7,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
8,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
9,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id


They look the same. So I use them for the dataframe.

In [35]:
# Distinct matches that have at least one shoton row.
all_match_ids = shoton_df.match_api_id.unique()

piv_df_list = []

# Iterate the matches with one groupby pass instead of masking the whole frame
# once per match; sort=False keeps them in order of first appearance.
# The pivot is still done per match so that the column order produced by the
# later concat stays the same.
for match_id, test_df in shoton_df.groupby("match_api_id", sort=False):
    # Long -> wide: one row per shoton, one column per attribute tag of this match.
    piv_df = test_df.pivot(index="shoton_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [36]:
shoton_attributes_df = pd.concat(piv_df_list)

In [37]:
# Rename the XML <id> column, then turn the shoton_number index into the first
# column and fall back to a plain 0..n-1 index.
shoton_attributes_df = (
    shoton_attributes_df
    .rename(columns={"id": "shoton_id"})
    .rename_axis(index="shoton_number")
    .reset_index()
)

In [38]:
shoton_attributes_df.nunique()

shoton_number               28
match_api_id              8463
elapsed                     90
event_incident_typefk      107
shoton_id                93755
n                         1043
player1                   4633
sortorder                   52
stats                        2
subtype                     16
team                       178
type                         1
elapsed_plus                14
goal_type                    2
del                          1
coordinates                  1
card_type                    1
dtype: int64

In [39]:
shoton_attributes_df.head()

Unnamed: 0,shoton_number,match_api_id,elapsed,event_incident_typefk,shoton_id,n,player1,sortorder,stats,subtype,team,type,elapsed_plus,goal_type,del,coordinates,card_type
0,1,489042,3,61,378828,253,24154,0,,blocked_shot,10260,shoton,,,,,
1,2,489042,7,154,378866,258,24157,2,,header,10260,shoton,,,,,
2,3,489042,14,153,378922,274,30829,1,,shot,10260,shoton,,,,,
3,4,489042,14,153,378923,279,30373,2,,shot,10260,shoton,,,,,
4,5,489042,17,137,378951,272,30373,3,,distance,10260,shoton,,,,,


### Link Table from Match to Attributes

In [40]:
# i would need a df consisting of the match_id, the shotons as the columns and the corresponding shoton_ids

# find out the max shotons per game
max_shotons = shoton_df.shoton_number.max()
print(max_shotons)

28


In [41]:
match_shoton_df = shoton_attributes_df.pivot(index = "match_api_id", columns='shoton_number', values='shoton_id')

In [42]:
match_shoton_df

shoton_number,1,2,3,4,5,6,7,8,9,10,...,19,20,21,22,23,24,25,26,27,28
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489042,378828,378866,378922,378923,378951,379204,379363,379401,379406,379414,...,,,,,,,,,,
489043,375556,375557,375563,375596,375628,375651,375654,375680,375717,375740,...,,,,,,,,,,
489044,377771,377823,377845,377901,377918,377924,377967,377970,377980,377982,...,,,,,,,,,,
489045,376231,376427,376476,376801,377038,377095,377175,377236,377411,377459,...,,,,,,,,,,
489046,378524,378542,378563,378695,378699,378787,378852,378865,379010,379157,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5607915,5608242,5608249,5608258,5608967,5609105,5609500,,,,...,,,,,,,,,,
2060643,5607804,5608333,5608351,5608538,5608675,5609157,5609353,5609555,5609565,5609657,...,,,,,,,,,,
2060644,5605742,5606232,5606259,5606281,5606285,5606558,5606565,5606583,5606705,5606904,...,,,,,,,,,,
2060645,5614378,5614655,5614660,5614697,5614988,5615100,5615126,5615278,5615296,5615690,...,,,,,,,,,,


In [43]:
# Long-format accumulators: one entry per (match, shotoff, child tag).
match_ids, shotoff_number, attribute, values = [], [], [], []

for match_id, shotoff_xml in zip(xml_cols.match_api_id, xml_cols.shotoff):
    # Every direct child of the parsed root is one shotoff event; number them from 1 per match.
    for number, event in enumerate(ET.fromstring(shotoff_xml), start=1):
        # Only the direct children of an event are read (tag and its own .text).
        for child in event:
            match_ids.append(match_id)
            shotoff_number.append(number)
            attribute.append(child.tag)
            values.append(child.text)

# Assemble the long-format table.
shotoff_df = pd.DataFrame(
    dict(
        match_api_id=match_ids,
        shotoff_number=shotoff_number,
        attribute=attribute,
        value=values,
    )
)

In [44]:
# Get the longest set of attributes of a single shotoff for the next table.

# Number of attribute rows recorded for every (match, shotoff) pair.
# A single groupby pass replaces re-filtering the whole frame once per match
# and once more per shotoff; the next cell only takes the maximum of this list.
shotoff_attr_lengths = (
    shotoff_df.groupby(["match_api_id", "shotoff_number"], sort=False).size().tolist()
)

In [45]:
max_shotoff = max(shotoff_attr_lengths)
print(max_shotoff)

12


In [46]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Locate the first events that carry exactly ``max_value`` attribute rows.

    NOTE(review): ``find_out_index`` is also defined in earlier cells of this
    notebook; this redefinition shadows them once executed.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format frame with a ``match_api_id`` column, a per-match event
        number in ``column`` and one row per attribute of an event.
    column : str
        Name of the event-number column (e.g. ``"shotoff_number"``).
    max_value : int
        Row count an event must have to be selected.
    limit : int, default 10
        Positive number of hits after which the search stops.

    Returns
    -------
    tuple[list, list]
        ``(match_ids, indices)``: parallel lists holding, for each hit, the
        match id and the event's value in ``column``. Matches and events are
        visited in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby hands over each match's (then each event's) rows directly instead
    # of re-filtering the whole frame with a boolean mask per match.
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        for event_number, event_rows in match_rows.groupby(column, sort=False):
            if len(event_rows) != max_value:
                continue
            match_ids.append(match_id)
            # Store the real event number rather than its enumeration position:
            # callers filter on `column == value`, which would pick the wrong
            # rows if the numbering were not exactly 1..k.
            indices.append(event_number)
            if len(indices) >= limit:
                return match_ids, indices
    return match_ids, indices
            
        

In [47]:
match_ids, indices = find_out_index(shotoff_df, "shotoff_number", max_shotoff)

In [48]:
match_ids, indices

([1723984,
  1723986,
  1723987,
  1723988,
  1724074,
  1724076,
  1724077,
  1724077,
  1724079,
  1724080],
 [10, 6, 12, 6, 12, 11, 10, 11, 11, 3])

In [49]:
# Retrieve the attribute tags of each selected shotoff so they can be compared side by side.
compare_cols_list = [
    shotoff_df.loc[
        (shotoff_df.match_api_id == match_id) & (shotoff_df.shotoff_number == shotoff_number),
        "attribute",
    ].values
    for match_id, shotoff_number in zip(match_ids, indices)
]

In [50]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
1,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
2,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
3,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
4,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
5,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
6,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
7,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
8,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
9,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id


They look the same. So I use them for the dataframe.

In [51]:
# Distinct matches that have at least one shotoff row.
all_match_ids = shotoff_df.match_api_id.unique()

piv_df_list = []

# Iterate the matches with one groupby pass instead of masking the whole frame
# once per match; sort=False keeps them in order of first appearance.
# The pivot is still done per match so that the column order produced by the
# later concat stays the same.
for match_id, test_df in shotoff_df.groupby("match_api_id", sort=False):
    # Long -> wide: one row per shotoff, one column per attribute tag of this match.
    piv_df = test_df.pivot(index="shotoff_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [52]:
shotoff_attributes_df = pd.concat(piv_df_list)

In [53]:
# Rename the XML <id> column, then turn the shotoff_number index into the first
# column and fall back to a plain 0..n-1 index.
shotoff_attributes_df = (
    shotoff_attributes_df
    .rename(columns={"id": "shotoff_id"})
    .rename_axis(index="shotoff_number")
    .reset_index()
)

In [54]:
shotoff_attributes_df.nunique()

shotoff_number              27
match_api_id              8463
elapsed                     90
elapsed_plus                13
event_incident_typefk       66
shotoff_id               95303
n                         1042
player1                   4800
sortorder                   41
stats                        2
subtype                     19
team                       178
type                         1
del                          1
card_type                    2
coordinates                  1
dtype: int64

In [55]:
shotoff_attributes_df.head()

Unnamed: 0,shotoff_number,match_api_id,elapsed,elapsed_plus,event_incident_typefk,shotoff_id,n,player1,sortorder,stats,subtype,team,type,del,card_type,coordinates
0,1,489042,4,,9,378835,264,30373,1,,distance,10260,shotoff,,,
1,2,489042,5,,9,378845,257,37799,2,,distance,10261,shotoff,,,
2,3,489042,22,,317,378995,269,24228,3,,deflected,10261,shotoff,,,
3,4,489042,31,,9,379075,278,38807,0,,distance,10261,shotoff,,,
4,5,489042,34,,47,379111,281,24154,2,,header,10260,shotoff,,,


### Link Table from Match to Attributes

In [56]:
# i would need a df consisting of the match_id, the shotoffs as the columns and the corresponding shotoff_ids

# find out the max shotoffs per game
max_shotoffs = shotoff_df.shotoff_number.max()
print(max_shotoffs)

27


In [57]:
match_shotoff_df = shotoff_attributes_df.pivot(index = "match_api_id", columns='shotoff_number', values='shotoff_id')

In [58]:
match_shotoff_df

shotoff_number,1,2,3,4,5,6,7,8,9,10,...,18,19,20,21,22,23,24,25,26,27
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489042,378835,378845,378995,379075,379111,379140,379378,379415,379419,379421,...,379560,379573,,,,,,,,
489043,375553,375561,375572,375585,375603,375620,375633,375660,375674,375681,...,,,,,,,,,,
489044,377805,377884,377905,377935,377963,377971,378034,378055,,,...,,,,,,,,,,
489045,376107,376213,376267,376278,376456,376813,376902,376953,377015,377049,...,377433,377566,377575,377603,377651,,,,,
489046,378506,378631,378673,378700,378707,378747,378883,378937,379280,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5608130,5608381,5608453,5608507,5608532,5609076,5609304,5609377,5609404,5609603,...,,,,,,,,,,
2060643,5607649,5607662,5607905,5607934,5608557,5608618,5608689,5609005,5609068,5609522,...,,,,,,,,,,
2060644,5606297,5606306,5606330,5606354,5606387,5606404,5606544,5606685,5606767,5606780,...,,,,,,,,,,
2060645,5614262,5614296,5614330,5614568,5614686,5614745,5614860,5614965,5615011,5615347,...,,,,,,,,,,


In [59]:
# Long-format accumulators: one entry per (match, foulcommit, child tag).
match_ids, foulcommit_number, attribute, values = [], [], [], []

for match_id, foulcommit_xml in zip(xml_cols.match_api_id, xml_cols.foulcommit):
    # Every direct child of the parsed root is one foulcommit event; number them from 1 per match.
    for number, event in enumerate(ET.fromstring(foulcommit_xml), start=1):
        # Only the direct children of an event are read (tag and its own .text).
        for child in event:
            match_ids.append(match_id)
            foulcommit_number.append(number)
            attribute.append(child.tag)
            values.append(child.text)

# Assemble the long-format table.
foulcommit_df = pd.DataFrame(
    dict(
        match_api_id=match_ids,
        foulcommit_number=foulcommit_number,
        attribute=attribute,
        value=values,
    )
)

In [60]:
# Get the longest set of attributes of a single foulcommit for the next table.

# Number of attribute rows recorded for every (match, foulcommit) pair.
# A single groupby pass replaces re-filtering the whole frame once per match
# and once more per foulcommit; the next cell only takes the maximum of this list.
foulcommit_attr_lengths = (
    foulcommit_df.groupby(["match_api_id", "foulcommit_number"], sort=False).size().tolist()
)

In [61]:
max_foulcommit = max(foulcommit_attr_lengths)
print(max_foulcommit)

13


In [62]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Locate the first events that carry exactly ``max_value`` attribute rows.

    NOTE(review): ``find_out_index`` is also defined in earlier cells of this
    notebook; this redefinition shadows them once executed.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format frame with a ``match_api_id`` column, a per-match event
        number in ``column`` and one row per attribute of an event.
    column : str
        Name of the event-number column (e.g. ``"foulcommit_number"``).
    max_value : int
        Row count an event must have to be selected.
    limit : int, default 10
        Positive number of hits after which the search stops.

    Returns
    -------
    tuple[list, list]
        ``(match_ids, indices)``: parallel lists holding, for each hit, the
        match id and the event's value in ``column``. Matches and events are
        visited in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby hands over each match's (then each event's) rows directly instead
    # of re-filtering the whole frame with a boolean mask per match.
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        for event_number, event_rows in match_rows.groupby(column, sort=False):
            if len(event_rows) != max_value:
                continue
            match_ids.append(match_id)
            # Store the real event number rather than its enumeration position:
            # callers filter on `column == value`, which would pick the wrong
            # rows if the numbering were not exactly 1..k.
            indices.append(event_number)
            if len(indices) >= limit:
                return match_ids, indices
    return match_ids, indices
            
        

In [63]:
match_ids, indices = find_out_index(foulcommit_df, "foulcommit_number", max_foulcommit)

In [64]:
match_ids, indices

([1723982,
  1723982,
  1723989,
  1723990,
  1723991,
  1724072,
  1724072,
  1724075,
  1724076,
  1724081],
 [15, 31, 16, 22, 23, 10, 11, 15, 25, 9])

In [65]:
# Retrieve the attribute tags of each selected foulcommit so they can be compared side by side.
compare_cols_list = [
    foulcommit_df.loc[
        (foulcommit_df.match_api_id == match_id)
        & (foulcommit_df.foulcommit_number == foulcommit_number),
        "attribute",
    ].values
    for match_id, foulcommit_number in zip(match_ids, indices)
]

In [66]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
1,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
2,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
3,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
4,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
5,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
6,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
7,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
8,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id
9,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,player2,subtype,player1,sortorder,team,n,type,id


They look the same. So I use them for the dataframe.

In [67]:
# Distinct matches that have at least one foulcommit row.
all_match_ids = foulcommit_df.match_api_id.unique()

piv_df_list = []

# Iterate the matches with one groupby pass instead of masking the whole frame
# once per match; sort=False keeps them in order of first appearance.
# The pivot is still done per match so that the column order produced by the
# later concat stays the same.
for match_id, test_df in foulcommit_df.groupby("match_api_id", sort=False):
    # Long -> wide: one row per foulcommit, one column per attribute tag of this match.
    piv_df = test_df.pivot(index="foulcommit_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [68]:
foulcommit_attributes_df = pd.concat(piv_df_list)

In [69]:
# Rename the XML <id> column, then turn the foulcommit_number index into the
# first column and fall back to a plain 0..n-1 index.
foulcommit_attributes_df = (
    foulcommit_attributes_df
    .rename(columns={"id": "foulcommit_id"})
    .rename_axis(index="foulcommit_number")
    .reset_index()
)

In [70]:
foulcommit_attributes_df.nunique()

foulcommit_number            55
match_api_id               8465
elapsed                      91
elapsed_plus                 16
event_incident_typefk        59
foulcommit_id            219234
n                          1135
player1                    5618
player2                    5589
sortorder                    53
stats                         2
subtype                      13
team                        178
type                          1
card_type                     2
del                           1
coordinates                   1
venue                         2
injury_time                   2
dtype: int64

In [71]:
foulcommit_attributes_df.head()

Unnamed: 0,foulcommit_number,match_api_id,elapsed,elapsed_plus,event_incident_typefk,foulcommit_id,n,player1,player2,sortorder,stats,subtype,team,type,card_type,del,coordinates,venue,injury_time
0,1,489042,1,,37,378824,267,25518,32569,1,,,10261,foulcommit,,,,,
1,2,489042,2,,37,378826,277,30929,24157,0,,,10261,foulcommit,,,,,
2,3,489042,3,,37,378830,254,29581,24148,1,,,10261,foulcommit,,,,,
3,4,489042,5,,37,378841,256,30373,40565,0,,,10260,foulcommit,,,,,
4,5,489042,10,,320,378894,275,29581,30829,0,,pushing,10261,foulcommit,,,,,


### Link Table from Match to Attributes

In [72]:
# The link table needs the match_id as rows, the foulcommits as columns and
# the corresponding foulcommit_ids as values.

# Highest foulcommit number that occurs in any game = number of columns needed.
max_foulcommits = foulcommit_df["foulcommit_number"].max()
print(max_foulcommits)

55


In [73]:
# Wide link table: one row per match, one column per foulcommit number,
# the cells hold the foulcommit ids.
match_foulcommit_df = foulcommit_attributes_df.pivot(
    index="match_api_id",
    columns="foulcommit_number",
    values="foulcommit_id",
)

In [74]:
match_foulcommit_df

foulcommit_number,1,2,3,4,5,6,7,8,9,10,...,46,47,48,49,50,51,52,53,54,55
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489042,378824,378826,378830,378841,378894,378913,378974,379035,379044,379100,...,,,,,,,,,,
489043,375531,375541,375551,375564,375581,375593,375631,375643,375648,375672,...,,,,,,,,,,
489044,377817,377834,377837,377853,377872,377875,377887,377899,377921,377926,...,,,,,,,,,,
489045,376007,376023,376070,376081,376170,376178,376336,376378,376437,376582,...,,,,,,,,,,
489046,378496,378514,378544,378570,378579,378616,378636,378648,378656,378662,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5607553,5607599,5607839,5608058,5608152,5608276,5608299,5608486,5608617,5608658,...,,,,,,,,,,
2060643,5607753,5607785,5607820,5607884,5608000,5608032,5608213,5608311,5608576,5608596,...,,,,,,,,,,
2060644,5605781,5605844,5606043,5606094,5606193,5606221,5606249,5606270,5606273,5606338,...,,,,,,,,,,
2060645,5614254,5614318,5614371,5614408,5614430,5614534,5614559,5614585,5614730,5614818,...,,,,,,,,,,


In [75]:
# Flatten the XML of the `card` column into long format:
# one row per (match, card, child tag).
match_ids, card_number, attribute, values = [], [], [], []

for match_id, xml_text in zip(xml_cols.match_api_id, xml_cols.card):
    # Cards are numbered 1..n per match in document order; an element
    # without children consumes a number but yields no rows.
    for event_no, event in enumerate(ET.fromstring(xml_text), start=1):
        for field in event:
            match_ids.append(match_id)
            card_number.append(event_no)
            attribute.append(field.tag)
            # NOTE(review): only the direct text of the tag is stored;
            # children nested below it are not expanded.
            values.append(field.text)

# long-format dataframe
card_df = pd.DataFrame({
    "match_api_id": match_ids,
    "card_number": card_number,
    "attribute": attribute,
    "value": values,
})


In [76]:
# Get the longest set of attributes of a single card for the next table.

# Number of attribute rows of every single card. One groupby over the long
# table replaces the nested boolean-mask scans, which re-filtered the whole
# frame once per match; sort=False keeps the order of first appearance.
card_attr_lengths = (
    card_df
    .groupby(["match_api_id", "card_number"], sort=False)
    .size()
    .tolist()
)

In [77]:
max_card = max(card_attr_lengths)
print(max_card)

13


In [78]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Find events whose number of attribute rows equals ``max_value``.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format table with a ``match_api_id`` column and the event-number
        column named by ``column``; every row is one attribute of one event.
    column : str
        Name of the event-number column (e.g. ``"card_number"``).
    max_value : int
        Row count an event must have to be reported.
    limit : int or None, default 10
        Stop after this many hits; ``None`` collects all of them.

    Returns
    -------
    tuple of (list, list)
        Parallel lists holding the ``match_api_id`` and the value of
        ``column`` of every hit, in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby splits the frame once instead of re-filtering it for every match
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        rows_per_event = match_rows.groupby(column, sort=False).size()

        for val, length in rows_per_event.items():
            if length == max_value:
                # Report the event number itself (not its position among the
                # unique values): callers filter `attr_df[column] == number`,
                # and the two differ as soon as a number is missing.
                indices.append(val)
                match_ids.append(match_id)
                if limit is not None and len(indices) >= limit:
                    return match_ids, indices
    return match_ids, indices
            
        

In [79]:
match_ids, indices = find_out_index(card_df, "card_number", max_card)

In [80]:
match_ids, indices

([489047,
  489048,
  489049,
  489051,
  489140,
  489143,
  489148,
  489150,
  489150,
  489150],
 [4, 3, 3, 1, 4, 1, 6, 5, 6, 7])

In [81]:
# Retrieve the attribute names of every sampled card so they can be compared.
compare_cols_list = [
    card_df.loc[
        (card_df.match_api_id == match_id) & (card_df.card_number == card_number),
        "attribute",
    ].values
    for match_id, card_number in zip(match_ids, indices)
]

In [82]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
1,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
2,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
3,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
4,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
5,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
6,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
7,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
8,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id
9,comment,stats,elapsed_plus,event_incident_typefk,elapsed,card_type,subtype,player1,sortorder,team,n,type,id


The attribute sets of the sampled events look the same, so I use them as the columns of the dataframe.

In [83]:
all_match_ids = card_df.match_api_id.unique()

# One wide frame per match: rows are the card numbers, columns the attribute names.
# Iterating a single groupby splits the long table once, instead of re-filtering
# the whole frame with a boolean mask for every match; sort=False keeps the
# matches in order of first appearance (same order as `unique()` above).
piv_df_list = []

for match_id, match_rows in card_df.groupby("match_api_id", sort=False):
    piv_df = match_rows.pivot(index="card_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [84]:
card_attributes_df = pd.concat(piv_df_list)

In [85]:
# Guarded so that re-running this cell is a no-op; without the guard a second
# run fails because `insert` refuses to add an already existing column.
if "card_number" not in card_attributes_df.columns:
    card_attributes_df = card_attributes_df.rename(columns={"id": "card_id"})
    # copy the current index into an explicit first column ...
    card_attributes_df.insert(0, "card_number", card_attributes_df.index)
    # ... and replace the index by a plain running row number
    card_attributes_df = card_attributes_df.reset_index(drop=True)

In [86]:
card_attributes_df.nunique()

card_number                 19
match_api_id             13776
card_type                    3
comment                      3
elapsed                     92
event_incident_typefk       37
card_id                  62114
n                          988
player1                   5865
sortorder                   47
stats                        2
subtype                     13
team                       202
type                         1
elapsed_plus                10
del                          1
goal_type                    1
dtype: int64

In [87]:
card_attributes_df.head()

Unnamed: 0,card_number,match_api_id,card_type,comment,elapsed,event_incident_typefk,card_id,n,player1,sortorder,stats,subtype,team,type,elapsed_plus,del,goal_type
0,1,489042,y,y,78,73,379481,342,24157,1,,serious_fouls,10260,card,,,
1,2,489042,y,y,82,73,379503,346,30362,1,,serious_fouls,10260,card,,,
2,3,489042,y,y,90,70,379547,353,30829,1,,,10260,card,,,
3,1,489044,y,y,56,73,377978,327,37442,5,,serious_fouls,8650,card,,,
4,2,489044,y,y,90,25,378060,353,46621,3,,stall_time,8650,card,,,


### Link Table from Match to Attributes

In [88]:
# The link table needs the match_id as rows, the cards as columns and
# the corresponding card_ids as values.

# Highest card number that occurs in any game = number of columns needed.
max_cards = card_df["card_number"].max()
print(max_cards)

19


In [89]:
# Wide link table: one row per match, one column per card number,
# the cells hold the card ids.
match_card_df = card_attributes_df.pivot(
    index="match_api_id",
    columns="card_number",
    values="card_id",
)

In [90]:
match_card_df

card_number,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
489042,379481,379503,379547,,,,,,,,,,,,,,,,
489044,377978,378060,,,,,,,,,,,,,,,,,
489045,376643,376951,377290,,,,,,,,,,,,,,,,
489046,378719,,,,,,,,,,,,,,,,,,
489047,376743,377153,377390,377658,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5608061,5608304,5609156,5609272,5609274,5609457,5609461,5609992,,,,,,,,,,,
2060643,5608036,5609116,5609169,5609302,,,,,,,,,,,,,,,
2060644,5606114,5606224,5606343,5606378,5606798,,,,,,,,,,,,,,
2060645,5614588,5614665,5614732,,,,,,,,,,,,,,,,


In [91]:
# Flatten the XML of the `cross` column into long format:
# one row per (match, cross, child tag).
match_ids, cross_number, attribute, values = [], [], [], []

for match_id, xml_text in zip(xml_cols.match_api_id, xml_cols.cross):
    # Crosses are numbered 1..n per match in document order; an element
    # without children consumes a number but yields no rows.
    for event_no, event in enumerate(ET.fromstring(xml_text), start=1):
        for field in event:
            match_ids.append(match_id)
            cross_number.append(event_no)
            attribute.append(field.tag)
            # NOTE(review): only the direct text of the tag is stored;
            # children nested below it are not expanded.
            values.append(field.text)

# long-format dataframe
cross_df = pd.DataFrame({
    "match_api_id": match_ids,
    "cross_number": cross_number,
    "attribute": attribute,
    "value": values,
})


In [92]:
# Get the longest set of attributes of a single cross for the next table.

# Number of attribute rows of every single cross. One groupby over the long
# table replaces the nested boolean-mask scans, which re-filtered the whole
# frame once per match; sort=False keeps the order of first appearance.
cross_attr_lengths = (
    cross_df
    .groupby(["match_api_id", "cross_number"], sort=False)
    .size()
    .tolist()
)

In [93]:
max_cross = max(cross_attr_lengths)
print(max_cross)

12


In [94]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Find events whose number of attribute rows equals ``max_value``.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format table with a ``match_api_id`` column and the event-number
        column named by ``column``; every row is one attribute of one event.
    column : str
        Name of the event-number column (e.g. ``"cross_number"``).
    max_value : int
        Row count an event must have to be reported.
    limit : int or None, default 10
        Stop after this many hits; ``None`` collects all of them.

    Returns
    -------
    tuple of (list, list)
        Parallel lists holding the ``match_api_id`` and the value of
        ``column`` of every hit, in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby splits the frame once instead of re-filtering it for every match
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        rows_per_event = match_rows.groupby(column, sort=False).size()

        for val, length in rows_per_event.items():
            if length == max_value:
                # Report the event number itself (not its position among the
                # unique values): callers filter `attr_df[column] == number`,
                # and the two differ as soon as a number is missing.
                indices.append(val)
                match_ids.append(match_id)
                if limit is not None and len(indices) >= limit:
                    return match_ids, indices
    return match_ids, indices
            
        

In [95]:
match_ids, indices = find_out_index(cross_df, "cross_number", max_cross)

In [96]:
match_ids, indices

([1723982,
  1723982,
  1723982,
  1723982,
  1723982,
  1723982,
  1723983,
  1723984,
  1723984,
  1723984],
 [17, 18, 19, 39, 40, 41, 22, 15, 28, 29])

In [97]:
# Retrieve the attribute names of every sampled cross so they can be compared.
compare_cols_list = [
    cross_df.loc[
        (cross_df.match_api_id == match_id) & (cross_df.cross_number == cross_number),
        "attribute",
    ].values
    for match_id, cross_number in zip(match_ids, indices)
]

In [98]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
1,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
2,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
3,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
4,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
5,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
6,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
7,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
8,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
9,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id


The attribute sets of the sampled events look the same, so I use them as the columns of the dataframe.

In [99]:
all_match_ids = cross_df.match_api_id.unique()

# One wide frame per match: rows are the cross numbers, columns the attribute names.
# Iterating a single groupby splits the long table once, instead of re-filtering
# the whole frame with a boolean mask for every match; sort=False keeps the
# matches in order of first appearance (same order as `unique()` above).
piv_df_list = []

for match_id, match_rows in cross_df.groupby("match_api_id", sort=False):
    piv_df = match_rows.pivot(index="cross_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [100]:
cross_attributes_df = pd.concat(piv_df_list)

In [101]:
# Guarded so that re-running this cell is a no-op; without the guard a second
# run fails because `insert` refuses to add an already existing column.
if "cross_number" not in cross_attributes_df.columns:
    cross_attributes_df = cross_attributes_df.rename(columns={"id": "cross_id"})
    # copy the current index into an explicit first column ...
    cross_attributes_df.insert(0, "cross_number", cross_attributes_df.index)
    # ... and replace the index by a plain running row number
    cross_attributes_df = cross_attributes_df.reset_index(drop=True)

In [102]:
cross_attributes_df.nunique()

cross_number                 89
match_api_id               8465
elapsed                      91
elapsed_plus                 13
event_incident_typefk        15
cross_id                 284775
n                          1135
player1                    5137
sortorder                    58
stats                         2
subtype                       1
team                        178
type                          3
del                           1
spectators                    1
coordinates                   1
goal_type                     1
dtype: int64

In [103]:
cross_attributes_df.head()

Unnamed: 0,cross_number,match_api_id,elapsed,elapsed_plus,event_incident_typefk,cross_id,n,player1,sortorder,stats,subtype,team,type,del,spectators,coordinates,goal_type
0,1,489042,7,,7,378863,265,30829,1,,cross,10260,cross,,,,
1,2,489042,14,,7,378921,255,24148,0,,cross,10260,cross,,,,
2,3,489042,19,,329,378960,280,38807,0,,cross,10261,corner,,,,
3,4,489042,19,,7,378970,292,30929,5,,cross,10261,cross,,,,
4,5,489042,20,,7,378978,259,24148,0,,cross,10260,cross,,,,


### Link Table from Match to Attributes

In [104]:
# The link table needs the match_id as rows, the crosses as columns and
# the corresponding cross_ids as values.

# Highest cross number that occurs in any game = number of columns needed.
max_crosss = cross_df["cross_number"].max()
print(max_crosss)

89


In [105]:
# Wide link table: one row per match, one column per cross number,
# the cells hold the cross ids.
match_cross_df = cross_attributes_df.pivot(
    index="match_api_id",
    columns="cross_number",
    values="cross_id",
)

In [106]:
match_cross_df

cross_number,1,2,3,4,5,6,7,8,9,10,...,80,81,82,83,84,85,86,87,88,89
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489042,378863,378921,378960,378970,378978,378993,378996,379012,379018,379068,...,,,,,,,,,,
489043,375536,375538,375566,375568,375570,375583,375586,375600,375602,375606,...,,,,,,,,,,
489044,377780,377784,377798,377802,377818,377821,377832,377867,377870,377877,...,,,,,,,,,,
489045,376055,376102,376147,376227,376235,376259,376293,376320,376347,376355,...,,,,,,,,,,
489046,378471,378479,378505,378539,378548,378557,378562,378603,378671,378677,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5607580,5607593,5607637,5607641,5607648,5607727,5607742,5607845,5607857,5607964,...,,,,,,,,,,
2060643,5607552,5607604,5607636,5607928,5608059,5608121,5608224,5608501,5608587,5608965,...,,,,,,,,,,
2060644,5605855,5605935,5606246,5606269,5606288,5606295,5606304,5606322,5606347,5606520,...,,,,,,,,,,
2060645,5614328,5614376,5614488,5614506,5614519,5614836,5615016,5615069,5615081,5615107,...,,,,,,,,,,


In [107]:
# Flatten the XML of the `corner` column into long format:
# one row per (match, corner, child tag).
match_ids, corner_number, attribute, values = [], [], [], []

for match_id, xml_text in zip(xml_cols.match_api_id, xml_cols.corner):
    # Corners are numbered 1..n per match in document order; an element
    # without children consumes a number but yields no rows.
    for event_no, event in enumerate(ET.fromstring(xml_text), start=1):
        for field in event:
            match_ids.append(match_id)
            corner_number.append(event_no)
            attribute.append(field.tag)
            # NOTE(review): only the direct text of the tag is stored;
            # children nested below it are not expanded.
            values.append(field.text)

# long-format dataframe
corner_df = pd.DataFrame({
    "match_api_id": match_ids,
    "corner_number": corner_number,
    "attribute": attribute,
    "value": values,
})


In [108]:
# Get the longest set of attributes of a single corner for the next table.

# Number of attribute rows of every single corner. One groupby over the long
# table replaces the nested boolean-mask scans, which re-filtered the whole
# frame once per match; sort=False keeps the order of first appearance.
corner_attr_lengths = (
    corner_df
    .groupby(["match_api_id", "corner_number"], sort=False)
    .size()
    .tolist()
)

In [109]:
max_corner = max(corner_attr_lengths)
print(max_corner)

12


In [110]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Find events whose number of attribute rows equals ``max_value``.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format table with a ``match_api_id`` column and the event-number
        column named by ``column``; every row is one attribute of one event.
    column : str
        Name of the event-number column (e.g. ``"corner_number"``).
    max_value : int
        Row count an event must have to be reported.
    limit : int or None, default 10
        Stop after this many hits; ``None`` collects all of them.

    Returns
    -------
    tuple of (list, list)
        Parallel lists holding the ``match_api_id`` and the value of
        ``column`` of every hit, in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby splits the frame once instead of re-filtering it for every match
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        rows_per_event = match_rows.groupby(column, sort=False).size()

        for val, length in rows_per_event.items():
            if length == max_value:
                # Report the event number itself (not its position among the
                # unique values): callers filter `attr_df[column] == number`,
                # and the two differ as soon as a number is missing.
                indices.append(val)
                match_ids.append(match_id)
                if limit is not None and len(indices) >= limit:
                    return match_ids, indices
    return match_ids, indices
            
        

In [111]:
match_ids, indices = find_out_index(corner_df, "corner_number", max_corner)

In [112]:
match_ids, indices

([1723982,
  1723982,
  1723982,
  1723985,
  1723987,
  1723989,
  1723989,
  1723989,
  1723990,
  1723991],
 [5, 11, 12, 6, 2, 3, 9, 10, 9, 13])

In [113]:
# Retrieve the attribute names of every sampled corner so they can be compared.
compare_cols_list = [
    corner_df.loc[
        (corner_df.match_api_id == match_id) & (corner_df.corner_number == corner_number),
        "attribute",
    ].values
    for match_id, corner_number in zip(match_ids, indices)
]

In [114]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
1,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
2,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
3,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
4,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
5,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
6,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
7,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
8,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id
9,stats,elapsed_plus,event_incident_typefk,coordinates,elapsed,subtype,player1,sortorder,team,n,type,id


The attribute sets of the sampled events look the same, so I use them as the columns of the dataframe.

In [115]:
all_match_ids = corner_df.match_api_id.unique()

# One wide frame per match: rows are the corner numbers, columns the attribute names.
# Iterating a single groupby splits the long table once, instead of re-filtering
# the whole frame with a boolean mask for every match; sort=False keeps the
# matches in order of first appearance (same order as `unique()` above).
piv_df_list = []

for match_id, match_rows in corner_df.groupby("match_api_id", sort=False):
    piv_df = match_rows.pivot(index="corner_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [116]:
corner_attributes_df = pd.concat(piv_df_list)

In [117]:
# Guarded so that re-running this cell is a no-op; without the guard a second
# run fails because `insert` refuses to add an already existing column.
if "corner_number" not in corner_attributes_df.columns:
    corner_attributes_df = corner_attributes_df.rename(columns={"id": "corner_id"})
    # copy the current index into an explicit first column ...
    corner_attributes_df.insert(0, "corner_number", corner_attributes_df.index)
    # ... and replace the index by a plain running row number
    corner_attributes_df = corner_attributes_df.reset_index(drop=True)

In [118]:
corner_attributes_df.nunique()

corner_number               25
match_api_id              8464
elapsed                     90
event_incident_typefk       13
corner_id                87839
n                         1038
player1                   2956
sortorder                   44
stats                        1
subtype                      6
team                       178
type                         1
elapsed_plus                12
del                          1
spectators                   1
coordinates                  1
dtype: int64

In [119]:
corner_attributes_df.head()

Unnamed: 0,corner_number,match_api_id,elapsed,event_incident_typefk,corner_id,n,player1,sortorder,stats,subtype,team,type,elapsed_plus,del,spectators,coordinates
0,1,489042,19,329,378960,280,38807,0,,cross,10261,corner,,,,
1,2,489042,22,330,378992,263,40565,0,,short,10261,corner,,,,
2,3,489042,22,329,378996,293,38807,4,,cross,10261,corner,,,,
3,4,489042,48,329,379352,300,24154,2,,cross,10260,corner,,,,
4,5,489042,51,329,379366,301,38807,0,,cross,10261,corner,,,,


### Link Table from Match to Attributes

In [120]:
# The link table needs the match_id as rows, the corners as columns and
# the corresponding corner_ids as values.

# Highest corner number that occurs in any game = number of columns needed.
max_corners = corner_df["corner_number"].max()
print(max_corners)

25


In [121]:
# Wide link table: one row per match, one column per corner number,
# the cells hold the corner ids.
match_corner_df = corner_attributes_df.pivot(
    index="match_api_id",
    columns="corner_number",
    values="corner_id",
)

In [122]:
match_corner_df

corner_number,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489042,378960,378992,378996,379352,379366,379403,379411,379420,379429,379435,...,,,,,,,,,,
489043,375535,375538,375562,375586,375602,375609,375621,375626,375657,375682,...,,,,,,,,,,
489044,377879,377892,377906,377910,377914,377931,377972,378001,378019,,...,,,,,,,,,,
489045,376147,376227,376235,376259,376320,376355,376655,376811,376957,377005,...,377593,,,,,,,,,
489046,378471,378479,378505,378539,378557,378782,378790,378854,378869,378946,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5607653,5608012,5608256,5608461,5608517,5608651,5608969,5608990,5609070,5609379,...,,,,,,,,,,
2060643,5608094,5608550,5609666,5609825,5610007,,,,,,...,,,,,,,,,,
2060644,5605911,5606238,5606284,5606287,5606333,5606518,5606561,5606624,5606702,5606796,...,,,,,,,,,,
2060645,5614419,5614473,5614575,5614623,5614690,5615360,5615626,,,,...,,,,,,,,,,


In [123]:
# Flatten the XML of the `possession` column into long format:
# one row per (match, possession entry, child tag).
match_ids, possession_number, attribute, values = [], [], [], []

for match_id, xml_text in zip(xml_cols.match_api_id, xml_cols.possession):
    # Entries are numbered 1..n per match in document order; an element
    # without children consumes a number but yields no rows.
    for event_no, event in enumerate(ET.fromstring(xml_text), start=1):
        for field in event:
            match_ids.append(match_id)
            possession_number.append(event_no)
            attribute.append(field.tag)
            # NOTE(review): only the direct text of the tag is stored;
            # children nested below it are not expanded.
            values.append(field.text)

# long-format dataframe
possession_df = pd.DataFrame({
    "match_api_id": match_ids,
    "possession_number": possession_number,
    "attribute": attribute,
    "value": values,
})


In [124]:
# Get the longest set of attributes of a single possession for the next table.

# Number of attribute rows of every single possession entry. One groupby over
# the long table replaces the nested boolean-mask scans, which re-filtered the
# whole frame once per match; sort=False keeps the order of first appearance.
possession_attr_lengths = (
    possession_df
    .groupby(["match_api_id", "possession_number"], sort=False)
    .size()
    .tolist()
)

In [125]:
max_possession = max(possession_attr_lengths)
print(max_possession)

12


In [126]:
def find_out_index(attr_df, column, max_value, limit=10):
    """Find events whose number of attribute rows equals ``max_value``.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Long-format table with a ``match_api_id`` column and the event-number
        column named by ``column``; every row is one attribute of one event.
    column : str
        Name of the event-number column (e.g. ``"possession_number"``).
    max_value : int
        Row count an event must have to be reported.
    limit : int or None, default 10
        Stop after this many hits; ``None`` collects all of them.

    Returns
    -------
    tuple of (list, list)
        Parallel lists holding the ``match_api_id`` and the value of
        ``column`` of every hit, in order of first appearance.
    """
    match_ids, indices = [], []

    # groupby splits the frame once instead of re-filtering it for every match
    for match_id, match_rows in attr_df.groupby("match_api_id", sort=False):
        rows_per_event = match_rows.groupby(column, sort=False).size()

        for val, length in rows_per_event.items():
            if length == max_value:
                # Report the event number itself (not its position among the
                # unique values): callers filter `attr_df[column] == number`,
                # and the two differ as soon as a number is missing.
                indices.append(val)
                match_ids.append(match_id)
                if limit is not None and len(indices) >= limit:
                    return match_ids, indices
    return match_ids, indices
            
        

In [127]:
match_ids, indices = find_out_index(possession_df, "possession_number", max_possession)

In [128]:
match_ids, indices

([1724244,
  1724244,
  1724312,
  1724314,
  1724315,
  1724316,
  1724316,
  1724317,
  1724317,
  1724318],
 [2, 4, 2, 4, 2, 2, 4, 2, 4, 4])

In [129]:
# Retrieve the attribute names of every sampled possession entry so they can be compared.
compare_cols_list = [
    possession_df.loc[
        (possession_df.match_api_id == match_id) & (possession_df.possession_number == possession_number),
        "attribute",
    ].values
    for match_id, possession_number in zip(match_ids, indices)
]

In [130]:
compare_attr = pd.DataFrame(compare_cols_list)
compare_attr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
1,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
2,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
3,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
4,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
5,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
6,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
7,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
8,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id
9,comment,stats,elapsed_plus,event_incident_typefk,elapsed,subtype,sortorder,awaypos,homepos,n,type,id


The attribute sets of the sampled events look the same, so I use them as the columns of the dataframe.

In [131]:
all_match_ids = possession_df.match_api_id.unique()

# One wide frame per match: rows are the possession numbers, columns the attribute names.
# Iterating a single groupby splits the long table once, instead of re-filtering
# the whole frame with a boolean mask for every match; sort=False keeps the
# matches in order of first appearance (same order as `unique()` above).
piv_df_list = []

for match_id, match_rows in possession_df.groupby("match_api_id", sort=False):
    piv_df = match_rows.pivot(index="possession_number", columns="attribute", values="value")
    piv_df.insert(0, "match_api_id", match_id)
    piv_df_list.append(piv_df)

In [132]:
possession_attributes_df = pd.concat(piv_df_list)

In [133]:
# Guarded so that re-running this cell is a no-op; without the guard a second
# run fails because `insert` refuses to add an already existing column.
if "possession_number" not in possession_attributes_df.columns:
    possession_attributes_df = possession_attributes_df.rename(columns={"id": "possession_id"})
    # copy the current index into an explicit first column ...
    possession_attributes_df.insert(0, "possession_number", possession_attributes_df.index)
    # ... and replace the index by a plain running row number
    possession_attributes_df = possession_attributes_df.reset_index(drop=True)

In [134]:
possession_attributes_df.nunique()

possession_number           21
match_api_id              8419
awaypos                     84
comment                    197
elapsed                     87
elapsed_plus                13
event_incident_typefk        1
homepos                     84
possession_id            34815
n                          975
sortorder                   39
subtype                      1
type                         1
goal_type                    1
card_type                    1
injury_time                  2
del                          1
stats                        1
dtype: int64

In [135]:
# Preview the first rows of the possession attribute table.
possession_attributes_df.head()

Unnamed: 0,possession_number,match_api_id,awaypos,comment,elapsed,elapsed_plus,event_incident_typefk,homepos,possession_id,n,sortorder,subtype,type,goal_type,card_type,injury_time,del,stats
0,1,489042,44,56,25,,352,56,379029,68,1,possession,special,,,,,
1,2,489042,46,54,45,1.0,352,54,379251,117,4,possession,special,,,,,
2,3,489042,46,54,70,,352,54,379443,190,0,possession,special,,,,,
3,4,489042,45,55,90,5.0,352,55,379575,252,1,possession,special,,,,,
4,1,489043,35,65,27,,352,65,375608,67,0,possession,special,,,,,


### Link Table from Match to Attributes

In [136]:
# Goal: a link table with one row per match, one column per possession number,
# and the corresponding possession_id in each cell.

# Largest possession number that occurs in the data (max possessions per game).
max_possessions = possession_df["possession_number"].max()
print(max_possessions)

21


In [137]:
# One row per match, one column per possession number; each cell holds the
# possession_id of that entry (NaN where a match has no entry for that number).
match_possession_df = possession_attributes_df.pivot(
    index="match_api_id",
    columns="possession_number",
    values="possession_id",
)

In [138]:
# Display the match-to-possession link table.
match_possession_df

possession_number,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489042,379029,379251,379443,379575,,,,,,,...,,,,,,,,,,
489043,375608,375663,375787,375909,,,,,,,...,,,,,,,,,,
489044,377876,377944,378011,378069,,,,,,,...,,,,,,,,,,
489045,376411,376739,377303,377621,,,,,,,...,,,,,,,,,,
489046,378668,378781,379034,379254,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060642,5608142,5608712,5609410,5610127,,,,,,,...,,,,,,,,,,
2060643,5608011,5608629,5609436,5610130,,,,,,,...,,,,,,,,,,
2060644,5606248,5606406,5606745,5606990,,,,,,,...,,,,,,,,,,
2060645,5614522,5614838,5615241,5615729,,,,,,,...,,,,,,,,,,


#### Save tables as csv

In [146]:
# Preview the possession attribute table once more before it is written to CSV.
possession_attributes_df.head()

Unnamed: 0,possession_number,match_api_id,awaypos,comment,elapsed,elapsed_plus,event_incident_typefk,homepos,possession_id,n,sortorder,subtype,type,goal_type,card_type,injury_time,del,stats
0,1,489042,44,56,25,,352,56,379029,68,1,possession,special,,,,,
1,2,489042,46,54,45,1.0,352,54,379251,117,4,possession,special,,,,,
2,3,489042,46,54,70,,352,54,379443,190,0,possession,special,,,,,
3,4,489042,45,55,90,5.0,352,55,379575,252,1,possession,special,,,,,
4,1,489043,35,65,27,,352,65,375608,67,0,possession,special,,,,,


In [139]:
# Write every per-event attribute table to its own CSV file, named
# "<event>_df.csv", in the Business_Satelite_Data folder.
attribute_tables = {
    "goal": goal_attributes_df,
    "shoton": shoton_attributes_df,
    "shotoff": shotoff_attributes_df,
    "foulcommit": foulcommit_attributes_df,
    "card": card_attributes_df,
    "cross": cross_attributes_df,
    "corner": corner_attributes_df,
    "possession": possession_attributes_df,
}

for event_name, attributes_df in attribute_tables.items():
    attributes_df.to_csv(f"../Business_Satelite_Data/{event_name}_df.csv")