# Soccer data

Two soccer data sets pulled from Kaggle.

CSV soccer data set: https://www.kaggle.com/secareanualin/football-events

SQLITE soccer data set: https://www.kaggle.com/hugomathien/soccer 

## CSV soccer data set

In [190]:
import pandas as pd 
import csv
import numpy as np

In [103]:
# Import dictionary.txt and make a nested dictionary of the form
#   {column_name: {integer_code: label}}
# The file is read as tab-separated rows. A row whose first field is one of the
# names in `dicts` opens a new section; every other row with at least two
# fields is stored in the current section as int(code) -> label.
dicts = ['event_type','event_type2','side','shot_place','shot_outcome','location','bodypart','assist_method','situation']

event = {}
key = None  # section currently being filled; None until the first header row

# The context manager closes the file even if a row fails to parse.
with open('Resources/dictionary.txt') as file:
    for row in csv.reader(file, delimiter='\t'):
        if not row:
            continue  # blank separator line
        if row[0] in dicts:
            # Header row: start a fresh section for this column name.
            key = row[0]
            event[key] = {}
        elif len(row) > 1 and key is not None:
            # Data row: integer code in the first field, label in the second.
            # Rows that appear before any header have no section and are skipped.
            event[key][int(row[0])] = row[1]

event

{'event_type': {0: 'Announcement',
  1: 'Attempt',
  2: 'Corner',
  3: 'Foul',
  4: 'Yellow card',
  5: 'Second yellow card',
  6: 'Red card',
  7: 'Substitution',
  8: 'Free kick won',
  9: 'Offside',
  10: 'Hand ball',
  11: 'Penalty conceded'},
 'event_type2': {12: 'Key Pass',
  13: 'Failed through ball',
  14: 'Sending off',
  15: 'Own goal'},
 'side': {1: 'Home', 2: 'Away'},
 'shot_place': {1: 'Bit too high',
  2: 'Blocked',
  3: 'Bottom left corner',
  4: 'Bottom right corner',
  5: 'Centre of the goal',
  6: 'High and wide',
  7: 'Hits the bar',
  8: 'Misses to the left',
  9: 'Misses to the right',
  10: 'Too high',
  11: 'Top centre of the goal',
  12: 'Top left corner',
  13: 'Top right corner'},
 'shot_outcome': {1: 'On target',
  2: 'Off target',
  3: 'Blocked',
  4: 'Hit the bar'},
 'location': {1: 'Attacking half',
  2: 'Defensive half',
  3: 'Centre of the box',
  4: 'Left wing',
  5: 'Right wing',
  6: 'Difficult angle and long range',
  7: 'Difficult angle on the left'

In [97]:
# Load the events CSV into the `data` dataframe and display it
events_csv = "Resources/events.csv"
data = pd.read_csv(events_csv)
data

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,,6.0,2.0,0,9.0,2.0,1,1.0,0
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
3,UFot0hit/,UFot0hit4,4,7,Foul by Sven Bender (Borussia Dortmund).,3,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
4,UFot0hit/,UFot0hit5,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,8,,2,Hamburg SV,Borussia Dortmund,...,,,,,0,2.0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941004,z5L2OT5E/,z5L2OT5E123,123,92,Lucas Torreira (Sampdoria) wins a free kick in...,8,,2,Sampdoria,Atalanta,...,,,,,0,2.0,,0,,0
941005,z5L2OT5E/,z5L2OT5E124,124,93,"Corner, Sampdoria. Conceded by Andrea Masiello.",2,,2,Sampdoria,Atalanta,...,,,,,0,,,0,,0
941006,z5L2OT5E/,z5L2OT5E125,125,93,Attempt missed. Fabio Quagliarella (Sampdoria)...,1,12.0,2,Sampdoria,Atalanta,...,,,8.0,2.0,0,9.0,1.0,1,3.0,0
941007,z5L2OT5E/,z5L2OT5E126,126,94,Alberto Grassi (Atalanta) wins a free kick on ...,8,,1,Atalanta,Sampdoria,...,,,,,0,4.0,,0,,0


In [98]:
# Data cleaning
# Replace the integer codes in the coded columns of `data` with the readable
# labels from the `event` lookup (event[column][code] -> label).
coded_columns = ['event_type', 'event_type2', 'side', 'shot_place', 'shot_outcome',
                 'location', 'bodypart', 'assist_method', 'situation']

for col in coded_columns:
    # Only translate columns that still hold numeric codes. Series.map turns
    # every value that is not a key of the dict into NaN, so mapping an
    # already-translated (string) column again would blank it out when this
    # cell is re-run.
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].map(event[col])
data

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,Attempt,Key Pass,Away,Hamburg SV,Borussia Dortmund,...,,,High and wide,Off target,0,Left side of the box,left foot,Pass,Open play,0
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",Corner,,Home,Borussia Dortmund,Hamburg SV,...,,,,,0,,,,,0
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",Corner,,Home,Borussia Dortmund,Hamburg SV,...,,,,,0,,,,,0
3,UFot0hit/,UFot0hit4,4,7,Foul by Sven Bender (Borussia Dortmund).,Foul,,Home,Borussia Dortmund,Hamburg SV,...,,,,,0,,,,,0
4,UFot0hit/,UFot0hit5,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,Free kick won,,Away,Hamburg SV,Borussia Dortmund,...,,,,,0,Defensive half,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941004,z5L2OT5E/,z5L2OT5E123,123,92,Lucas Torreira (Sampdoria) wins a free kick in...,Free kick won,,Away,Sampdoria,Atalanta,...,,,,,0,Defensive half,,,,0
941005,z5L2OT5E/,z5L2OT5E124,124,93,"Corner, Sampdoria. Conceded by Andrea Masiello.",Corner,,Away,Sampdoria,Atalanta,...,,,,,0,,,,,0
941006,z5L2OT5E/,z5L2OT5E125,125,93,Attempt missed. Fabio Quagliarella (Sampdoria)...,Attempt,Key Pass,Away,Sampdoria,Atalanta,...,,,Misses to the left,Off target,0,Left side of the box,right foot,Pass,Corner,0
941007,z5L2OT5E/,z5L2OT5E126,126,94,Alberto Grassi (Atalanta) wins a free kick on ...,Free kick won,,Home,Atalanta,Sampdoria,...,,,,,0,Left wing,,,,0


In [104]:
# Load the ginf CSV into the `metadata` dataframe and display it
ginf_csv = "Resources/ginf.csv"
metadata = pd.read_csv(ginf_csv)
metadata

Unnamed: 0,id_odsp,link_odsp,adv_stats,date,league,season,country,ht,at,fthg,ftag,odd_h,odd_d,odd_a,odd_over,odd_under,odd_bts,odd_bts_n
0,UFot0hit/,/soccer/germany/bundesliga-2011-2012/dortmund-...,True,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1,1.56,4.41,7.42,,,,
1,Aw5DflLH/,/soccer/germany/bundesliga-2011-2012/augsburg-...,True,2011-08-06,D1,2012,germany,FC Augsburg,SC Freiburg,2,2,2.36,3.60,3.40,,,,
2,bkjpaC6n/,/soccer/germany/bundesliga-2011-2012/werder-br...,True,2011-08-06,D1,2012,germany,Werder Bremen,Kaiserslautern,2,0,1.83,4.20,4.80,,,,
3,CzPV312a/,/soccer/france/ligue-1-2011-2012/paris-sg-lori...,True,2011-08-06,F1,2012,france,Paris Saint-Germain,Lorient,0,1,1.55,4.50,9.40,,,,
4,GUOdmtII/,/soccer/france/ligue-1-2011-2012/caen-valencie...,True,2011-08-06,F1,2012,france,Caen,Valenciennes,1,0,2.50,3.40,3.45,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10107,xAkY8l6R/,/soccer/italy/serie-a/genoa-crotone-xAkY8l6R/,True,2017-01-22,I1,2017,italy,Genoa,Crotone,2,2,1.97,4.35,8.00,1.95,2.03,2.03,1.86
10108,xSU9scI9/,/soccer/england/premier-league/chelsea-hull-ci...,True,2017-01-22,E0,2017,england,Chelsea,Hull,2,0,1.19,8.50,20.00,1.54,2.68,2.40,1.66
10109,xY7uZwOI/,/soccer/france/ligue-1/monaco-lorient-xY7uZwOI/,True,2017-01-22,F1,2017,france,AS Monaco,Lorient,4,0,1.32,6.24,11.50,1.53,3.08,1.80,2.25
10110,YyeGxMX8/,/soccer/spain/laliga/betis-gijon-YyeGxMX8/,True,2017-01-22,SP1,2017,spain,Real Betis,Sporting Gijon,0,0,1.74,4.07,5.90,2.20,1.89,2.05,1.86


## SQLITE soccer data set

In [105]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, select

In [107]:
# create engine to database.sqlite
# The URL uses a relative path, so the file is looked up as
# Resources/database.sqlite relative to the current working directory.
engine = create_engine("sqlite:///Resources/database.sqlite")

In [108]:
# reflect an existing database into a new model
# automap_base() returns a declarative base whose mapped classes are generated
# from the database schema rather than written by hand.
Base = automap_base()
# reflect the tables
# prepare() reads the schema through `engine`; the generated classes are then
# available under Base.classes.
Base.prepare(engine, reflect=True)

In [109]:
# View all of the classes that automap found
# (each name listed here is available as an attribute of Base.classes)
Base.classes.keys()

['Country',
 'League',
 'country',
 'Match',
 'Player',
 'Team',
 'Player_Attributes',
 'Team_Attributes']

In [175]:
# Load the whole Country table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM Country')
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
country = pd.DataFrame(results.fetchall(), columns=keys)
country

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


In [176]:
# Load the whole League table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM League')
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
league = pd.DataFrame(results.fetchall(), columns=keys)
league

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [181]:
# Load the whole Match table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM Match')
# `keys` stays a module-level list of the column names so it can be inspected
# in a later cell.
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
match = pd.DataFrame(results.fetchall(), columns=keys)
match

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.00,1.65,3.40,4.50,1.78,3.25,4.00,1.73,3.40,4.20
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.80,2.00,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.60
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.50,2.35,3.25,2.65,2.50,3.20,2.50,2.30,3.20,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.50,1.45,3.75,6.50,1.50,3.75,5.50,1.44,3.75,6.50
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.50,3.40,1.65,4.50,3.50,1.65,4.75,3.30,1.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,...,,,,,,,,,,
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,...,,,,,,,,,,
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,...,,,,,,,,,,
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,...,,,,,,,,,,


In [183]:
# Data cleaning: keep only the part of `date` before the first space,
# which drops the time of day and leaves the calendar date
match['date'] = match['date'].str.split(' ').str[0]
match

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17,492473,9987,9993,1,...,4.00,1.65,3.40,4.50,1.78,3.25,4.00,1.73,3.40,4.20
1,2,1,1,2008/2009,1,2008-08-16,492474,10000,9994,0,...,3.80,2.00,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.60
2,3,1,1,2008/2009,1,2008-08-16,492475,9984,8635,0,...,2.50,2.35,3.25,2.65,2.50,3.20,2.50,2.30,3.20,2.75
3,4,1,1,2008/2009,1,2008-08-17,492476,9991,9998,5,...,7.50,1.45,3.75,6.50,1.50,3.75,5.50,1.44,3.75,6.50
4,5,1,1,2008/2009,1,2008-08-16,492477,7947,9985,1,...,1.73,4.50,3.40,1.65,4.50,3.50,1.65,4.75,3.30,1.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,25975,24558,24558,2015/2016,9,2015-09-22,1992091,10190,10191,1,...,,,,,,,,,,
25975,25976,24558,24558,2015/2016,9,2015-09-23,1992092,9824,10199,1,...,,,,,,,,,,
25976,25977,24558,24558,2015/2016,9,2015-09-23,1992093,9956,10179,2,...,,,,,,,,,,
25977,25978,24558,24558,2015/2016,9,2015-09-22,1992094,7896,10243,0,...,,,,,,,,,,


In [179]:
# Show the column names currently held in `keys`, i.e. whatever the most
# recently executed query cell stored there
keys

['id',
 'country_id',
 'league_id',
 'season',
 'stage',
 'date',
 'match_api_id',
 'home_team_api_id',
 'away_team_api_id',
 'home_team_goal',
 'away_team_goal',
 'home_player_X1',
 'home_player_X2',
 'home_player_X3',
 'home_player_X4',
 'home_player_X5',
 'home_player_X6',
 'home_player_X7',
 'home_player_X8',
 'home_player_X9',
 'home_player_X10',
 'home_player_X11',
 'away_player_X1',
 'away_player_X2',
 'away_player_X3',
 'away_player_X4',
 'away_player_X5',
 'away_player_X6',
 'away_player_X7',
 'away_player_X8',
 'away_player_X9',
 'away_player_X10',
 'away_player_X11',
 'home_player_Y1',
 'home_player_Y2',
 'home_player_Y3',
 'home_player_Y4',
 'home_player_Y5',
 'home_player_Y6',
 'home_player_Y7',
 'home_player_Y8',
 'home_player_Y9',
 'home_player_Y10',
 'home_player_Y11',
 'away_player_Y1',
 'away_player_Y2',
 'away_player_Y3',
 'away_player_Y4',
 'away_player_Y5',
 'away_player_Y6',
 'away_player_Y7',
 'away_player_Y8',
 'away_player_Y9',
 'away_player_Y10',
 'away_player

In [170]:
# Load the whole Player table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM Player')
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
player = pd.DataFrame(results.fetchall(), columns=keys)
player

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154
...,...,...,...,...,...,...,...
11055,11071,26357,Zoumana Camara,2488,1979-04-03 00:00:00,182.88,168
11056,11072,111182,Zsolt Laczko,164680,1986-12-18 00:00:00,182.88,176
11057,11073,36491,Zsolt Low,111191,1979-04-29 00:00:00,180.34,154
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06 00:00:00,185.42,172


In [185]:
# Data cleaning: keep only the part of `birthday` before the first space,
# which drops the time of day and leaves the calendar date
player['birthday'] = player['birthday'].str.split(' ').str[0]
player

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08,182.88,154
...,...,...,...,...,...,...,...
11055,11071,26357,Zoumana Camara,2488,1979-04-03,182.88,168
11056,11072,111182,Zsolt Laczko,164680,1986-12-18,182.88,176
11057,11073,36491,Zsolt Low,111191,1979-04-29,180.34,154
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06,185.42,172


In [192]:
# Load the whole Team table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM Team')
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
team = pd.DataFrame(results.fetchall(), columns=keys)
team

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB
...,...,...,...,...,...
294,49479,10190,898.0,FC St. Gallen,GAL
295,49837,10191,1715.0,FC Thun,THU
296,50201,9777,324.0,Servette FC,SER
297,50204,7730,1862.0,FC Lausanne-Sports,LAU


In [172]:
# Load the whole Player_Attributes table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM Player_Attributes')
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
player_attributes = pd.DataFrame(results.fetchall(), columns=keys)
player_attributes

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,183974,102359,39902,2009-08-30 00:00:00,83.0,85.0,right,medium,low,84.0,...,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,183975,102359,39902,2009-02-22 00:00:00,78.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,183976,102359,39902,2008-08-30 00:00:00,77.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,183977,102359,39902,2007-08-30 00:00:00,78.0,81.0,right,medium,low,74.0,...,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


In [186]:
# Data cleaning: keep only the part of `date` before the first space,
# which drops the time of day and leaves the calendar date
player_attributes['date'] = player_attributes['date'].str.split(' ').str[0]
player_attributes

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,183974,102359,39902,2009-08-30,83.0,85.0,right,medium,low,84.0,...,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,183975,102359,39902,2009-02-22,78.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,183976,102359,39902,2008-08-30,77.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,183977,102359,39902,2007-08-30,78.0,81.0,right,medium,low,74.0,...,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


In [173]:
# Load the whole Team_Attributes table into a dataframe.
# The query is executed once: the same result object supplies both the column
# names and the rows, so no second, never-consumed result is left open.
results = engine.execute('SELECT * FROM Team_Attributes')
keys = list(results.keys())

# fetchall() returns every remaining row. The rows go straight into the
# dataframe instead of a temporary list named `data`, which would overwrite the
# events dataframe `data` loaded in the CSV section.
team_attributes = pd.DataFrame(results.fetchall(), columns=keys)
team_attributes

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1454,15005,10000,2011-02-22 00:00:00,52,Balanced,,Little,52,Mixed,...,53,Normal,Organised,46,Medium,48,Press,53,Normal,Cover
1454,1455,15005,10000,2012-02-22 00:00:00,54,Balanced,,Little,51,Mixed,...,50,Normal,Organised,44,Medium,55,Press,53,Normal,Cover
1455,1456,15005,10000,2013-09-20 00:00:00,54,Balanced,,Little,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1456,1457,15005,10000,2014-09-19 00:00:00,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


In [187]:
# Data cleaning: keep only the part of `date` before the first space,
# which drops the time of day and leaves the calendar date
team_attributes['date'] = team_attributes['date'].str.split(' ').str[0]
team_attributes

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1454,15005,10000,2011-02-22,52,Balanced,,Little,52,Mixed,...,53,Normal,Organised,46,Medium,48,Press,53,Normal,Cover
1454,1455,15005,10000,2012-02-22,54,Balanced,,Little,51,Mixed,...,50,Normal,Organised,44,Medium,55,Press,53,Normal,Cover
1455,1456,15005,10000,2013-09-20,54,Balanced,,Little,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1456,1457,15005,10000,2014-09-19,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


# Unifying both data sets

The two data sets share no obvious common key, so the goal of this section is to create auxiliary dataframes that can easily connect them. This will be done through teams for the metadata set and through players for the data set.

## Teams

In [254]:
# Pull the API id and the long name of every row of `team` as two parallel
# lists, nested in one list for easy indexing:
# team_info[0] holds the ids, team_info[1] the names
team_info = [team[col].tolist() for col in ('team_api_id', 'team_long_name')]

# Also expose each list under its own name
team_id, team_name = team_info

In [255]:
# Get all unique teams from the metadata dataframe
metadata_htname = metadata['ht'].unique().tolist()
metadata_atname = metadata['at'].unique().tolist()

# Create a list with two nested lists:
# common_teams[0] receives the team names here; common_teams[1] stays empty
# in this cell.
common_teams = [[], []]

# Find common teams in both data sets.
# Home names are visited first, then away names. A name is kept when it also
# appears in `team_name` and has not been collected yet. The name appended is
# always the one being tested, so an away-only team is recorded under its own
# name rather than under whichever home team shares its list position.
known_names = set(team_name)  # set gives constant-time membership tests
for name in metadata_htname + metadata_atname:
    if name in known_names and name not in common_teams[0]:
        common_teams[0].append(name)

In [256]:
# Fill out the list with the ids of the teams in common
# Build a name -> id lookup from the parallel lists in team_info
# (team_info[0] = ids, team_info[1] = names). setdefault keeps the first id
# seen for a name, so a repeated name cannot contribute two ids.
id_by_name = {}
for t_id, t_name in zip(team_info[0], team_info[1]):
    id_by_name.setdefault(t_name, t_id)

# Rebuild the id list from scratch instead of appending to it, so running this
# cell again does not duplicate the ids. Names without a known id are skipped.
common_teams[1] = [id_by_name[name] for name in common_teams[0] if name in id_by_name]

In [257]:
# Create the auxiliary dataframe: one row per common team,
# with its id (common_teams[1]) and its name (common_teams[0])
common_teams_data = pd.DataFrame({
    'team_id': common_teams[1],
    'team': common_teams[0],
})

common_teams_data

Unnamed: 0,team_id,team
0,9789,Borussia Dortmund
1,8406,FC Augsburg
2,9847,Paris Saint-Germain
3,8576,AC Ajaccio
4,10269,VfB Stuttgart
...,...,...
60,9891,Frosinone
61,8234,FC Ingolstadt 04
62,208931,Carpi
63,8549,Middlesbrough


In [252]:
# Sanity check: look up the `team` row whose team_api_id is 9847
# and compare its name with the auxiliary dataframe
team[team['team_api_id'].eq(9847)]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
70,9548,9847,73.0,Paris Saint-Germain,PSG


In [243]:
# Sanity check: look up the `team` row whose team_api_id is 9789
# and compare its name with the auxiliary dataframe
team[team['team_api_id'].eq(9789)]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
97,15620,9789,22.0,Borussia Dortmund,DOR


## Players