# Soccer data

Two soccer data sets pulled from kaggle.

CSV soccer data set: https://www.kaggle.com/secareanualin/football-events

SQLITE soccer data set: https://www.kaggle.com/hugomathien/soccer 

## CSV soccer data set

In [106]:
import pandas as pd 
import csv
import numpy as np

In [110]:
# Import dictionary.txt and make a nested dictionary 
dicts = ['event_type','event_type2','side','shot_place','shot_outcome','location','bodypart','assist_method','situation']

file = open('Resources/dictionary.txt')
f = csv.reader(file,delimiter='\t')

event = {} 

key0 = None 

for row in f:
    if(len(row)>0):
        if(row[0] in dicts):
            key = row[0]
            event[key] = {}
        if(len(row)>1):
            event[key][int(row[0])] = row[1]

file.close()
event

{'event_type': {0: 'Announcement',
  1: 'Attempt',
  2: 'Corner',
  3: 'Foul',
  4: 'Yellow card',
  5: 'Second yellow card',
  6: 'Red card',
  7: 'Substitution',
  8: 'Free kick won',
  9: 'Offside',
  10: 'Hand ball',
  11: 'Penalty conceded'},
 'event_type2': {12: 'Key Pass',
  13: 'Failed through ball',
  14: 'Sending off',
  15: 'Own goal'},
 'side': {1: 'Home', 2: 'Away'},
 'shot_place': {1: 'Bit too high',
  2: 'Blocked',
  3: 'Bottom left corner',
  4: 'Bottom right corner',
  5: 'Centre of the goal',
  6: 'High and wide',
  7: 'Hits the bar',
  8: 'Misses to the left',
  9: 'Misses to the right',
  10: 'Too high',
  11: 'Top centre of the goal',
  12: 'Top left corner',
  13: 'Top right corner'},
 'shot_outcome': {1: 'On target',
  2: 'Off target',
  3: 'Blocked',
  4: 'Hit the bar'},
 'location': {1: 'Attacking half',
  2: 'Defensive half',
  3: 'Centre of the box',
  4: 'Left wing',
  5: 'Right wing',
  6: 'Difficult angle and long range',
  7: 'Difficult angle on the left'

In [286]:
# Import events CSV
data = pd.read_csv("Resources/events.csv")
data

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,,6.0,2.0,0,9.0,2.0,1,1.0,0
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
3,UFot0hit/,UFot0hit4,4,7,Foul by Sven Bender (Borussia Dortmund).,3,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
4,UFot0hit/,UFot0hit5,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,8,,2,Hamburg SV,Borussia Dortmund,...,,,,,0,2.0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941004,z5L2OT5E/,z5L2OT5E123,123,92,Lucas Torreira (Sampdoria) wins a free kick in...,8,,2,Sampdoria,Atalanta,...,,,,,0,2.0,,0,,0
941005,z5L2OT5E/,z5L2OT5E124,124,93,"Corner, Sampdoria. Conceded by Andrea Masiello.",2,,2,Sampdoria,Atalanta,...,,,,,0,,,0,,0
941006,z5L2OT5E/,z5L2OT5E125,125,93,Attempt missed. Fabio Quagliarella (Sampdoria)...,1,12.0,2,Sampdoria,Atalanta,...,,,8.0,2.0,0,9.0,1.0,1,3.0,0
941007,z5L2OT5E/,z5L2OT5E126,126,94,Alberto Grassi (Atalanta) wins a free kick on ...,8,,1,Atalanta,Sampdoria,...,,,,,0,4.0,,0,,0


In [287]:
# Data cleaning
# Use events dictionary to replace values in the data df 
data['event_type'] = data['event_type'].map(event['event_type'])
data['event_type2'] = data['event_type2'].map(event['event_type2'])
data['side'] = data['side'].map(event['side'])
data['shot_place'] = data['shot_place'].map(event['shot_place'])
data['shot_outcome'] = data['shot_outcome'].map(event['shot_outcome'])
data['location'] = data['location'].map(event['location'])
data['bodypart'] = data['bodypart'].map(event['bodypart'])
data['assist_method'] = data['assist_method'].map(event['assist_method'])
data['situation'] = data['situation'].map(event['situation'])

# Capitalize names
data['player'] = data['player'].str.title()
data['player2'] = data['player2'].str.title()
data

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,Attempt,Key Pass,Away,Hamburg SV,Borussia Dortmund,...,,,High and wide,Off target,0,Left side of the box,left foot,Pass,Open play,0
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",Corner,,Home,Borussia Dortmund,Hamburg SV,...,,,,,0,,,,,0
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",Corner,,Home,Borussia Dortmund,Hamburg SV,...,,,,,0,,,,,0
3,UFot0hit/,UFot0hit4,4,7,Foul by Sven Bender (Borussia Dortmund).,Foul,,Home,Borussia Dortmund,Hamburg SV,...,,,,,0,,,,,0
4,UFot0hit/,UFot0hit5,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,Free kick won,,Away,Hamburg SV,Borussia Dortmund,...,,,,,0,Defensive half,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941004,z5L2OT5E/,z5L2OT5E123,123,92,Lucas Torreira (Sampdoria) wins a free kick in...,Free kick won,,Away,Sampdoria,Atalanta,...,,,,,0,Defensive half,,,,0
941005,z5L2OT5E/,z5L2OT5E124,124,93,"Corner, Sampdoria. Conceded by Andrea Masiello.",Corner,,Away,Sampdoria,Atalanta,...,,,,,0,,,,,0
941006,z5L2OT5E/,z5L2OT5E125,125,93,Attempt missed. Fabio Quagliarella (Sampdoria)...,Attempt,Key Pass,Away,Sampdoria,Atalanta,...,,,Misses to the left,Off target,0,Left side of the box,right foot,Pass,Corner,0
941007,z5L2OT5E/,z5L2OT5E126,126,94,Alberto Grassi (Atalanta) wins a free kick on ...,Free kick won,,Home,Atalanta,Sampdoria,...,,,,,0,Left wing,,,,0


In [116]:
# Import ginf CSV
metadata = pd.read_csv("Resources/ginf.csv")
metadata

Unnamed: 0,id_odsp,link_odsp,adv_stats,date,league,season,country,ht,at,fthg,ftag,odd_h,odd_d,odd_a,odd_over,odd_under,odd_bts,odd_bts_n
0,UFot0hit/,/soccer/germany/bundesliga-2011-2012/dortmund-...,True,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1,1.56,4.41,7.42,,,,
1,Aw5DflLH/,/soccer/germany/bundesliga-2011-2012/augsburg-...,True,2011-08-06,D1,2012,germany,FC Augsburg,SC Freiburg,2,2,2.36,3.60,3.40,,,,
2,bkjpaC6n/,/soccer/germany/bundesliga-2011-2012/werder-br...,True,2011-08-06,D1,2012,germany,Werder Bremen,Kaiserslautern,2,0,1.83,4.20,4.80,,,,
3,CzPV312a/,/soccer/france/ligue-1-2011-2012/paris-sg-lori...,True,2011-08-06,F1,2012,france,Paris Saint-Germain,Lorient,0,1,1.55,4.50,9.40,,,,
4,GUOdmtII/,/soccer/france/ligue-1-2011-2012/caen-valencie...,True,2011-08-06,F1,2012,france,Caen,Valenciennes,1,0,2.50,3.40,3.45,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10107,xAkY8l6R/,/soccer/italy/serie-a/genoa-crotone-xAkY8l6R/,True,2017-01-22,I1,2017,italy,Genoa,Crotone,2,2,1.97,4.35,8.00,1.95,2.03,2.03,1.86
10108,xSU9scI9/,/soccer/england/premier-league/chelsea-hull-ci...,True,2017-01-22,E0,2017,england,Chelsea,Hull,2,0,1.19,8.50,20.00,1.54,2.68,2.40,1.66
10109,xY7uZwOI/,/soccer/france/ligue-1/monaco-lorient-xY7uZwOI/,True,2017-01-22,F1,2017,france,AS Monaco,Lorient,4,0,1.32,6.24,11.50,1.53,3.08,1.80,2.25
10110,YyeGxMX8/,/soccer/spain/laliga/betis-gijon-YyeGxMX8/,True,2017-01-22,SP1,2017,spain,Real Betis,Sporting Gijon,0,0,1.74,4.07,5.90,2.20,1.89,2.05,1.86


## SQLITE soccer data set

In [117]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, select

In [118]:
# create engine to database.sqlite
engine = create_engine("sqlite:///Resources/database.sqlite")

In [119]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [120]:
# View all of the classes that automap found
Base.classes.keys()

['Country',
 'League',
 'country',
 'Match',
 'Player',
 'Team',
 'Player_Attributes',
 'Team_Attributes']

In [121]:
results = engine.execute('SELECT * FROM Country')
keys = engine.execute('SELECT * FROM Country').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
country = pd.DataFrame(data, columns=column)
country

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


In [122]:
results = engine.execute('SELECT * FROM League')
keys = engine.execute('SELECT * FROM League').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
league = pd.DataFrame(data, columns=column)
league

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [255]:
results = engine.execute('SELECT * FROM Match')
keys = engine.execute('SELECT * FROM Match').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
match = pd.DataFrame(data, columns=column)
match

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1118,1,1,2012/2013,23,2013-01-19,1224160,9987,9986,3,...,8.00,1.36,5.00,8.50,1.36,4.50,7.00,1.40,4.50,6.50
1,1119,1,1,2012/2013,23,2013-01-19,1224161,9991,9985,0,...,2.50,2.62,3.40,2.62,2.50,3.30,2.50,2.62,3.25,2.50
2,1120,1,1,2012/2013,23,2013-01-19,1224162,1773,9994,2,...,2.60,2.70,3.40,2.50,2.60,3.25,2.45,2.62,3.25,2.50
3,1121,1,1,2012/2013,23,2013-01-19,1224163,8571,9993,4,...,4.80,1.73,3.60,5.00,1.70,3.40,4.60,1.73,3.40,4.50
4,1122,1,1,2012/2013,23,2013-01-19,1224164,9989,8475,0,...,3.50,1.95,3.40,3.80,2.00,3.10,3.60,2.00,3.30,3.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,1114,1,1,2012/2013,22,2012-12-26,1224156,9994,9984,3,...,4.33,1.73,3.90,4.60,1.72,3.40,4.50,1.73,3.60,4.33
25975,1115,1,1,2012/2013,22,2012-12-26,1224157,8203,9991,1,...,2.80,2.55,3.40,2.75,2.40,3.30,2.60,2.50,3.25,2.62
25976,1116,1,1,2012/2013,22,2012-12-26,1224158,8475,9987,1,...,1.90,4.10,3.75,1.85,4.00,3.30,1.83,4.00,3.40,1.83
25977,1117,1,1,2012/2013,23,2013-02-27,1224159,8635,10000,0,...,7.00,1.44,4.80,7.50,1.40,4.20,6.75,1.40,4.33,7.00


In [257]:
# Checking for duplicates
match = match.drop_duplicates(subset=['match_api_id'], keep='first')
match

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1118,1,1,2012/2013,23,2013-01-19,1224160,9987,9986,3,...,8.00,1.36,5.00,8.50,1.36,4.50,7.00,1.40,4.50,6.50
1,1119,1,1,2012/2013,23,2013-01-19,1224161,9991,9985,0,...,2.50,2.62,3.40,2.62,2.50,3.30,2.50,2.62,3.25,2.50
2,1120,1,1,2012/2013,23,2013-01-19,1224162,1773,9994,2,...,2.60,2.70,3.40,2.50,2.60,3.25,2.45,2.62,3.25,2.50
3,1121,1,1,2012/2013,23,2013-01-19,1224163,8571,9993,4,...,4.80,1.73,3.60,5.00,1.70,3.40,4.60,1.73,3.40,4.50
4,1122,1,1,2012/2013,23,2013-01-19,1224164,9989,8475,0,...,3.50,1.95,3.40,3.80,2.00,3.10,3.60,2.00,3.30,3.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,1114,1,1,2012/2013,22,2012-12-26,1224156,9994,9984,3,...,4.33,1.73,3.90,4.60,1.72,3.40,4.50,1.73,3.60,4.33
25975,1115,1,1,2012/2013,22,2012-12-26,1224157,8203,9991,1,...,2.80,2.55,3.40,2.75,2.40,3.30,2.60,2.50,3.25,2.62
25976,1116,1,1,2012/2013,22,2012-12-26,1224158,8475,9987,1,...,1.90,4.10,3.75,1.85,4.00,3.30,1.83,4.00,3.40,1.83
25977,1117,1,1,2012/2013,23,2013-02-27,1224159,8635,10000,0,...,7.00,1.44,4.80,7.50,1.40,4.20,6.75,1.40,4.33,7.00


In [126]:
results = engine.execute('SELECT * FROM Player')
keys = engine.execute('SELECT * FROM Player').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
player = pd.DataFrame(data, columns=column)
player

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154
...,...,...,...,...,...,...,...
11055,11071,26357,Zoumana Camara,2488,1979-04-03 00:00:00,182.88,168
11056,11072,111182,Zsolt Laczko,164680,1986-12-18 00:00:00,182.88,176
11057,11073,36491,Zsolt Low,111191,1979-04-29 00:00:00,180.34,154
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06 00:00:00,185.42,172


In [127]:
# Data cleaning, getting rid of the hours in birthday column
player['birthday'] = player['birthday'].str.split(' ',expand=True)[0]
player

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08,182.88,154
...,...,...,...,...,...,...,...
11055,11071,26357,Zoumana Camara,2488,1979-04-03,182.88,168
11056,11072,111182,Zsolt Laczko,164680,1986-12-18,182.88,176
11057,11073,36491,Zsolt Low,111191,1979-04-29,180.34,154
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06,185.42,172


In [128]:
# Checking for duplicates
player = player.drop_duplicates(subset=['player_name'], keep='first')
player

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08,182.88,154
...,...,...,...,...,...,...,...
11055,11071,26357,Zoumana Camara,2488,1979-04-03,182.88,168
11056,11072,111182,Zsolt Laczko,164680,1986-12-18,182.88,176
11057,11073,36491,Zsolt Low,111191,1979-04-29,180.34,154
11058,11074,35506,Zurab Khizanishvili,47058,1981-10-06,185.42,172


In [129]:
results = engine.execute('SELECT * FROM Team')
keys = engine.execute('SELECT * FROM Team').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
team = pd.DataFrame(data, columns=column)
team

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB
...,...,...,...,...,...
294,49479,10190,898.0,FC St. Gallen,GAL
295,49837,10191,1715.0,FC Thun,THU
296,50201,9777,324.0,Servette FC,SER
297,50204,7730,1862.0,FC Lausanne-Sports,LAU


In [130]:
# Checking for duplicates
team = team.drop_duplicates(subset=['team_long_name'], keep='first')
team

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB
...,...,...,...,...,...
294,49479,10190,898.0,FC St. Gallen,GAL
295,49837,10191,1715.0,FC Thun,THU
296,50201,9777,324.0,Servette FC,SER
297,50204,7730,1862.0,FC Lausanne-Sports,LAU


In [131]:
results = engine.execute('SELECT * FROM Player_Attributes')
keys = engine.execute('SELECT * FROM Player_Attributes').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
player_attributes = pd.DataFrame(data, columns=column)
player_attributes

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,183974,102359,39902,2009-08-30 00:00:00,83.0,85.0,right,medium,low,84.0,...,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,183975,102359,39902,2009-02-22 00:00:00,78.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,183976,102359,39902,2008-08-30 00:00:00,77.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,183977,102359,39902,2007-08-30 00:00:00,78.0,81.0,right,medium,low,74.0,...,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


In [132]:
# Data cleaning, getting rid of the hours in date column
player_attributes['date'] = player_attributes['date'].str.split(' ',expand=True)[0]
player_attributes

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183973,183974,102359,39902,2009-08-30,83.0,85.0,right,medium,low,84.0,...,88.0,83.0,22.0,31.0,30.0,9.0,20.0,84.0,20.0,20.0
183974,183975,102359,39902,2009-02-22,78.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183975,183976,102359,39902,2008-08-30,77.0,80.0,right,medium,low,74.0,...,88.0,70.0,32.0,31.0,30.0,9.0,20.0,73.0,20.0,20.0
183976,183977,102359,39902,2007-08-30,78.0,81.0,right,medium,low,74.0,...,88.0,53.0,28.0,32.0,30.0,9.0,20.0,73.0,20.0,20.0


In [133]:
results = engine.execute('SELECT * FROM Team_Attributes')
keys = engine.execute('SELECT * FROM Team_Attributes').keys()

data = []
column = []

for row in results:
    data.append(row)
    
for row in keys:
    column.append(row)
    
team_attributes = pd.DataFrame(data, columns=column)
team_attributes

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1454,15005,10000,2011-02-22 00:00:00,52,Balanced,,Little,52,Mixed,...,53,Normal,Organised,46,Medium,48,Press,53,Normal,Cover
1454,1455,15005,10000,2012-02-22 00:00:00,54,Balanced,,Little,51,Mixed,...,50,Normal,Organised,44,Medium,55,Press,53,Normal,Cover
1455,1456,15005,10000,2013-09-20 00:00:00,54,Balanced,,Little,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1456,1457,15005,10000,2014-09-19 00:00:00,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


In [134]:
# Data cleaning, getting rid of the hours in date column
team_attributes['date'] = team_attributes['date'].str.split(' ',expand=True)[0]
team_attributes

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1454,15005,10000,2011-02-22,52,Balanced,,Little,52,Mixed,...,53,Normal,Organised,46,Medium,48,Press,53,Normal,Cover
1454,1455,15005,10000,2012-02-22,54,Balanced,,Little,51,Mixed,...,50,Normal,Organised,44,Medium,55,Press,53,Normal,Cover
1455,1456,15005,10000,2013-09-20,54,Balanced,,Little,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1456,1457,15005,10000,2014-09-19,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


# Unifiying both data sets

Each data set has a hard time to find a link in between each other, the goal of this section is to create auxiliar dataframes that can easily connect both data sets. This will be done through teams for the metadata set and players for the data set.

## Teams table

In [135]:
# Get teams and ids from the team dataframe
team_id = team['team_api_id'].tolist()
team_name = team['team_long_name'].tolist()

# Insert both lists into two nested lists for easy indexing
team_info = [team_id,team_name]

In [136]:
# To make sure there are no duplicates, both list's length have to be equal
print(len(team_id))
print(len(team_name))

296
296


In [137]:
# Get all unique teams from the metadata dataframe
metadata_htname = metadata['ht'].unique().tolist()
metadata_atname = metadata['at'].unique().tolist()

# Find common teams in both data sets
unique_teams = list(set(metadata_htname + metadata_atname))
common_teams = list(set(unique_teams) & set(team_name))

# Create a list with two nested lists
common_teams_data = []
common_teams_data.append(common_teams)
common_teams_data.append([None] * len(common_teams))

In [138]:
# Fill out the list with the ids of the teams in common
for x in range(len(common_teams_data[0])):
    for y in range(len(team_info[1])) :
        if common_teams_data[0][x] == team_info[1][y]:
            common_teams_data[1][x] = team_info[0][y]

In [139]:
# Create auxiliar dataframe
common_teams_df = pd.DataFrame(columns=['team_id','team'])
common_teams_df['team_id'] = common_teams_data[1]
common_teams_df['team'] = common_teams_data[0]

common_teams_df

Unnamed: 0,team_id,team
0,8696,Racing Santander
1,9798,Reading
2,9857,Bologna
3,9836,Dijon FCO
4,10281,Real Valladolid
...,...,...
60,8678,Bournemouth
61,9876,Hellas Verona
62,8295,Karlsruher SC
63,9875,Napoli


In [140]:
# Making sure that the team and id match
team.loc[team['team_long_name'] == 'Liverpool']

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
30,3462,8650,9.0,Liverpool,LIV


In [141]:
# Making sure that the team and id match
team.loc[team['team_api_id'] == 9804]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
140,20529,9804,54.0,Torino,TOR


## Players table

In [142]:
# Get players and ids from the team dataframe
player_name = player['player_name'].tolist()
player_id = player['player_api_id'].tolist()

# Insert both lists into two nested lists for easy indexing
player_info = [player_id,player_name]

In [143]:
# To make sure there are no duplicates, both list's length have to be equal
print(len(player_id))
print(len(player_name))

10848
10848


In [147]:
# Get all unique players from data dataframe
data_player1 = data['player'].unique().tolist()
data_player2 = data['player2'].unique().tolist()

In [148]:
# Find unique players from both lists
unique_players = list(set(data_player1 + data_player2))
common_players = list(set(unique_players) & set(player_name))

common_players_info = []
common_players_info.append(common_players)
common_players_info.append([None] * len(common_players))

In [149]:
for x in range(len(common_players_info[0])):
    for y in range(len(player_info[1])):
        if common_players_info[0][x] == player_info[1][y]:
            common_players_info[1][x] = player_info[0][y]

In [150]:
# Create auxiliar dataframe
common_players_data = pd.DataFrame(columns=['player_id','player'])
common_players_data['player_id'] = common_players_info[1]
common_players_data['player'] = common_players_info[0]

common_players_data

Unnamed: 0,player_id,player
0,212511,Bruno Martins Indi
1,71560,Sidney Sam
2,26675,Eljero Elia
3,40176,Chinedu Obasi
4,35988,Christian Lell
...,...,...
3989,237631,Valentin Eysseric
3990,143403,Danilo
3991,24211,Richard Dunne
3992,29340,Marco Sailer


In [151]:
player.loc[player['player_api_id'] == 127894]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
4160,4164,127894,Hector Rodas,192596,1988-03-07,190.5,176


In [152]:
player.loc[player['player_api_id'] == 129763]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
6241,6248,129763,Luca Ceccarelli,172494,1983-03-24,182.88,154


## Data clean up to conserve the integrity of the data set

Rows from the match df have to be deleted since some teams are not present in the team df, initially I wanted to add those teams to the team df but there is no way of knowing what are the names of the teams missing, hence removing them is the only option to keep the integrity of the data set

In [233]:
# Find teams that are not present in the team df
home_different = match['home_team_api_id'].isin(team['team_api_id'])
home_different

0        True
1        True
2        True
3        True
4        True
         ... 
25974    True
25975    True
25976    True
25977    True
25978    True
Name: home_team_api_id, Length: 25979, dtype: bool

In [235]:
away_different = match['away_team_api_id'].isin(team['team_api_id'])
away_different

0        True
1        True
2        True
3        True
4        True
         ... 
25974    True
25975    True
25976    True
25977    True
25978    True
Name: away_team_api_id, Length: 25979, dtype: bool

In [234]:
# All index of teams that are in match df but not in team df
teams_not_present = home_different[home_different == False].index
teams_not_present

Int64Index([ 1259,  1276,  1283,  1300,  1304,  1324,  1341,  1348,  1364,
             1379,
            ...
            17479, 17504, 17525, 17544, 17554, 17569, 17596, 17612, 17624,
            17639],
           dtype='int64', length=195)

In [236]:
teams_not_present2 = away_different[away_different == False].index
teams_not_present2

Int64Index([ 1254,  1267,  1293,  1318,  1335,  1357,  1369,  1384,  1404,
             1408,
            ...
            17492, 17508, 17520, 17535, 17550, 17565, 17583, 17588, 17608,
            17629],
           dtype='int64', length=195)

In [242]:
# Append both series together and get the unique indexes
uncommon_teams = teams_not_present.append(teams_not_present2).unique()
uncommon_teams

Int64Index([ 1259,  1276,  1283,  1300,  1304,  1324,  1341,  1348,  1364,
             1379,
            ...
            17492, 17508, 17520, 17535, 17550, 17565, 17583, 17588, 17608,
            17629],
           dtype='int64', length=382)

In [262]:
# Get match_id from the indexes 
diff_teams = match[match['id'].isin(uncommon_teams)]
diff_teams

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
153,1254,1,1,2014/2015,1,2014-07-27,1717822,8203,9987,3,...,2.6,2.60,3.25,2.55,,,,,,
158,1259,1,1,2014/2015,10,2014-10-04,1717891,9986,8571,0,...,,2.05,3.40,3.75,,,,,,
168,1267,1,1,2014/2015,11,2014-10-18,1717899,9991,9986,2,...,,1.60,4.00,6.00,,,,,,
179,1276,1,1,2014/2015,12,2014-10-25,1717908,10000,9984,1,...,,1.67,3.80,5.25,,,,,,
188,1283,1,1,2014/2015,13,2014-10-29,1717915,8573,10001,4,...,,1.95,3.50,3.80,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16469,17608,15722,15722,2015/2016,5,2015-08-14,1994653,8021,2182,2,...,,,,,,,,,,
16473,17612,15722,15722,2015/2016,6,2015-08-22,1994657,8025,8033,1,...,,,,,,,,,,
16485,17624,15722,15722,2015/2016,7,2015-08-30,1994669,177361,2182,3,...,,,,,,,,,,
16490,17629,15722,15722,2015/2016,8,2015-09-14,1994674,8019,10265,1,...,,,,,,,,,,


In [263]:
# Get match_id from the indexes
diff_teams_s = diff_teams['home_team_api_id']
diff_teams_s = diff_teams_s.append(diff_teams['away_team_api_id'])
diff_teams_s = diff_teams_s.unique()
diff_teams_s

array([  8203,   9986,   9991,  10000,   8573,   9989,  10001,   8475,
         8571,   9985,   9984,   9987,   8635,   9997,   1773,   8342,
         9994,   8030,  10265,   8021,   8025,   8569,   2186,   8322,
         8673,   1601,   8020,   8031,   2183,   8245,   2182,   8024,
         1957,   8033,   8244,   8023,   8028,   8027,   8019,   8242,
       177361], dtype=int64)

In [264]:
# Delete rows with teams not present in team df
match = match[~match['home_team_api_id'].isin(diff_teams_s)]
match = match[~match['away_team_api_id'].isin(diff_teams_s)]
match

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
12,1127,1,1,2012/2013,24,2013-01-26,1224169,9993,9998,0,...,,,,,,,,,,
143,1235,1,1,2012/2013,9,2012-09-29,1224053,9998,9993,1,...,,,,,,,,,,
725,1729,1729,1729,2008/2009,1,2008-08-17,489042,10260,10261,1,...,10.00,1.28,5.5,12.00,1.30,4.75,10.00,1.29,4.50,11.0
726,4798,4769,4769,2008/2009,11,2008-10-29,483238,9873,9851,0,...,3.00,2.65,2.8,2.65,2.75,2.90,2.65,2.60,2.88,2.7
728,1730,1729,1729,2008/2009,1,2008-08-16,489043,9825,8659,1,...,12.00,1.25,6.0,13.00,1.22,5.50,13.00,1.22,5.00,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25189,362,1,1,2009/2010,16,2009-11-28,665596,9993,9999,3,...,9.50,1.30,4.5,8.50,1.30,4.50,9.00,1.30,4.50,8.5
25459,631,1,1,2010/2011,22,2010-12-29,838750,9993,6351,0,...,3.75,1.95,3.5,3.80,1.95,3.30,3.50,1.91,3.30,3.5
25566,738,1,1,2010/2011,7,2010-09-18,838547,6351,9993,0,...,2.30,2.88,3.3,2.40,2.75,3.20,2.35,2.75,3.20,2.3
25620,792,1,1,2011/2012,13,2011-11-05,1032791,9993,9998,2,...,,,,,,,,,,


In [267]:
match[match['home_team_api_id'] == 274581]

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA


# Inserting tables into Postgresql

In [284]:
import os

# dotenv adds .env variables to the environment
from dotenv import load_dotenv

# Load variables
load_dotenv()
key = os.environ['KEY']

# Create engine and connect to PostgreSQL
engine = create_engine('postgresql://postgres:'+key+'@localhost:5432/soccer_data')
connection = engine.connect()

## Tables from the CSV data

In [290]:
# Insert metadata dataframe into PostgreSQL
metadata.to_sql('event', con=engine, if_exists='replace', index=False)

# Set id_odsp as primary keys
connection.execute('ALTER TABLE event ADD CONSTRAINT event_constraint PRIMARY KEY(id_odsp);')

# Change date format
connection.execute('ALTER TABLE event ALTER COLUMN date TYPE DATE USING date::date;')

<sqlalchemy.engine.result.ResultProxy at 0x20930fc6b80>

In [288]:
# Insert data dataframe into PostgreSQL
data.to_sql('event_data', con=engine, if_exists='replace', index=False)

# Set id_odsp and id_event as primary keys
connection.execute('ALTER TABLE event_data ADD CONSTRAINT event_moment PRIMARY KEY(id_odsp,id_event);')

<sqlalchemy.engine.result.ResultProxy at 0x207c59acd90>

In [291]:
connection.execute('ALTER TABLE event_data ADD FOREIGN KEY(id_odsp) REFERENCES event(id_odsp)')

<sqlalchemy.engine.result.ResultProxy at 0x209311ef220>

## Tables from the SQLite data

In [170]:
# Insert country dataframe into PostgreSQL
country.to_sql('country', con=engine, if_exists='replace', index=False)

# Set id as primary key
connection.execute('ALTER TABLE country ADD PRIMARY KEY (id);')

<sqlalchemy.engine.result.ResultProxy at 0x207c5472100>

In [171]:
# Insert league dataframe into PostgreSQL
league.to_sql('league', con=engine, if_exists='replace', index=False)

# Set country_id as primary key
connection.execute('ALTER TABLE league ADD PRIMARY KEY (country_id);')

<sqlalchemy.engine.result.ResultProxy at 0x207c5472190>

In [172]:
# Insert player dataframe into PostgreSQL
player.to_sql('player', con=engine, if_exists='replace', index=False)

# Set player_api_id and id as primary keys
connection.execute('ALTER TABLE player ADD PRIMARY KEY (player_api_id);')

# Change birthday format
connection.execute('ALTER TABLE player ALTER COLUMN birthday TYPE DATE USING birthday::date;')

<sqlalchemy.engine.result.ResultProxy at 0x20799960fa0>

In [174]:
# Insert player_attributes dataframe into PostgreSQL
player_attributes.to_sql('player_attributes', con=engine, if_exists='replace', index=False)

# Set player_api_id and id as primary keys
connection.execute('ALTER TABLE player_attributes ADD PRIMARY KEY (id,player_api_id);')

# Change date format
connection.execute('ALTER TABLE player_attributes ALTER COLUMN date TYPE DATE USING date::date;')

<sqlalchemy.engine.result.ResultProxy at 0x207c6430fa0>

In [None]:
connection.execute('ALTER TABLE player_attributes ADD FOREIGN KEY(player_api_id) REFERENCES player(player_api_id)')

In [173]:
# Insert team dataframe into PostgreSQL
team.to_sql('team', con=engine, if_exists='replace', index=False)

# Set player_api_id and id as primary keys
connection.execute('ALTER TABLE team ADD PRIMARY KEY (team_api_id);')

<sqlalchemy.engine.result.ResultProxy at 0x207e38a58b0>

In [279]:
# Insert team_attributes dataframe into PostgreSQL
team_attributes.to_sql('team_attributes', con=engine, if_exists='replace', index=False)

# Set team_api_id and id as primary keys
connection.execute('ALTER TABLE team_attributes ADD PRIMARY KEY (id,team_api_id);')

# Change date format
connection.execute('ALTER TABLE team_attributes ALTER COLUMN date TYPE DATE USING date::date;')

<sqlalchemy.engine.result.ResultProxy at 0x2093a175d90>

In [None]:
connection.execute('ALTER TABLE team_attributes ADD FOREIGN KEY(team_api_id) REFERENCES team(team_api_id)')

In [268]:
# Insert match dataframe into PostgreSQL
match.to_sql('match', con=engine, if_exists='replace', index=False)

In [269]:
# Set id as primary key
connection.execute('ALTER TABLE match ADD PRIMARY KEY (match_api_id);')

# Change date format
connection.execute('ALTER TABLE match ALTER COLUMN date TYPE DATE USING date::date;')

<sqlalchemy.engine.result.ResultProxy at 0x20930518be0>

In [270]:
# Foreign keys
connection.execute('ALTER TABLE match ADD FOREIGN KEY(country_id) REFERENCES country(id)')

<sqlalchemy.engine.result.ResultProxy at 0x20930fc6cd0>

In [271]:
connection.execute('ALTER TABLE match ADD FOREIGN KEY(league_id) REFERENCES league(country_id)')

<sqlalchemy.engine.result.ResultProxy at 0x207fb743b20>

In [273]:
connection.execute('ALTER TABLE match ADD FOREIGN KEY(home_team_api_id) REFERENCES team(team_api_id)')

<sqlalchemy.engine.result.ResultProxy at 0x20931213cd0>

In [274]:
connection.execute('ALTER TABLE match ADD FOREIGN KEY(away_team_api_id) REFERENCES team(team_api_id)')

<sqlalchemy.engine.result.ResultProxy at 0x2093121dc10>

In [None]:
connection.execute('ALTER TABLE match ALTER COLUMN home_player_1 TYPE bigint USING home_player_1::bigint;')
connection.execute('ALTER TABLE match ADD FOREIGN KEY(home_player_1) REFERENCES player(player_api_id)')

## Auxiliar tables

In [65]:
# Insert common_teams_df dataframe into PostgreSQL
common_teams_df.to_sql('common_teams', con=engine, if_exists='replace', index=False)

# Set team_id as primary keys
connection.execute('ALTER TABLE common_teams ADD PRIMARY KEY (team_id);')

<sqlalchemy.engine.result.ResultProxy at 0x207c59b9c40>

In [66]:
# Set team as UNIQUE constraint, isolation level has to be autocommit, more info here: https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.isolation_level
connection.execution_options(isolation_level="AUTOCOMMIT").execute('CREATE UNIQUE INDEX CONCURRENTLY unique_team_id ON common_teams (team);')
connection.execute('ALTER TABLE common_teams ADD CONSTRAINT unique_team UNIQUE USING INDEX unique_team_id;')

<sqlalchemy.engine.result.ResultProxy at 0x207c6424850>

In [67]:
# Insert common_teams_df dataframe into PostgreSQL
common_players_data.to_sql('common_players', con=engine, if_exists='replace', index=False)

# Set player_id as primary keys
connection.execute('ALTER TABLE common_players ADD PRIMARY KEY (player_id);')

<sqlalchemy.engine.result.ResultProxy at 0x207c7487730>

In [68]:
connection.execute('CREATE UNIQUE INDEX CONCURRENTLY unique_player_id ON common_players (player);')
connection.execute('ALTER TABLE common_players ADD CONSTRAINT unique_player UNIQUE USING INDEX unique_player_id;')

<sqlalchemy.engine.result.ResultProxy at 0x2078f2bf3d0>