In [2]:
# import the necessary packages
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import glob
from datetime import datetime

# Limit how much of a DataFrame is rendered in cell outputs.
# The fully-qualified 'display.*' keys are used because the short forms
# ('max_columns', 'max_rows') are matched as patterns and become ambiguous
# (OptionError) on pandas versions that also define 'styler.render.max_*'.
pd.set_option('display.max_columns', 140)
pd.set_option('display.max_rows', 20)


In [3]:
# Set up the database connection with psycopg2
db = psycopg2.connect(dbname='r7', user='postgres')
cursor = db.cursor()

# Set up a database connection using SQLAlchemy.
# The dialect name must be 'postgresql': the legacy 'postgres://' alias was
# removed in SQLAlchemy 1.4 and fails to load there, while 'postgresql://'
# is accepted by old and new releases alike.
# NOTE(review): the user name and password are hard-coded in this URL;
# consider reading them from the environment before sharing the notebook.
engine = create_engine('postgresql://postgres:postgres@localhost:5432/r7')

In [4]:
# Define variables

# Define the input and output directory strings
input_string = '../_4_data_extracts/squad/'
output_string = '../_6_data_clean/'
# Backward-compatible alias for the original, misspelled variable name
output_strig = output_string

# Define schemas
schema1 = '_0_original_data'
schema2 = '_1_data_views'

### Load data

In [5]:
# Read the three source tables into DataFrames over the psycopg2 connection:
# the fixtures of tournament 1685, the per-team player stats view and the
# team id dictionary.
fixtures = pd.read_sql_query(
    sql="SELECT * FROM " + schema1 + ".match_fixtures WHERE tournament_id = '1685'",
    con=db,
)
stats = pd.read_sql_query(
    sql="SELECT * FROM " + schema2 + ".player_stats_tournament1685",
    con=db,
)
dic = pd.read_sql_query(
    sql="SELECT * FROM " + schema1 + ".team_id_dictionary",
    con=db,
)


In [6]:
# Preview the first five rows of the per-team stats table
stats.head(n=5)


Unnamed: 0,index,team,height_mean,height_max,height_min,weight_mean,weight_max,weight_min,age_mean,age_max,age_min,player_count
0,0,arg,183.692308,191,170,86.538462,100,75,24.513804,31.065753,20.654795,13
1,1,aus,184.769231,193,173,93.384615,104,82,25.943941,34.830137,20.580822,13
2,2,can,185.538462,195,178,94.307692,120,81,25.036881,30.468493,19.857534,13
3,3,eng,182.923077,195,174,91.846154,105,84,26.816017,33.326027,19.621918,13
4,4,fij,184.692308,195,172,92.153846,110,81,27.753846,32.136986,22.350685,13


### Merge data to create full table

In [7]:
# Merge stats table with dic: a left join keeps every stats row and adds the
# dictionary columns where the team code matches a country_code
stats_full = pd.merge(stats, dic, left_on='team', right_on='country_code', how='left')

# Drop the index_x / index_y columns that the merge produced
stats_full = stats_full.drop(columns=['index_x', 'index_y'])

# Clean teamid column: remove the run of six non-breaking spaces from the ids.
# The vectorised .str accessor passes NaN through untouched; the left join can
# leave teamid as NaN for a team with no dictionary match, and calling
# x.replace(...) on such a value would raise AttributeError.
stats_full['teamid'] = stats_full['teamid'].str.replace(
    '\xa0\xa0\xa0\xa0\xa0\xa0', '', regex=False
)

stats_full.head()

Unnamed: 0,team,height_mean,height_max,height_min,weight_mean,weight_max,weight_min,age_mean,age_max,age_min,player_count,country,teamid,country_code
0,arg,183.692308,191,170,86.538462,100,75,24.513804,31.065753,20.654795,13,ARGENTINA,2408,arg
1,aus,184.769231,193,173,93.384615,104,82,25.943941,34.830137,20.580822,13,AUSTRALIA,2409,aus
2,can,185.538462,195,178,94.307692,120,81,25.036881,30.468493,19.857534,13,CANADA,2410,can
3,eng,182.923077,195,174,91.846154,105,84,26.816017,33.326027,19.621918,13,ENGLAND,2412,eng
4,fij,184.692308,195,172,92.153846,110,81,27.753846,32.136986,22.350685,13,FIJI,2413,fij


In [8]:
# List the distinct team ids after cleaning
stats_full['teamid'].unique()

array(['2408', '2409', '2410', '2412', '2413', '2414', '2423', '2415',
       '2416', '3911', '2420', '2418', '2419', '2745', '2422', '3921'], dtype=object)

In [9]:
# Join the per-team stats onto every fixture twice: once matching team1id,
# then again matching team2id. Columns that clash in the second join receive
# pandas' default _x / _y suffixes.
df_full_temp = fixtures.merge(stats_full, how='left', left_on='team1id', right_on='teamid')
df_full = df_full_temp.merge(stats_full, how='left', left_on='team2id', right_on='teamid')

In [10]:
# Rename the merge suffixes: a trailing _x becomes _team1 and a trailing _y
# becomes _team2. Only the end of each name is rewritten, so a column that
# merely contains '_x' or '_y' in the middle of its name is left intact.
dict_columns = {}
for x in df_full.columns.values:
    if x.endswith('_x'):
        dict_columns[x] = x[:-len('_x')] + '_team1'
    elif x.endswith('_y'):
        dict_columns[x] = x[:-len('_y')] + '_team2'
    else:
        # Unaffected columns map to themselves so the dict covers every column
        dict_columns[x] = x
df_full = df_full.rename(columns=dict_columns)

In [11]:
# Remove the columns that are not needed in the final table
df_full = df_full.drop(columns=['index', 'attendance'])

In [12]:
# Preview the first five rows of the merged fixtures table
df_full.head(n=5)

Unnamed: 0,city,eventname,gmtdifference,group,kickoff,matchid,matchnumber,stadium,stage,status,team1id,team1name,team1score,team2id,team2name,team2score,tournament_id,team_team1,height_mean_team1,height_max_team1,height_min_team1,weight_mean_team1,weight_max_team1,weight_min_team1,age_mean_team1,age_max_team1,age_min_team1,player_count_team1,country_team1,teamid_team1,country_code_team1,team_team2,height_mean_team2,height_max_team2,height_min_team2,weight_mean_team2,weight_max_team2,weight_min_team2,age_mean_team2,age_max_team2,age_min_team2,player_count_team2,country_team2,teamid_team2,country_code_team2
0,Cape Town,HSBC World Rugby Sevens Series 2016-17- Cape Town,2.0,D,2016-12-10T10:15:00,24137,1,Cape Town Stadium,Pool,Match Completed,2419,Scotland 7s,21,2418,Samoa 7s,19,1685,sco,183.166667,193,177,93.083333,103,83,28.734247,32.972603,22.60274,12,SCOTLAND,2419,sco,sam,182.846154,189,175,91.153846,103,74,25.759958,29.663014,21.882192,13,SAMOA,2418,sam
1,Cape Town,HSBC World Rugby Sevens Series 2016-17- Cape Town,2.0,D,2016-12-10T10:37:00,24138,2,Cape Town Stadium,Pool,Match Completed,3921,Wales 7s,29,2745,Uganda 7s,7,1685,wal,184.333333,195,177,91.833333,102,81,23.844521,31.328767,19.961644,12,WALES,3921,wal,uga,176.583333,187,164,80.5,89,70,23.989954,28.331507,21.175342,12,UGANDA,2745,uga
2,Cape Town,HSBC World Rugby Sevens Series 2016-17- Cape Town,2.0,C,2016-12-10T10:59:00,24139,3,Cape Town Stadium,Pool,Match Completed,2416,New Zealand 7s,26,2408,Argentina 7s,12,1685,nzl,188.076923,194,180,95.538462,104,80,27.578082,35.016438,22.824658,13,NEW ZEALAND,2416,nzl,arg,183.692308,191,170,86.538462,100,75,24.513804,31.065753,20.654795,13,ARGENTINA,2408,arg
3,Cape Town,HSBC World Rugby Sevens Series 2016-17- Cape Town,2.0,C,2016-12-10T11:21:00,24140,4,Cape Town Stadium,Pool,Match Completed,2412,England 7s,33,2410,Canada 7s,10,1685,eng,182.923077,195,174,91.846154,105,84,26.816017,33.326027,19.621918,13,ENGLAND,2412,eng,can,185.538462,195,178,94.307692,120,81,25.036881,30.468493,19.857534,13,CANADA,2410,can
4,Cape Town,HSBC World Rugby Sevens Series 2016-17- Cape Town,2.0,B,2016-12-10T11:43:00,24141,5,Cape Town Stadium,Pool,Match Completed,2414,France 7s,14,2415,Kenya 7s,33,1685,fra,181.769231,194,169,89.230769,100,71,27.759326,37.454795,21.191781,13,FRANCE,2414,fra,ken,179.416667,189,155,92.583333,105,70,26.471005,33.989041,22.263014,12,KENYA,2415,ken


In [13]:
# Write the merged fixtures table to the clean-data output directory.
# The directory comes from the config cell (where the variable is spelled
# `output_strig`) instead of being repeated here as a literal.
df_full.to_csv(output_strig + 'fixtures_with_player_info.csv')

In [None]:
# Persist the merged frame to the database through the SQLAlchemy engine,
# replacing the table if it already exists
table_name = 'fixtures_with_player_info'
df_full.to_sql(name=table_name, con=engine, schema=schema1, if_exists='replace')

# Commit and release the psycopg2 connection
db.commit()
db.close()