In [1]:
# import the necessary packages
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import glob
from datetime import datetime
import urllib.request
import requests
import io


In [13]:
# Set max number of rows and and columns to view in one go
pd.set_option('max_columns', 40)
pd.set_option('max_rows', 100)


In [2]:
# Set up the database connection wiht Psycopg2
db = psycopg2.connect(dbname='r7', user='postgres')
cursor=db.cursor()

# Set up a database connection using sqlalchemy
engine = create_engine('postgres://postgres:postgres@localhost:5432/r7')

In [3]:
# Define variables

# Define the input and output strings
input_string = '../_4_data_extracts/squad/'
output_strig = '../_6_data_clean/'

# Define schemas
schema1 = '_0_original_data'
schema2 = '_1_data_views'

In [4]:
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema1)
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema2)
db.commit()

### Reading Match Fixtures into datafrmae

In [26]:
# Step 2: we run the function
def xml2df(xml_data):
    root = ET.XML(xml_data) # element tree
    all_records = []
    for child in enumerate(root):
        print(child)
        record = {}
        for subchild in child:
        #for subchild in root.iter():
            record[subchild.tag] = subchild.text
            all_records.append(record)
    return pd.DataFrame(all_records)

In [6]:
url = 'http://webservices.irb.com/EventInformation.asmx/MatchWithTimeline?uid=e9656db8-ffb5-4115-ac0b-cbd5688e6648&EventID=1693&matchid=24947'
response = urllib.request.urlopen(url)
data = response.read()      
text = data.decode('utf-8')

root = ET.XML(text) # element tree
element = root.find('MatchInformation')

print(element)

None


In [7]:
# List all event id's of the past two years
eventid = pd.read_sql_query('select eventid from _0_original_data.tournament_id_dictionary',db)
eventid_list = list(eventid.eventid.unique())

In [36]:
# Define an empty dataframe to append the different xml files to
player_profile_with_testbreakdown = pd.DataFrame()

eventid_list = ['1611']

for k in eventid_list:
    
    print(k)
    
    personid = pd.read_sql_query("select personid from _0_original_data.full_squads_info where tournament_id = '"+k+"'",db)
    personid_list = list(personid.personid.unique())
    
    personid_list = ['47808','48529']
    
    for n in personid_list:
    
        print(n)
    
        url = 'http://webservices.irb.com/EventInformation.asmx/PlayerProfileWithTestBreakdown?uid=e9656db8-ffb5-4115-ac0b-cbd5688e6648&eventid=1611&personid=47808'
        response = urllib.request.urlopen(url)
        data = response.read()      
        text = data.decode('utf-8') 
    
        # Create a df which equals the text object
        df = xml2df(text)

        print('we got this far')
        
        # Extract the tournament id from the file name
        tournament_id =  k
        person_id = n
        
        # Set a column equal to the file name
        df['tournament_id'] = tournament_id
        df['person_id'] = person_id
    
        # Clean the column headers 
        dict_columns={}
        for x in (df.columns.values):
            dict_columns[x] = x.lower().replace('{http://webservices.irb.com/}','')
        df_clean = df.rename(columns=dict_columns)
    
        # Append each individual dataframe to the full_fixtures df
        frames =(player_profile_with_testbreakdown,df_clean)
        player_profile_with_testbreakdown = pd.concat(frames)

1611
47808
we got this far
48529
we got this far


In [37]:
player_profile_with_testbreakdown.head()

Unnamed: 0,conversions,country,dropgoals,matchdate,matchid,penalties,playedagainst,playedfor,points,position,result,rolename,score,teamid,teamname,teamtype,tries,tournament_id,person_id
0,,Fiji,,,,,,,,,,Player,,2135,"Fiji Warriors, Fiji - Mens (Representative)",Representative,,1611,47808
1,,Fiji,,,,,,,,,,Player,,2135,"Fiji Warriors, Fiji - Mens (Representative)",Representative,,1611,47808
2,,Fiji,,,,,,,,,,Player,,2135,"Fiji Warriors, Fiji - Mens (Representative)",Representative,,1611,47808
3,,Fiji,,,,,,,,,,Player,,2135,"Fiji Warriors, Fiji - Mens (Representative)",Representative,,1611,47808
4,,Fiji,,,,,,,,,,Player,,2135,"Fiji Warriors, Fiji - Mens (Representative)",Representative,,1611,47808


In [9]:
# The final table includes 20 tournaments and has 3918 player names associated with the 20 different tournaments. 
player_profile_with_testbreakdown.tournament_id.nunique()
len(player_profile_with_testbreakdown)

3918

In [10]:
# Extract to CSV
player_profile_with_testbreakdown.to_csv('../_6_data_clean/player_profile_with_testbreakdown.csv')

In [11]:
# Extract to SQL
table_name = 'player_profile_with_testbreakdown'
player_profile_with_testbreakdown.to_sql(schema=schema1, con=engine, if_exists='replace', name=table_name)
db.commit
db.close()