__Author:__ Marijse

__Aim:__ loading the event tables into the local postgres instance. 

__Prerequisites:__
- Have the tournament id dictionary loaded into _0_original_data;

In [1]:
# import the necessary packages
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import glob
from datetime import datetime
import urllib.request
import requests
import io


In [2]:
# Set up the database connection wiht Psycopg2
db = psycopg2.connect(dbname='Rugby', user='postgres', host='localhost', password='password')
cursor=db.cursor()

# Set up a database connection using sqlalchemy
engine = create_engine('postgres://postgres:password@localhost:5432/Rugby')

In [3]:
# Define variables

# Define the input and output strings
input_string = '../_4_data_extracts/squad/'
output_strig = '../_6_data_clean/'

# Define schemas
schema1 = '_0_original_data'
schema2 = '_1_data_views'

In [4]:
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema1)
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema2)
db.commit()

### Reading Match Fixtures into datafrmae

In [5]:
# Step 2: we run the function
def xml2df(xml_data):
    root = ET.XML(xml_data) # element tree
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
            all_records.append(record)
    return pd.DataFrame(all_records)

In [6]:
# List all event id's of the past two years
eventid = pd.read_sql_query('select eventid from _0_original_data.tournament_id_dictionary',db)
eventid_list = list(eventid.eventid.unique())

In [7]:
# Define an empty dataframe to append the different xml files to
event_tables = pd.DataFrame()

for k in eventid_list:
    
    group_id_list = ['A','B','C','D','']
    
    for n in group_id_list:
    
        url = 'http://webservices.irb.com/EventInformation.asmx/EventTables?uid=e9656db8-ffb5-4115-ac0b-cbd5688e6648&eventid='+k+'&group='+n
        response = urllib.request.urlopen(url)
        data = response.read()      
        text = data.decode('utf-8') 
    
        # Create a df which equals the text object
        df = xml2df(text)
        
        # Skip all empty Event ID's
        if df.empty == True:
            continue
        else:
            pass

        # Extract the tournament id from the file name
        tournament_id =  k
        group_id = n
        
        # Set a column equal to the file name
        df['tournament_id'] = tournament_id
        df['group_id'] = group_id
    
        # Clean the column headers 
        dict_columns={}
        for x in (df.columns.values):
            dict_columns[x] = x.lower().replace('{http://webservices.irb.com/}','')
        df_clean = df.rename(columns=dict_columns)
    
        # remove duplicates from the data
        df_clean = df_clean.drop_duplicates(['teamid','tournament_id','group_id'], keep='first')
        
        # Append each individual dataframe to the full_fixtures df
        frames =(event_tables,df_clean)
        event_tables = pd.concat(frames)

HTTPError: HTTP Error 500: Internal Server Error

In [None]:
event_tables.head()

In [None]:
# The final table includes 20 tournaments and has 3918 player names associated with the 20 different tournaments. 
event_tables.tournament_id.nunique()
len(event_tables)

In [None]:
# Extract to CSV
event_tables.to_csv('../_6_data_clean/event_tables_v2_02012018.csv')

In [None]:
# Extract to SQL
table_name = 'event_tables'
event_tables.to_sql(schema=schema1, con=engine, if_exists='replace', name=table_name, index=False)
db.commit
db.close()