__Author:__ Marijse

__Aim:__ load the team records in events in all time for all tournaments into the local postgres instance. 

__Prerequisites:__
- Have the tournament id dictionary loaded into _0_original_data;
- Have the terminology dictionary loaded into _0_original_data;

In [1]:
# import the necessary packages
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import glob
from datetime import datetime
import urllib.request
import requests
import io


In [2]:
# Set up the database connection wiht Psycopg2
db = psycopg2.connect(dbname='Rugby', user='postgres', host='localhost', password='password')
cursor=db.cursor()

# Set up a database connection using sqlalchemy
engine = create_engine('postgres://postgres:password@localhost:5432/Rugby')

In [3]:
# Define variables

# Define the input and output strings
input_string = '../_4_data_extracts/squad/'
output_strig = '../_6_data_clean/'

# Define schemas
schema1 = '_0_original_data'
schema2 = '_1_data_views'

In [4]:
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema1)
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema2)
db.commit()

### Reading Match Fixtures into datafrmae

In [5]:
# Step 2: we run the function
def xml2df(xml_data):
    root = ET.XML(xml_data) # element tree
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
            all_records.append(record)
    return pd.DataFrame(all_records)

In [6]:
# List all event id's of the past two years
eventid = pd.read_sql_query('select eventid from _0_original_data.tournament_id_dictionary',db)
eventid_list = list(eventid.eventid.unique())

# List all event id's of the past two years
terminology = pd.read_sql_query("select abbreviation from _0_original_data.terminology_dictionary \
                                    WHERE abbreviation NOT IN('Apps','Cap','Dra','LMargin','Los','Rep','Win','WMargin'\
                                    ,'Sta')",db)
terminology_list = list(terminology.abbreviation.unique())
terminology_list

['Con', 'DG', 'Pen', 'Points', 'Red', 'Subs', 'Try', 'Yellow']

In [7]:
# Define an empty dataframe to append the different xml files to
team_record_in_event_all_time = pd.DataFrame()

for k in eventid_list:
    
    #print(k)
    
    for n in terminology_list:    
        url = 'http://webservices.irb.com/EventInformation.asmx/TeamRecordsInEventAllTime?uid=e9656db8-ffb5-4115-ac0b-cbd5688e6648&EventID=' + k + '&Method=' + n
        response = urllib.request.urlopen(url)
        data = response.read()      
        text = data.decode('utf-8') 
        
        #print(n)
    
        # Create a df which equals the text object
        df = xml2df(text)
        
        # Skip all empty Event ID's
        if df.empty == True:
            continue
        else:
            pass
        
        #print('we got this far')

        # Extract the tournament id from the file name
        tournament_id =  k
        method = n
        
        # Set a column equal to the file name
        df['tournament_id'] = tournament_id
        df['method'] = method
    
        # Clean the column headers 
        dict_columns={}
        for x in (df.columns.values):
            dict_columns[x] = x.lower().replace('{http://webservices.irb.com/}','')
        df_clean = df.rename(columns=dict_columns)
    
        # remove duplicates from the data
        df_clean = df_clean.drop_duplicates(['tournament_id','teamfor','record'], keep='first')
        
        # Append each individual dataframe to the full_fixtures df
        frames =(team_record_in_event_all_time,df_clean)
        team_record_in_event_all_time = pd.concat(frames)

In [8]:
team_record_in_event_all_time.head()

Unnamed: 0,record,teamfor,tournament_id,method
0,2638,Fiji 7s,1382,Con
2,2620,New Zealand 7s,1382,Con
4,2189,South Africa 7s,1382,Con
6,1955,Australia 7s,1382,Con
8,1876,England 7s,1382,Con


In [9]:
# The final table includes 20 tournaments and has 2680 records associated with 8 methods over the 20 different tournaments. 
team_record_in_event_all_time.tournament_id.nunique()
team_record_in_event_all_time.method.nunique()
len(team_record_in_event_all_time)

26130

In [10]:
# Extract to CSV
team_record_in_event_all_time.to_csv('../_6_data_clean/team_record_in_event_all_time.csv')

In [11]:
# Extract to SQL
table_name = 'team_record_in_event_all_time'
team_record_in_event_all_time.to_sql(schema=schema1, con=engine, if_exists='replace', name=table_name)
db.commit
db.close()