__Author:__ Marijse

__Aim:__ Load team management information per event into the local Postgres instance.

__Prerequisites:__
- Have the tournament id dictionary loaded into _0_original_data;
- Have the team_data_per_tournament loaded into _0_original_data;

In [None]:
# import the necessary packages
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import glob
from datetime import datetime
import urllib.request
import requests
import io


In [None]:
# Set up the database connection with psycopg2.
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables or an untracked config file instead.
db = psycopg2.connect(dbname='Rugby', user='postgres', host='localhost', password='password')
cursor = db.cursor()

# Set up a second connection through SQLAlchemy (used by DataFrame.to_sql).
# The dialect name must be 'postgresql': the legacy 'postgres' alias was
# removed in SQLAlchemy 1.4, where it fails to load the dialect.
engine = create_engine('postgresql://postgres:password@localhost:5432/Rugby')

In [None]:
# Define variables

# Input and output directories (relative paths)
input_string = '../_1_data_extracts/squad/'
output_string = '../_3_data_clean/'
# Backward-compatible alias: the name was originally misspelled.
output_strig = output_string

# Database schemas created and written to further down
schema1 = '_0_original_data'
schema2 = '_1_data_views'

In [None]:
# Make sure both schemas exist, then commit the DDL in one go.
for schema_name in (schema1, schema2):
    cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema_name)
db.commit()

### Reading team management data into a dataframe

In [None]:
# Helper: flatten a two-level XML document into a dataframe
def xml2df(xml_data):
    """Convert an XML string into a DataFrame with one row per child element.

    Each direct child of the root becomes one row; each of its sub-elements
    becomes a column named after the sub-element's tag and holding its text.

    Parameters
    ----------
    xml_data : str
        The XML document to parse.

    Returns
    -------
    pandas.DataFrame
        One row per root child that has at least one sub-element. The frame
        is empty when the root has no such children.
    """
    root = ET.XML(xml_data)  # parse the text into an element tree
    all_records = []
    for child in root:
        record = {subchild.tag: subchild.text for subchild in child}
        # Append once per child. Appending inside the sub-element loop added
        # the same record once per field and duplicated every row. Children
        # without sub-elements still contribute no row, as before.
        if record:
            all_records.append(record)
    return pd.DataFrame(all_records)

In [None]:
# Collect the distinct event ids stored in the tournament id dictionary table
eventid = pd.read_sql_query(
    'select eventid from _0_original_data.tournament_id_dictionary',
    db,
)
eventid_list = list(eventid['eventid'].unique())

In [None]:
# Collect one cleaned dataframe per (event, team) pair and concatenate them
# once at the end; concatenating inside the loop re-copies everything
# gathered so far on every iteration.
frames = []

for k in eventid_list:

    # Teams registered for this event. The id is passed as a bound parameter
    # rather than being spliced into the SQL text.
    teamid = pd.read_sql_query(
        "select teamid from _0_original_data.team_data_per_tournament where tournament_id = %s",
        db,
        params=(k,),
    )
    teamid_list = list(teamid.teamid.unique())

    for n in teamid_list:

        # NOTE(review): the web-service uid is hard-coded in the URL.
        url = 'http://webservices.irb.com/EventInformation.asmx/Management?uid=e9656db8-ffb5-4115-ac0b-cbd5688e6648&EventID=' + k + '&teamid=' + n

        # Close the HTTP response as soon as the body has been read
        with urllib.request.urlopen(url) as response:
            text = response.read().decode('utf-8')

        # Parse the XML payload into a dataframe
        df = xml2df(text)

        # Skip event/team combinations for which nothing was returned
        if df.empty:
            continue

        # Tag every row with the event and team it was requested for
        df['tournament_id'] = k
        df['team_id'] = n

        # Clean the column headers: lower-case them and strip the XML namespace
        dict_columns = {
            x: x.lower().replace('{http://webservices.irb.com/}', '')
            for x in df.columns.values
        }
        df_clean = df.rename(columns=dict_columns)

        # Remove duplicate people within the same event and team
        df_clean = df_clean.drop_duplicates(['personname', 'tournament_id', 'team_id'], keep='first')

        frames.append(df_clean)

# pd.concat raises on an empty list, so fall back to an empty dataframe
management = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Preview the first five rows of the combined table
management.head(n=5)

In [None]:
# Sanity-check the size of the combined table. A bare expression is only
# displayed when it is the last line of a cell, so both figures are printed.
# (A previous run was noted as covering 20 tournaments and 3918 names.)
print('tournaments:', management.tournament_id.nunique())
print('rows:', len(management))

In [None]:
# Write the combined table to a CSV file
management.to_csv(path_or_buf='../_3_data_clean/management.csv')

In [None]:
# Write the combined table to the database, replacing any existing table
table_name = 'management'
management.to_sql(schema=schema1, con=engine, if_exists='replace', name=table_name)
# commit has to be *called*; the bare attribute `db.commit` did nothing
db.commit()
db.close()