__Author:__ Marijse

__Aim:__ Load all match fixtures into a dataframe and send to Postgres. 

__Prerequisites:__
- Have the tournament ID dictionary (table `tournament_id_dictionary`) loaded in the Postgres schema `_0_original_data`


In [13]:
# import the necessary packages
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

import urllib.request
import requests
import io
import glob

In [14]:
# Open a raw psycopg2 connection to the local 'Rugby' database and grab a
# cursor for executing plain SQL statements.
# NOTE(review): the credentials are hardcoded here - consider reading them
# from the environment before sharing this notebook.
db = psycopg2.connect(
    dbname='Rugby',
    user='postgres',
    host='localhost',
    password='password',
)
cursor = db.cursor()

In [15]:
# Set up a database connection using sqlalchemy (used by DataFrame.to_sql).
# The dialect name has to be 'postgresql': the legacy 'postgres' alias was
# removed in SQLAlchemy 1.4 and fails there with NoSuchModuleError, while
# 'postgresql' is accepted by every SQLAlchemy version.
engine = create_engine('postgresql://postgres:password@localhost:5432/Rugby')

In [16]:
# Make sure the target schema exists before anything is written to it
schema_name = '_0_original_data'
cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema_name)
# psycopg2 runs statements inside an implicit transaction; commit right away
# so a newly created schema is visible to other connections (the SQLAlchemy
# engine used for to_sql opens its own connection).
db.commit()

### Reading Match Fixtures into a dataframe

In [17]:
def xml2df(xml_data):
    """Parse an XML document into a DataFrame with one row per record.

    Every direct child of the root element is treated as one record. The tags
    of that child's own sub-elements become the column names and their text
    content the values (if a tag repeats within a record, the last one wins).

    Parameters
    ----------
    xml_data : str or bytes
        The XML document to parse.

    Returns
    -------
    pandas.DataFrame
        One row per root child that has at least one sub-element; an empty
        DataFrame when there is no such child.
    """
    root = ET.XML(xml_data)
    all_records = []
    for child in root:
        record = {subchild.tag: subchild.text for subchild in child}
        # Append once per record. Appending inside the per-field loop added
        # the same record once for every field, duplicating each row.
        # Children without any field are skipped so that an empty response
        # still produces an empty DataFrame.
        if record:
            all_records.append(record)
    return pd.DataFrame(all_records)

In [18]:
# Read the event ids stored in the tournament id dictionary table and keep
# each distinct id once (per the original note: events of the past two years)
event_query = 'select eventid from _0_original_data.tournament_id_dictionary'
eventid = pd.read_sql_query(event_query, db)
eventid_list = list(eventid['eventid'].unique())
print(eventid_list)

['1382', '1386', '1388', '1387', '1389', '1383', '1380', '1381', '1385', '1395', '1396', '1397', '1390', '1398', '1394', '1393', '1391', '1392', '1409', '1407', '1405', '1406', '1410', '1400', '1408', '1403', '1404', '1401', '1402', '1415', '1418', '1416', '1417', '1412', '1414', '1413', '1421', '1424', '1422', '1426', '1419', '1423', '1425', '1420', '1429', '1432', '897', '1433', '1428', '1430', '1436', '1440', '1437', '1441', '1434', '1439', '1438', '1435', '1445', '1447', '1446', '1449', '1443', '1448', '1444', '1450', '1453', '1455', '1454', '1457', '1451', '1456', '1452', '1458', '1462', '1464', '1463', '1466', '1459', '1465', '1460', '1467', '1470', '1472', '1471', '1475', '1473', '1468', '1469', '1474', '1479', '1481', '1480', '1484', '1477', '1482', '1478', '1483', '1491', '1488', '1492', '1489', '1490', '1485', '1487', '1493', '1486', '1500', '1497', '1521', '1501', '1498', '1522', '1499', '1494', '1496', '1523', '1502', '1495', '1524', '1509', '1527', '1506', '1510', '1508', 

In [19]:
# Download the fixtures/results XML of every event id and combine everything
# into a single dataframe. The per-event frames are collected in a list and
# concatenated once at the end, rather than re-copying an ever-growing
# dataframe on each iteration.
fixtures_url = 'http://webservices.irb.com/EventInformation.asmx/FixturesResults?uid=e9656db8-ffb5-4115-ac0b-cbd5688e6648&EventID='
xml_namespace = '{http://webservices.irb.com/}'
event_frames = []

for k in eventid_list:
    url = fixtures_url + str(k)
    # The context manager closes the HTTP response as soon as it is read
    with urllib.request.urlopen(url) as response:
        data = response.read()
    text = data.decode('utf-8')

    # Turn the XML text into a dataframe
    df = xml2df(text)

    # Skip all empty Event ID's
    if df.empty:
        continue

    # The tournament id is the event id the data was requested for
    tournament_id = k
    df['tournament_id'] = tournament_id

    # Clean the column headers: lower-case them and strip the XML namespace
    dict_columns = {x: x.lower().replace(xml_namespace, '') for x in df.columns.values}
    df_clean = df.rename(columns=dict_columns)

    # Remove duplicates from the data: keep the first row of each match id
    df_clean = df_clean.drop_duplicates(['matchid'], keep='first')

    event_frames.append(df_clean)

# pd.concat raises on an empty list, so fall back to an empty dataframe when
# no event returned any fixtures
full_fixtures = pd.concat(event_frames) if event_frames else pd.DataFrame()



In [20]:
# Number of fixture rows collected across all events
full_fixtures.shape[0]

8026

In [21]:
# Write the combined fixtures (index included) to a CSV file
full_fixtures.to_csv(path_or_buf='../_6_data_clean/match_fixtures.csv')

In [22]:
# Write the fixtures to Postgres, replacing the table if it already exists
table_name = 'match_fixtures'
full_fixtures.to_sql(schema=schema_name, con=engine, if_exists='replace', name=table_name)
# commit is a method and has to be called; the bare attribute reference
# `db.commit` did nothing, so pending work on this connection was discarded
# when the connection was closed.
db.commit()
cursor.close()
db.close()