Author: Marijse

Aim: 
- Load the original data from the WR database from csv into a local postgres instance; 
- Load the different data views created from the original data from csv into a local postgres instance;

Prerequisites: 
- Have all csv files loaded into the folder _6_data_clean & folder _10_data_views;
- Have a local PSQL instance set up with a designated database for the rugby data. Make sure that you know the password of your postgres user. 

In [3]:
# sqlalchemy and psycopg2 might not be pre-installed, so we will run the installation of these packages first
! pip install psycopg2
! pip install sqlalchemy



In [4]:
# import the necessary python packages to be able to run the code that follows
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import glob


In [5]:
# Update the connection details below to match your own local psql instance.
# Note that the details are case sensitive.

# Set up the database connection with psycopg2. Adjust the dbname, user and password.
db = psycopg2.connect(dbname='databasename you have given', user='postgres', password='password of database')
cursor = db.cursor()

# Set up a database connection using sqlalchemy. The commented line below shows what the different parts of the
# connection URL stand for. Using this, adjust the user, password, port and dbname.
# The dialect name has to be 'postgresql': SQLAlchemy 1.4 and later no longer accept the old 'postgres' alias.
# engine = create_engine('postgresql://user:password@localhost:port/database_name')
engine = create_engine('postgresql://postgres:password@localhost:5432/dbname')


#### Set up the psql database schema structure

In [12]:
# Configuration: where the csv files live and which postgres schemas they are loaded into.

# Input folders. The file paths might need to be updated depending on where/how the files get stored.
# The Windows paths below are the active ones; on Linux / macOS comment them out and uncomment the two
# relative paths instead.
# input_string1 = '../_6_data_clean/'
# input_string2 = '../_10_data_views/'

# Windows paths (wherever your git pull location is):
input_string1, input_string2 = (
    'C:\\Users\\aadams\\Downloads\\Rugby7s\\_3_data_clean\\',
    'C:\\Users\\aadams\\Downloads\\Rugby7s\\4_data_views\\',
)

# Schema names that will be created in Postgres. For consistency and to make sharing code easier,
# please do not change these.
schema1, schema2 = '_0_original_data', '_1_data_views'

In [13]:
# Make sure both target schemas exist in postgres, then commit so they are visible to other connections.
for schema_name in (schema1, schema2):
    cursor.execute("CREATE SCHEMA IF NOT EXISTS " + schema_name)
db.commit()

#### Read the different original csv files into psql (schema1)

In [14]:
# Collect the paths of all csv files found under the first input folder (sub-folders included) in the
# 'files' list. These files will go in the _0_original_data schema.
csv_pattern = input_string1 + "**/*.csv"
files = glob.glob(csv_pattern, recursive=True)
files

['C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\all_time_head_to_head.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\event_tables.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\full_squads.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\management.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\match_fixtures.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\player_records_in_event.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\player_records_in_event_all_time.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\player_records_match_record_in_event.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\player_records_match_record_in_event_all_time.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\team_data_per_tournament.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\team_dictionary.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_6_data_clean\\team_match_records_in_event.csv',
 'C:\\Users\\jprice\\D

In [19]:
# Load every csv file in 'files' into its own table in schema1. The table is created on first use.
# NOTE: to_sql runs with if_exists='append', so re-running this cell adds the rows of each csv a second time.

# Open both connections once for the whole cell instead of once per file.
# Adjust the dbname, user and password in the same way as in the connection cell at the top of the notebook.
db = psycopg2.connect(dbname='databasename you have given', user='postgres', password='password of database')
# The dialect name has to be 'postgresql': SQLAlchemy 1.4 and later no longer accept the old 'postgres' alias.
engine = create_engine('postgresql://postgres:password@localhost:5432/dbname')

try:
    cursor = db.cursor()

    for k in files:
        # The table is named after the csv file: take the last path component (works for both '\\' and '/'
        # separators and for any folder depth) and keep everything before the first dot.
        table_name = k.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
        print(table_name)

        # We load the csv into a df
        df = pd.read_csv(k)

        # We clean up the column headers a bit: lower case, underscores instead of spaces, no colons,
        # and 'group' renamed to 'pool'
        df = df.rename(columns={
            column: column.lower().replace(' ', '_').replace(':', '').replace('group', 'pool')
            for column in df.columns.values
        })

        # The saved index column 'Unnamed: 0' has become 'unnamed_0' after the clean-up. We drop it before
        # writing, so that appending to a table from which it was already removed does not fail.
        df = df.drop(columns=['unnamed_0'], errors='ignore')

        # We write the rows into the table (created if it does not exist yet)
        df.to_sql(schema=schema1, con=engine, if_exists='append', name=table_name, index=False)

        # We also drop the column from the table itself, in case the table already existed with it
        cursor.execute("alter table " + schema1 + "." + table_name + " drop column if exists unnamed_0")
        db.commit()
finally:
    # Release both connections even when one of the files fails to load
    db.close()
    engine.dispose()
 

all_time_head_to_head
event_tables
full_squads
management
match_fixtures
player_records_in_event
player_records_in_event_all_time
player_records_match_record_in_event
player_records_match_record_in_event_all_time
team_data_per_tournament
team_dictionary
team_match_records_in_event
team_match_records_in_event_all_time
team_records_in_event
team_record_in_event_all_time
terminology_dictionary
tournament_id_dictionary


#### Read the different csv views on the data into psql (schema2)

In [20]:
# Collect the paths of all csv files found under the second input folder (sub-folders included) in the
# 'files' list. These files will go in the _1_data_views schema.
csv_pattern = input_string2 + "**/*.csv"
files = glob.glob(csv_pattern, recursive=True)
files

['C:\\Users\\jprice\\Desktop\\Rugby\\_10_data_views\\fixtures_with_player_stats_full.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_10_data_views\\match_fixtures_features_row_per_team.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_10_data_views\\match_fixtures_full_row_per_match.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_10_data_views\\player_stats_full.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_10_data_views\\points_by_team_by_tournament.csv',
 'C:\\Users\\jprice\\Desktop\\Rugby\\_10_data_views\\team_data_with_timezone.csv']

In [21]:
# Load every csv file in 'files' into its own table in schema2. The table is created on first use.
# NOTE: to_sql runs with if_exists='append', so re-running this cell adds the rows of each csv a second time.

# Open both connections once for the whole cell instead of once per file.
# Adjust the dbname, user and password in the same way as in the connection cell at the top of the notebook.
db = psycopg2.connect(dbname='databasename you have given', user='postgres', password='password of database')
# The dialect name has to be 'postgresql': SQLAlchemy 1.4 and later no longer accept the old 'postgres' alias.
engine = create_engine('postgresql://postgres:password@localhost:5432/dbname')

try:
    cursor = db.cursor()

    for k in files:
        # The table is named after the csv file: take the last path component (works for both '\\' and '/'
        # separators and for any folder depth) and keep everything before the first dot.
        table_name = k.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
        print(table_name)

        # We load the csv into a df
        df = pd.read_csv(k)

        # We clean up the column headers a bit: lower case, underscores instead of spaces, no colons,
        # and 'group' renamed to 'pool'
        df = df.rename(columns={
            column: column.lower().replace(' ', '_').replace(':', '').replace('group', 'pool')
            for column in df.columns.values
        })

        # The saved index column 'Unnamed: 0' has become 'unnamed_0' after the clean-up. We drop it before
        # writing, so that appending to a table from which it was already removed does not fail.
        df = df.drop(columns=['unnamed_0'], errors='ignore')

        # We write the rows into the table (created if it does not exist yet)
        df.to_sql(schema=schema2, con=engine, if_exists='append', name=table_name, index=False)

        # We also drop the column from the table itself, in case the table already existed with it
        cursor.execute("alter table " + schema2 + "." + table_name + " drop column if exists unnamed_0")
        db.commit()
finally:
    # Release both connections even when one of the files fails to load
    db.close()
    engine.dispose()


fixtures_with_player_stats_full
match_fixtures_features_row_per_team
match_fixtures_full_row_per_match
player_stats_full
points_by_team_by_tournament
team_data_with_timezone
