# Schema and table testing

This notebook establishes the schema for the governed train movements table and then creates the table.

In [2]:
import awswrangler as aw
import json
import boto3
import pandas as pd
import os
from datetime import datetime

In [3]:
# Ensure that the correct default session (and therefore region) is used.
AWS_REGION = "us-east-1"
boto3.setup_default_session(region_name=AWS_REGION)

In [136]:
# These sample messages are loaded to create the correct schema. For each message, grab the
# body and copy the header's message type into it.
def load_message_bodies(sample_dir):
    """Collect the body of every message in the JSON files under ``sample_dir``.

    Each file is parsed as JSON and iterated as a sequence of messages. For
    each message the ``body`` mapping is taken, the header's ``msg_type`` is
    copied into it under the key ``msg_type``, and the body is appended to the
    result.

    Args:
        sample_dir: Directory that is walked recursively for files.

    Returns:
        A flat list of message-body dicts, in the order they were read. Empty
        when the directory contains no files or does not exist.

    Raises:
        KeyError: If a message lacks ``body`` or ``header``/``msg_type``.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    bodies = []
    for subdir, _dirs, files in os.walk(sample_dir):
        for file in files:
            file_loc = os.path.join(subdir, file)
            # The context manager closes the handle even when parsing fails;
            # the explicit encoding avoids depending on the platform default.
            with open(file_loc, encoding="utf-8") as f:
                filedata = json.load(f)
            for row in filedata:
                row_to_add = row['body']
                row_to_add['msg_type'] = row['header']['msg_type']
                bodies.append(row_to_add)
    return bodies


all_file_bodies = load_message_bodies('./sample_data/')
        

In [141]:
# all_file_bodies is already a flat list of message-body dicts; alias it under
# the name that the DataFrame construction in the next cell uses.
flat_list = all_file_bodies

In [143]:
# Build one row per message body; keys that a message lacks become NaN in that row.
frame = pd.DataFrame(flat_list)

In [144]:
# Derive one timestamp per row: start from actual_timestamp and, where it is
# missing, fall back to the next candidate column in priority order.
segment_ts = frame['actual_timestamp']
for fallback_col in ('creation_timestamp', 'dep_timestamp', 'event_timestamp'):
    segment_ts = segment_ts.fillna(frame[fallback_col])
frame['segment_timestamp'] = segment_ts

In [145]:
def _epoch_ms_to_date(epoch_ms):
    """Convert a millisecond epoch value (string or number) to a calendar date.

    NOTE(review): datetime.fromtimestamp without a tz argument converts to the
    machine's local time zone, so the resulting date can differ between
    machines for timestamps close to midnight - confirm this is intended.
    """
    seconds = int(epoch_ms) / 1000.0
    return datetime.fromtimestamp(seconds).date()


# Replace the helper timestamp column with the date it falls on.
frame['segment_date'] = frame['segment_timestamp'].apply(_epoch_ms_to_date)
frame = frame.drop(columns='segment_timestamp')

In [146]:
# Infer Athena types from the DataFrame. The result is a tuple whose first
# element maps column name -> Athena type (later cells index it with [0]).
schema = aw.catalog.extract_athena_types(frame)

In [147]:
# Display the inferred schema for inspection.
schema

({'event_type': 'string',
  'gbtt_timestamp': 'string',
  'original_loc_stanox': 'string',
  'planned_timestamp': 'string',
  'timetable_variation': 'string',
  'original_loc_timestamp': 'string',
  'current_train_id': 'string',
  'delay_monitoring_point': 'string',
  'next_report_run_time': 'string',
  'reporting_stanox': 'string',
  'actual_timestamp': 'string',
  'correction_ind': 'string',
  'event_source': 'string',
  'train_file_address': 'string',
  'platform': 'string',
  'division_code': 'string',
  'train_terminated': 'string',
  'train_id': 'string',
  'offroute_ind': 'string',
  'variation_status': 'string',
  'train_service_code': 'string',
  'toc_id': 'string',
  'loc_stanox': 'string',
  'auto_expected': 'string',
  'direction_ind': 'string',
  'route': 'string',
  'planned_event_type': 'string',
  'next_report_stanox': 'string',
  'line_ind': 'string',
  'msg_type': 'string',
  'schedule_source': 'string',
  'schedule_end_date': 'string',
  'tp_origin_timestamp': 'strin

In [148]:
# use_schema is the same dict object as schema[0] (not a copy), so removing the
# partition column here also removes it from schema[0], which the
# table-creation cell below reads.
use_schema = schema[0]
# A default is supplied so that re-running this cell after the key has already
# been removed does not raise KeyError.
use_schema.pop("segment_date", None)

'date'

In [131]:
# Display the column mapping that remains after removing the partition column.
# NOTE(review): this cell's execution count (131) is lower than that of the
# preceding cells (136-149) and its saved output has no 'msg_type' entry, so the
# output shown probably predates them - re-run to refresh.
use_schema

{'event_type': 'string',
 'gbtt_timestamp': 'string',
 'original_loc_stanox': 'string',
 'planned_timestamp': 'string',
 'timetable_variation': 'string',
 'original_loc_timestamp': 'string',
 'current_train_id': 'string',
 'delay_monitoring_point': 'string',
 'next_report_run_time': 'string',
 'reporting_stanox': 'string',
 'actual_timestamp': 'string',
 'correction_ind': 'string',
 'event_source': 'string',
 'train_file_address': 'string',
 'platform': 'string',
 'division_code': 'string',
 'train_terminated': 'string',
 'train_id': 'string',
 'offroute_ind': 'string',
 'variation_status': 'string',
 'train_service_code': 'string',
 'toc_id': 'string',
 'loc_stanox': 'string',
 'auto_expected': 'string',
 'direction_ind': 'string',
 'route': 'string',
 'planned_event_type': 'string',
 'next_report_stanox': 'string',
 'line_ind': 'string',
 'schedule_source': 'string',
 'schedule_end_date': 'string',
 'tp_origin_timestamp': 'string',
 'creation_timestamp': 'string',
 'tp_origin_stanox'

In [149]:
# Register the governed bronze table for train movements in the Glue catalog.
# The partition column is filtered out of the regular columns explicitly, so
# this cell does not rely on an earlier cell having popped it from schema[0]
# in place (a column should not be declared both as data and as partition key).
movement_partitions = {'segment_date': 'date'}
movement_columns = {
    column: athena_type
    for column, athena_type in schema[0].items()
    if column not in movement_partitions
}

aw.catalog.create_parquet_table(
    database="train_bronze",
    table="train_movements_governed",
    path="s3://train-bronze/train_movements_governed",
    columns_types=movement_columns,
    compression="snappy",
    partitions_types=movement_partitions,
    table_type="GOVERNED",
)

In [5]:
# Register the governed silver "journey" table in the Glue catalog with a
# hand-written column schema.
# NOTE(review): the partition key is 'segment_date' while the columns contain a
# separate 'date' field, and 'canx_timestamp' is a bigint whereas the other
# timestamps use the timestamp type - confirm both are intended.
journey_columns = {
    'train_id': 'string',
    'date': 'date',
    'stanox': 'string',
    'start_timestamp': 'timestamp',
    'created': 'timestamp',
    'canx_timestamp': 'bigint',
}
journey_partitions = {'segment_date': 'date'}

aw.catalog.create_parquet_table(
    database="train_silver",
    table="journey",
    path="s3://train-silver/journey/",
    columns_types=journey_columns,
    compression="snappy",
    partitions_types=journey_partitions,
    table_type="GOVERNED",
)