## Backlog processing

This notebook processes the backlog of data resident in the train-data-landing bucket, saving it to the bronze train movements table. It was initially run in a SageMaker notebook against a Glue endpoint to catch up on the backlog.

In [None]:
!pip install awswrangler

In [1]:
import awswrangler as wr
import json
import boto3
import pandas as pd
import numpy as np
import os
import io
from datetime import datetime
# Configure the default boto3 session for the target region first, then build
# the S3 client that the rest of the notebook uses.
AWS_REGION = "us-east-1"
boto3.setup_default_session(region_name=AWS_REGION)
s3 = boto3.client("s3")

In [2]:
# List every object in the landing bucket whose key starts with "2022/04/1".
# NOTE(review): presumably keys are laid out as year/month/day, so this prefix
# selects the 10th-19th of April 2022 - confirm against the bucket layout.
landing_listing_prefix = "s3://train-data-landing/2022/04/1"
file_list = wr.s3.list_objects(landing_listing_prefix)


In [3]:
# Display how many landed objects were found, as a sanity check before processing.
len(file_list) 

7

In [4]:
# Bucket that holds the landed files and also receives their archived copies.
LANDING_BUCKET = 'train-data-landing'
# Key prefix under which a file is copied once its rows have been handled.
PROCESSED_PREFIX = 'processed/'
# Number of landed files accumulated before their rows are written to the table.
FILES_PER_BATCH = 500
# Timestamp fields, in priority order, used to derive the partition date of a row.
TIMESTAMP_COLUMNS = ['actual_timestamp', 'creation_timestamp', 'dep_timestamp', 'event_timestamp']


def _object_key(s3_uri):
    """Return the object key of an ``s3://train-data-landing/<key>`` path.

    Only the leading scheme and bucket are stripped, so keys that themselves
    contain ``//`` or the bucket name are returned intact.

    Raises:
        ValueError: if the path does not point into the landing bucket.
    """
    bucket_prefix = 's3://' + LANDING_BUCKET + '/'
    if not s3_uri.startswith(bucket_prefix):
        raise ValueError('unexpected object path: ' + s3_uri)
    return s3_uri[len(bucket_prefix):]


def _read_rows(key):
    """Download one landed file and return its message bodies as a list of dicts.

    The file is parsed as a JSON array of messages; each message's ``body`` is
    returned with the ``msg_type`` from its ``header`` copied onto it.
    """
    obj = s3.get_object(Bucket=LANDING_BUCKET, Key=key)
    content = obj['Body'].read().decode('utf8')
    file_rows = []
    for message in json.loads(content):
        body = message['body']
        body['msg_type'] = message['header']['msg_type']
        file_rows.append(body)
    return file_rows


def _with_segment_date(frame):
    """Return ``frame`` with a ``segment_date`` partition column added.

    The date comes from the first non-null field in TIMESTAMP_COLUMNS, which is
    interpreted as milliseconds since the epoch.

    Raises:
        ValueError: if any row has none of the timestamp fields, so that the
            failure is reported clearly before anything is written.
    """
    frame['segment_timestamp'] = np.nan
    for column in TIMESTAMP_COLUMNS:
        if column in frame.columns:
            frame['segment_timestamp'] = frame['segment_timestamp'].fillna(frame[column])

    missing = frame['segment_timestamp'].isna()
    if missing.any():
        raise ValueError(
            str(int(missing.sum())) + ' rows have none of the timestamp fields '
            + ', '.join(TIMESTAMP_COLUMNS))

    # NOTE(review): datetime.fromtimestamp converts using the local timezone of
    # the machine running the notebook, so partition dates depend on where this
    # runs - confirm whether UTC is intended.
    frame['segment_date'] = frame['segment_timestamp'].apply(
        lambda x: datetime.fromtimestamp(int(x) / 1000.0).date())
    return frame.drop(columns=['segment_timestamp'])


def _write_rows(batch_rows):
    """Append the given rows to the train movements table, partitioned by segment_date."""
    frame = _with_segment_date(pd.DataFrame(batch_rows))
    current_time = datetime.now().strftime("%H:%M:%S")
    print('writing ' + str(len(batch_rows)) + ' rows to governed table at ' + current_time)
    wr.s3.to_parquet(
        df=frame,
        dataset=True,
        mode="append",
        database="train_bronze",
        table="train_movements_governed",
        catalog_versioning=True,  # Optional
        partition_cols=['segment_date'])


def _archive_files(keys):
    """Copy each landed file under PROCESSED_PREFIX and delete the original.

    The copy and delete are two separate S3 calls; a failure between them leaves
    the file in both locations.
    """
    print ('moving files...')
    for key in keys:
        copy_source = {'Bucket': LANDING_BUCKET, 'Key': key}
        s3.copy(copy_source, LANDING_BUCKET, PROCESSED_PREFIX + key)
        s3.delete_object(Bucket=LANDING_BUCKET, Key=key)


def _flush_batch(batch_rows, batch_keys):
    """Write the accumulated rows (if any), then archive the files they came from.

    Files are archived only after the write has returned. Files that yielded no
    rows are still archived so they do not linger in the landing area, and no
    empty frame is sent to the table.
    """
    if batch_rows:
        _write_rows(batch_rows)
    _archive_files(batch_keys)


counter = 0
rows = []
movefiles = []

for counter, file in enumerate(file_list, start=1):
    filename = _object_key(file)
    # Add all rows from the landed file to the pending batch
    rows.extend(_read_rows(filename))
    movefiles.append(filename)
    # Every FILES_PER_BATCH files, write the pending rows and archive their files
    if counter % FILES_PER_BATCH == 0:
        print('processing increment ' + str(counter))
        _flush_batch(rows, movefiles)
        rows = []
        movefiles = []

# Handle whatever is left over after the last full batch.
if rows or movefiles:
    print('processing the last increment: ' + str(counter))
    _flush_batch(rows, movefiles)

processing the last increment: 7
writing 174 rows to governed table at 09:04:22
moving files...
