# Convert JSON files without sub-arrays into multi-row JSON files

In [1]:
import configparser
import os
import boto3
import json
import pandas as pd
from pandas import json_normalize

In [2]:
# Parse the local 'dl.cfg' file; its [AWS] section is read further down to
# obtain the access key pair.
config = configparser.ConfigParser()
# read() returns the list of files it managed to parse. As the last expression
# of the cell, that list is what the notebook displays as the cell output.
config.read('dl.cfg')

['dl.cfg']

In [3]:
# Copy the AWS key pair from the [AWS] section of the parsed config into
# process environment variables of the same names.
os.environ.update({
    'AWS_ACCESS_KEY_ID': config['AWS']['AWS_ACCESS_KEY_ID'],
    'AWS_SECRET_ACCESS_KEY': config['AWS']['AWS_SECRET_ACCESS_KEY'],
})

In [30]:
# Season to process; only object keys containing this text are converted.
year = '2008'
# Prefix holding the original files.
source_folder = 'league_fixtures/'
# Converted files go into a per-year sub-folder of the source prefix.
dest_folder = source_folder + year + '/'
# Originals are copied to the same per-year path under 'backup/'.
backup_folder = 'backup/' + dest_folder

In [5]:
# Bucket that holds the source, destination and backup prefixes.
bucket_name = 'capstonefootballbucket'
# Low-level S3 client used for every list/get/put/copy/delete call below.
s3_client = boto3.client('s3')

## Check whether a file needs to be flattened

In [6]:
def is_valid_content(file_content):
    """Tell whether the text of a file is a candidate for conversion.

    Returns True when the text, ignoring surrounding whitespace, begins with
    '[' (a JSON array), or when the text contains no newline character at all
    (everything is on a single line). Returns False otherwise, i.e. for
    multi-line text that does not begin with '['.
    """
    looks_like_array = file_content.strip().startswith('[')
    if looks_like_array:
        return True
    return '\n' not in file_content

## Convert JSON array files to JSON rows files

In [31]:
def _flatten_json_array(records, key):
    """Flatten every element of a parsed JSON array into one DataFrame row.

    Elements that are strings are decoded as JSON first. Each element is then
    passed to json_normalize, which flattens nested objects and joins nested
    key names with '_'.

    Args:
        records: the list obtained from parsing the file.
        key: S3 key of the file, used only in the diagnostic message.

    Returns:
        The combined DataFrame (empty when `records` is empty), or None when
        a string element is not valid JSON. Returning None lets the caller
        leave the file alone instead of uploading a truncated result.
    """
    frames = []
    for item in records:
        if isinstance(item, str):
            try:
                item = json.loads(item)
            except json.JSONDecodeError:
                print(f"unparseable JSON string. file_key={key}")
                return None
        frames.append(json_normalize(item, sep='_'))
    if not frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the growing frame per element.
    return pd.concat(frames, ignore_index=True)


# For every object listed under `source_folder` whose key contains `year`:
# rewrite the JSON array as one flattened record per line under `dest_folder`,
# copy the original to `backup_folder`, then delete the original.
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(
    Bucket=bucket_name,
    Prefix=source_folder,
    Delimiter='/',
    PaginationConfig={"PageSize": 300},
)
# The waiter does not depend on the object, so build it once.
waiter = s3_client.get_waiter('object_exists')
count = 0  # number of files converted in this run
for page in response:
    # A page that lists no objects has no "Contents" entry.
    files = page.get("Contents") or []
    for file in files:
        key = file['Key']
        if key.endswith('/'):
            continue
        if year not in key:
            continue

        # Read the original JSON file.
        file_obj = s3_client.get_object(Bucket=bucket_name, Key=key)
        file_content = file_obj["Body"].read().decode('utf-8')

        # Nothing to convert in an empty object.
        if not file_content:
            continue
        # Skip multi-line content that does not start with an array.
        if not is_valid_content(file_content):
            continue

        try:
            data = json.loads(file_content)
        except json.JSONDecodeError:
            # One bad file should not abort the whole run; leave it in place.
            print(f"unparseable JSON file. file_key={key}")
            continue
        if not isinstance(data, list):
            # Single-line content can be an object or a scalar; iterating it
            # would not yield records, so leave the file untouched.
            print(f"top-level JSON value is not an array. file_key={key}")
            continue

        flattened_data = _flatten_json_array(data, key)
        if flattened_data is None:
            # Do not upload a partial result or remove the original.
            continue

        json_lines = flattened_data.to_json(orient='records', lines=True)

        # Write the converted file to the destination folder.
        target_key = key.replace(source_folder, dest_folder)
        print(f"target_key={target_key}")
        s3_client.put_object(Bucket=bucket_name, Key=target_key, Body=json_lines)

        # Copy the original to the backup folder. The dict form of CopySource
        # avoids having to URL-encode the key by hand.
        backup_key = key.replace(source_folder, backup_folder)
        print(f"backup_key={backup_key}")
        s3_client.copy_object(
            Bucket=bucket_name,
            CopySource={'Bucket': bucket_name, 'Key': key},
            Key=backup_key,
        )
        # Only delete the original once the backup copy is confirmed to exist.
        waiter.wait(Bucket=bucket_name, Key=backup_key)
        s3_client.delete_object(Bucket=bucket_name, Key=key)
        count += 1
        
       

target_key=league_fixtures/2008/4_2008fixtures.json
backup_key=backup/league_fixtures/2008/4_2008fixtures.json
