# Convert S3 files from JSON array format to JSON multi-row files

In [2]:
import configparser
import os
import boto3
import json
import pandas as pd
from pandas import json_normalize

In [3]:
# Load settings from the local 'dl.cfg' file.
config = configparser.ConfigParser()
# read() returns the list of files it parsed; as the last expression of the
# cell, that list is echoed as the cell output.
config.read('dl.cfg')

['dl.cfg']

In [4]:
# Copy the AWS credentials from the [AWS] section of the config into the
# process environment, one variable per key.
aws_section = config['AWS']
for credential_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[credential_name] = aws_section[credential_name]

In [39]:
# Season to process, the S3 prefix that is scanned for source files, and the
# per-season prefix the converted files are written under.
year = '2022'
source_folder = 'league_teams/'
dest_folder = source_folder + year + '/'
#backup_folder = f'backup/league_fixtures/{year}/'

In [6]:
# Bucket that is listed, read from, written to and deleted from below.
bucket_name = 'capstonefootballbucket'
# Low-level S3 client used for every S3 call in this notebook.
# NOTE(review): presumably authenticates via the AWS_* environment variables
# exported earlier - confirm.
s3_client = boto3.client('s3')
#s3_resource = boto3.resource('s3')

## Check whether a file needs to be converted to a multi-line file

In [7]:
def is_valid_content(file_content):
    """Report whether the content should go through the array-to-rows conversion.

    The content qualifies when, ignoring surrounding whitespace, it starts
    with '[' (a JSON array), or when it contains no newline character at all.
    """
    looks_like_array = file_content.strip().startswith('[')
    if looks_like_array:
        return True
    # Content with no newline also qualifies.
    return '\n' not in file_content

## Convert JSON array files to JSON rows files

### Teams data needs a season column added

In [40]:
# Convert every matching JSON-array object directly under source_folder into a
# JSON-rows object under dest_folder (adding a "season" column), then delete
# the source object. A source object is only deleted after its complete
# replacement has been written and confirmed to exist.
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name,  Prefix=source_folder, Delimiter='/', PaginationConfig={"PageSize": 300})
count = 0  # not used in this cell; kept in case other cells read it
season = int(year)  # loop-invariant, converted once
for page in response:
    # "Contents" is absent from a page that lists no objects.
    for file in page.get("Contents", []):
        key = file['Key']
        if key.endswith('/'):
            continue
        if year not in key:
            continue
        #print(f"file path = {key}")
        #file_name = os.path.basename(key)
        # read original JSON file
        file_obj = s3_client.get_object(Bucket=bucket_name, Key=key)
        file_content = file_obj["Body"].read().decode('utf-8')

        # Skip empty objects.
        if not file_content:
            continue
        # Skip content that neither starts with an array nor is single-line.
        if not is_valid_content(file_content):
            continue

        try:
            data = json.loads(file_content)
        except json.JSONDecodeError:
            # Leave the source object untouched rather than aborting the run.
            print(f"unparseable JSON file. file_key={key}")
            continue
        if isinstance(data, dict):
            # A lone top-level object is a one-record file; iterating the dict
            # itself would walk its keys instead of its records.
            data = [data]

        frames = []
        parsed_ok = True
        for item in data:
            if isinstance(item, str):
                try:
                    item = json.loads(item)
                except json.JSONDecodeError:
                    print(f"unparseable JSON string. file_key={key}")
                    parsed_ok = False
                    break

            # Flatten the nested JSON object and tag the row(s) with the season.
            frames.append(json_normalize(item, sep='_').assign(season=season))

        if not parsed_ok:
            # Never write a partial file and never delete a source object that
            # could not be converted completely.
            continue

        # Concatenate once instead of growing a DataFrame row by row.
        flattened_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        json_lines = flattened_data.to_json(orient='records', lines=True)
        # Build the target object path; only the leading prefix is rewritten.
        target_key = key.replace(source_folder, dest_folder, 1)
        print(f"target_key={target_key}")
        # write to new file to destination folder
        s3_client.put_object(Bucket=bucket_name, Key=target_key, Body=json_lines)
        # Optional backup step (disabled):
        #backup_key = key.replace(source_folder, backup_folder)
        #print(f"backup_key={backup_key}")
        #s3_client.copy_object(Bucket=bucket_name, CopySource=f"{bucket_name}/{key}", Key=backup_key)
        # Verify that the target object exists before removing the source.
        waiter = s3_client.get_waiter('object_exists')
        waiter.wait(Bucket=bucket_name, Key=target_key)

        s3_client.delete_object(Bucket=bucket_name, Key=key)
        
       

target_key=league_teams/2022/100_2022teams.json
target_key=league_teams/2022/101_2022teams.json
target_key=league_teams/2022/102_2022teams.json
target_key=league_teams/2022/103_2022teams.json
target_key=league_teams/2022/104_2022teams.json
target_key=league_teams/2022/105_2022teams.json
target_key=league_teams/2022/106_2022teams.json
target_key=league_teams/2022/107_2022teams.json
target_key=league_teams/2022/108_2022teams.json
target_key=league_teams/2022/109_2022teams.json
target_key=league_teams/2022/10_2022teams.json
target_key=league_teams/2022/110_2022teams.json
target_key=league_teams/2022/111_2022teams.json
target_key=league_teams/2022/112_2022teams.json
target_key=league_teams/2022/113_2022teams.json
target_key=league_teams/2022/114_2022teams.json
target_key=league_teams/2022/115_2022teams.json
target_key=league_teams/2022/116_2022teams.json
target_key=league_teams/2022/117_2022teams.json
target_key=league_teams/2022/118_2022teams.json
target_key=league_teams/2022/119_2022team

target_key=league_teams/2022/260_2022teams.json
target_key=league_teams/2022/261_2022teams.json
target_key=league_teams/2022/262_2022teams.json
target_key=league_teams/2022/263_2022teams.json
target_key=league_teams/2022/265_2022teams.json
target_key=league_teams/2022/266_2022teams.json
target_key=league_teams/2022/267_2022teams.json
target_key=league_teams/2022/268_2022teams.json
target_key=league_teams/2022/269_2022teams.json
target_key=league_teams/2022/270_2022teams.json
target_key=league_teams/2022/271_2022teams.json
target_key=league_teams/2022/272_2022teams.json
target_key=league_teams/2022/273_2022teams.json
target_key=league_teams/2022/274_2022teams.json
target_key=league_teams/2022/275_2022teams.json
target_key=league_teams/2022/276_2022teams.json
target_key=league_teams/2022/277_2022teams.json
target_key=league_teams/2022/278_2022teams.json
target_key=league_teams/2022/279_2022teams.json
target_key=league_teams/2022/281_2022teams.json
target_key=league_teams/2022/282_2022tea

target_key=league_teams/2022/430_2022teams.json
target_key=league_teams/2022/431_2022teams.json
target_key=league_teams/2022/432_2022teams.json
target_key=league_teams/2022/433_2022teams.json
target_key=league_teams/2022/434_2022teams.json
target_key=league_teams/2022/435_2022teams.json
target_key=league_teams/2022/436_2022teams.json
target_key=league_teams/2022/439_2022teams.json
target_key=league_teams/2022/43_2022teams.json
target_key=league_teams/2022/440_2022teams.json
target_key=league_teams/2022/441_2022teams.json
target_key=league_teams/2022/442_2022teams.json
target_key=league_teams/2022/443_2022teams.json
target_key=league_teams/2022/444_2022teams.json
target_key=league_teams/2022/445_2022teams.json
target_key=league_teams/2022/446_2022teams.json
target_key=league_teams/2022/447_2022teams.json
target_key=league_teams/2022/448_2022teams.json
target_key=league_teams/2022/449_2022teams.json
target_key=league_teams/2022/44_2022teams.json
target_key=league_teams/2022/450_2022teams

target_key=league_teams/2022/60_2022teams.json
target_key=league_teams/2022/610_2022teams.json
target_key=league_teams/2022/611_2022teams.json
target_key=league_teams/2022/612_2022teams.json
target_key=league_teams/2022/613_2022teams.json
target_key=league_teams/2022/614_2022teams.json
target_key=league_teams/2022/615_2022teams.json
target_key=league_teams/2022/616_2022teams.json
target_key=league_teams/2022/617_2022teams.json
target_key=league_teams/2022/618_2022teams.json
target_key=league_teams/2022/619_2022teams.json
target_key=league_teams/2022/61_2022teams.json
target_key=league_teams/2022/620_2022teams.json
target_key=league_teams/2022/621_2022teams.json
target_key=league_teams/2022/622_2022teams.json
target_key=league_teams/2022/623_2022teams.json
target_key=league_teams/2022/624_2022teams.json
target_key=league_teams/2022/625_2022teams.json
target_key=league_teams/2022/626_2022teams.json
target_key=league_teams/2022/627_2022teams.json
target_key=league_teams/2022/628_2022teams

target_key=league_teams/2022/79_2022teams.json
target_key=league_teams/2022/800_2022teams.json
target_key=league_teams/2022/802_2022teams.json
target_key=league_teams/2022/809_2022teams.json
target_key=league_teams/2022/80_2022teams.json
target_key=league_teams/2022/810_2022teams.json
target_key=league_teams/2022/811_2022teams.json
target_key=league_teams/2022/812_2022teams.json
target_key=league_teams/2022/814_2022teams.json
target_key=league_teams/2022/815_2022teams.json
target_key=league_teams/2022/817_2022teams.json
target_key=league_teams/2022/818_2022teams.json
target_key=league_teams/2022/81_2022teams.json
target_key=league_teams/2022/820_2022teams.json
target_key=league_teams/2022/821_2022teams.json
target_key=league_teams/2022/822_2022teams.json
target_key=league_teams/2022/823_2022teams.json
target_key=league_teams/2022/824_2022teams.json
target_key=league_teams/2022/825_2022teams.json
target_key=league_teams/2022/826_2022teams.json
target_key=league_teams/2022/828_2022teams.