# Convert the leagues JSON array files into JSON Lines (one record per line) files

In [1]:
import configparser
import os
import boto3
import json
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Parse the local 'dl.cfg' settings file into `config`.
config = configparser.ConfigParser()
# read() returns the list of files it parsed successfully (displayed as the
# cell output); a missing file is skipped silently rather than raising.
config.read('dl.cfg')

['dl.cfg']

In [3]:
# Copy both AWS credential values from the [AWS] config section into the
# process environment under the same names.
for _aws_var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[_aws_var] = config['AWS'][_aws_var]

In [4]:
# Key prefixes inside the bucket: where the original JSON files are read from
# and where their untouched backup copies are stored.
source_folder, dest_folder = 'leagues/', 'backup/leagues/'

In [5]:
# Name of the bucket every request in this notebook targets, and the
# low-level S3 client used to issue those requests.
bucket_name = 'capstonefootballbucket'
s3_client = boto3.client(service_name='s3')

## Convert each JSON array file into a JSON Lines file

In [7]:
def _flatten_league(item):
    """Expand one league record into one flat row per season.

    Nested objects are flattened with '_' as the separator and every column
    coming from a season gets a 'seasons_' prefix. A record whose 'seasons'
    list is missing or empty yields a single row holding only the
    league-level columns, so the league is not lost.
    """
    league_df = pd.json_normalize(item, sep='_')
    # The raw list column is replaced by the expanded season columns below.
    league_df = league_df.drop(columns=['seasons'], errors='ignore')
    seasons = item.get('seasons') or []
    if not seasons:
        return league_df
    seasons_df = pd.json_normalize(seasons, sep='_').add_prefix('seasons_')
    # Repeat the league row so it lines up with every season row.
    league_df = pd.concat([league_df] * len(seasons_df), ignore_index=True)
    return pd.concat([league_df, seasons_df], axis=1)


# For every object directly under `source_folder`: flatten its JSON array into
# JSON Lines, back up the original under `dest_folder`, then overwrite the
# source key with the converted content.
paginator = s3_client.get_paginator("list_objects_v2")
response = paginator.paginate(Bucket=bucket_name,  Prefix=source_folder, Delimiter='/', PaginationConfig={"PageSize": 300})
count = 0
for page in response:
    # A page without matching objects has no "Contents" entry at all.
    for s3_object in page.get("Contents", []):
        key = s3_object['Key']
        # Skip the zero-byte "folder" placeholder object.
        if key.endswith('/'):
            continue

        # Read and parse the original JSON array file.
        file_obj = s3_client.get_object(Bucket=bucket_name, Key=key)
        file_content = file_obj["Body"].read().decode('utf-8')
        data = json.loads(file_content)

        # Build all per-league frames first and concatenate once;
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        frames = [_flatten_league(item) for item in data]
        flattened_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        json_lines = flattened_data.to_json(orient='records', lines=True)

        # Build the backup path by swapping only the leading prefix;
        # str.replace would also rewrite later occurrences inside the key.
        target_key = dest_folder + key[len(source_folder):]
        print(f"target_key={target_key}")

        # Back up the original object.
        s3_client.copy_object(Bucket=bucket_name, CopySource=f"{bucket_name}/{key}", Key=target_key)
        # Verify that the target object exists before touching the source.
        waiter = s3_client.get_waiter('object_exists')
        waiter.wait(Bucket=bucket_name, Key=target_key)

        # Overwrite the source key with the converted file. put_object
        # replaces the existing object, so no prior delete is needed (a
        # separate delete would leave the key missing if the upload failed).
        s3_client.put_object(Bucket=bucket_name, Key=key, Body=json_lines.encode('utf-8'))

target_key=backup/leagues/leagues.json
