In [None]:
from pdnd.common.functions import *
import pandas as pd
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError

def get_s3_modification_times(bucket_name, prefix=''):
    """List the objects under an S3 prefix with their last-modified times.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Key prefix passed to the listing call; the default empty
            string applies no prefix filter.

    Returns:
        A DataFrame with the columns ``Key`` and ``LastModified``, one row
        per listed object. The two columns are present even when no object
        is listed, so callers can always index ``df['LastModified']``.

    If AWS credentials are missing or incomplete, the error is printed and
    an empty DataFrame with the same two columns is returned.
    """
    columns = ['Key', 'LastModified']

    # Initialize S3 client.
    # NOTE(review): Session is presumably provided by the
    # `pdnd.common.functions` star import - confirm.
    session = Session.from_env()
    s3_client = session.get_s3_client()

    paginator = s3_client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    all_objects = []

    try:
        for page in page_iterator:
            # A page carries no 'Contents' entry when it lists no objects.
            all_objects.extend(
                {'Key': obj['Key'], 'LastModified': obj['LastModified']}
                for obj in page.get('Contents', [])
            )
    except (NoCredentialsError, PartialCredentialsError) as e:
        print(f"Error with AWS credentials: {e}")
        return pd.DataFrame(columns=columns)

    # Passing `columns` keeps the schema stable when nothing was listed;
    # without it an empty listing produced a DataFrame with no columns.
    return pd.DataFrame(all_objects, columns=columns)

# Example usage: set the bucket to inspect (and optionally a key prefix)
# before running this cell; both are left blank here.
bucket_name = ''
prefix = ''
df = get_s3_modification_times(bucket_name=bucket_name, prefix=prefix)

# Show the collected keys and timestamps
print(df)

In [None]:
# This is the value we want: the median of the LastModified column of `df`,
# formatted as 'YYYY-MM-DD HH:MM:SS.ffffff' (microsecond precision).
# Requires `df` to have been built by the listing cell above.
df['LastModified'].median().strftime('%Y-%m-%d %H:%M:%S.%f')

In [None]:
def split_df_by_median(df):
    """Split a DataFrame in two around the median of its LastModified column.

    Args:
        df: DataFrame with a ``LastModified`` column holding datetimes or
            values that ``pd.to_datetime`` can parse. It is not modified.

    Returns:
        A tuple ``(first_part, second_part)``: the rows whose LastModified
        is at or before the median, and the rows strictly after it. In both
        parts the LastModified column holds the converted datetimes. For an
        empty input, two zero-row DataFrames with the input's columns are
        returned.
    """
    if df.empty:
        # Keep the input's columns so callers see the same schema either way.
        no_rows = df.iloc[:0]
        return no_rows.copy(), no_rows.copy()

    # Convert on a copy: assigning into `df` would silently change the
    # caller's data (and warn when `df` is itself a slice of another frame).
    converted = df.assign(LastModified=pd.to_datetime(df['LastModified']))

    # Calculate the median of the LastModified column
    timestamps = converted['LastModified']
    median_time = timestamps.median()

    # Split into two parts based on the median; rows equal to the median
    # go to the first part.
    first_part = converted[timestamps <= median_time]
    second_part = converted[timestamps > median_time]

    return first_part, second_part

# Example usage, assuming `df` was already built by the listing cell.
# first_part_df receives the rows at or before the median LastModified,
# second_part_df the rows strictly after it.
first_part_df, second_part_df = split_df_by_median(df)

# Print total length of the original DataFrame
print(f"Total length of the DataFrame: {len(df)}")

# Display the lengths of the two parts, for comparison with the total
print(f"Length of first part: {len(first_part_df)}")
print(f"Length of second part: {len(second_part_df)}")