# S3 Bucket data processing
> Listing, downloading, and uploading data in an S3 bucket with boto3

In [None]:
#| default_exp data_processing.s3_bucket

In [None]:
#| hide
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import sys
from pathlib import Path
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


In [None]:
#| export
import os
import boto3
from nbdev.showdoc import show_doc
from botocore.exceptions import ClientError

In [None]:
#| export
# Hard-coded, machine-specific location of the cv_tools checkout, appended to
# sys.path (presumably so that `cv_tools` can be imported later — confirm).
# NOTE(review): this absolute path only exists on the author's machine.
CV_TOOLS = Path(r'/home/ai_sintercra/homes/hasan/projects/git_data/cv_tools')
sys.path.append(str(CV_TOOLS))


In [None]:
#| export
# Second hard-coded, machine-specific library directory appended to sys.path.
# NOTE(review): which imports depend on this directory is not visible here — confirm.
custom_lib_path = Path(r'/home/ai_warstein/homes/goni/custom_libs')
sys.path.append(str(custom_lib_path))


In [None]:
#| export
from cv_tools.imports import *
from dotenv import load_dotenv


In [None]:
#| export
# Load environment variables from a hard-coded, machine-specific .env file.
# This runs at import time of the exported module. The recorded run returned
# False (presumably because the file was not found on that machine — confirm).
load_dotenv(dotenv_path=f'/home/ai_sintercra/homes/hasan/projects/git_data/2023_easy_pin_detection/private_easy_pin_detection/.env')

False

In [None]:
# Notebook-only (not exported): load the .env file resolved relative to the
# parent of the current working directory instead of an absolute path.
load_dotenv(dotenv_path=Path(Path.cwd().parent, 'private_easy_pin_detection/.env'))

True

In [None]:
#| eval:false
# Read the S3 credentials from the process environment.
# Indexing os.environ raises KeyError when a variable is not set.
s3_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
s3_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

In [None]:
#| export
# Hard-coded absolute path of the notebooks directory on the author's machine.
# NOTE(review): the name is misspelled ("CURRETNT"); it is exported, so renaming
# it could break importers. It is not referenced elsewhere in this notebook.
CURRETNT_NB='/home/ai_sintercra/homes/hasan/projects/git_data/2023_easy_pin_detection/nbs'

In [None]:
#| export
def get_client(
    s3_access_key_id:str,
    s3_secret_access_key:str,
    endpoint_url:str='https://s3warceph01.infineon.com',
    verify:bool=False):
    """
    Create a boto3 S3 client for the given endpoint and credentials.

    Args:
    s3_access_key_id (str): Access key ID, passed as `aws_access_key_id`.
    s3_secret_access_key (str): Secret access key, passed as `aws_secret_access_key`.
    endpoint_url (str): URL of the S3 endpoint to talk to.
    verify (bool): Forwarded unchanged as boto3's `verify` option (defaults to False).

    Returns:
    Whatever `boto3.client('s3', ...)` returns for these options.
    """
    client_options = {
        'endpoint_url': endpoint_url,
        'aws_access_key_id': s3_access_key_id,
        'aws_secret_access_key': s3_secret_access_key,
        'verify': verify,
    }
    return boto3.client('s3', **client_options)
        
       

In [None]:
#client=get_client(s3_access_key_id, s3_secret_access_key)

The boto3 documentation for `list_objects_v2` is [here](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/list_objects_v2.html).

In [None]:
#| export
def list_s3_folder_contents(
    boto3_client: boto3.client,
    bucket_name: str,
    folder_prefix: str,
    recursive: bool = False) -> List[str]:
    """
    List the contents of an S3 "folder" (key prefix) as a sorted list of strings.

    `folder_prefix` should end with '/'; it is used verbatim as the S3 `Prefix`.

    Args:
    boto3_client: Client object providing `get_paginator('list_objects_v2')`.
    bucket_name (str): Bucket to list.
    folder_prefix (str): Key prefix of the folder to list.
    recursive (bool): If True, return every key under the prefix. If False,
        return only the immediate contents of the folder.

    Returns:
    A sorted list without duplicates. The key equal to `folder_prefix` itself
    is never included.
    - recursive=True: full object keys.
    - recursive=False: sub-folders as full prefixes (taken from
      `CommonPrefixes`) and direct children as names relative to
      `folder_prefix`.
    If any exception occurs, the error is printed and an empty list is returned.
    """
    found = set()
    try:
        # Without a delimiter S3 returns every key below the prefix; with '/'
        # it groups deeper keys into CommonPrefixes.
        query = {'Bucket': bucket_name, 'Prefix': folder_prefix}
        if not recursive:
            query['Delimiter'] = '/'

        paginator = boto3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(**query):
            if not recursive:
                found.update(entry['Prefix'] for entry in page.get('CommonPrefixes', ()))

            for obj in page.get('Contents', ()):
                key = obj['Key']
                if key == folder_prefix:
                    # The folder's own placeholder object is not a content item.
                    continue
                if recursive:
                    found.add(key)
                    continue

                remainder = key[len(folder_prefix):]
                trimmed = remainder.strip('/')
                # Keep only direct children: non-empty and with no inner '/'.
                if trimmed and '/' not in trimmed:
                    found.add(remainder.partition('/')[0])

        contents_list = sorted(found)
        if not contents_list:
            print(f"No items found in the folder '{folder_prefix}'.")
        else:
            print(f"Found {len(contents_list)} items in the folder '{folder_prefix}':")
            if len(contents_list) > 10:
                print("...")

        return contents_list
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []



In [None]:
#| hide
#| eval: false
#folder='projects/2024_Pin_Detection_End_Test/HG/Datasets/'
#files_ = list_s3_folder_contents(
    #boto3_client=get_client(s3_access_key_id, s3_secret_access_key), 
    #bucket_name='s3-ai-warstein', 
    #folder_prefix=folder,
    #recursive=False
#)
##| export

In [None]:
#| export
def ls_s3(
    folder_prefix: str,
    bucket_name: str='s3-ai-warstein',
    s3_access_key_id: str=None,#os.environ['AWS_ACCESS_KEY_ID'],
    s3_secret_access_key: str=None,#os.environ['AWS_SECRET_ACCESS_KEY'],
    recursive: bool=False,
    endpoint_url: str='https://s3warceph01.infineon.com',
    verify: bool=False,
    ) -> List[str]:
    """
    Convenience wrapper: build an S3 client with `get_client` and list a folder.

    Args:
    folder_prefix (str): Key prefix of the folder to list.
    bucket_name (str): Bucket to list.
    s3_access_key_id (str): Access key ID forwarded to `get_client`.
    s3_secret_access_key (str): Secret access key forwarded to `get_client`.
    recursive (bool): Forwarded to `list_s3_folder_contents`.
    endpoint_url (str): S3 endpoint URL forwarded to `get_client`.
    verify (bool): Forwarded to `get_client`.

    Returns:
    The list returned by `list_s3_folder_contents` for this client and folder.
    """
    credentials = {
        's3_access_key_id': s3_access_key_id,
        's3_secret_access_key': s3_secret_access_key,
    }
    client = get_client(endpoint_url=endpoint_url, verify=verify, **credentials)
    listing = list_s3_folder_contents(
        boto3_client=client,
        bucket_name=bucket_name,
        folder_prefix=folder_prefix,
        recursive=recursive,
    )
    return listing

In [None]:
#folders = ls_s3(folder_prefix='projects/2024_Pin_Detection_End_Test/')

Found 1 items in the folder 'projects/2024_Pin_Detection_End_Test/':
projects/2024_Pin_Detection_End_Test/HG/


# Download a folder locally

In [None]:
#| export
def download_s3_folder(
    s3_folder: str,
    local_dir: str,
    bucket_name: str = 's3-ai-warstein',
    s3_access_key_id: str = None, # os.environ['AWS_ACCESS_KEY_ID']
    s3_secret_access_key: str = None, # os.environ['AWS_SECRET_ACCESS_KEY']
    endpoint_url: str = 'https://s3warceph01.infineon.com',
    verify: bool = False,
    file_num: int = None
):
    """
    Download all contents of an S3 folder to a local directory.

    The folder is listed recursively and each key is written below `local_dir`
    at its path relative to `s3_folder`. Keys ending in '/' (folder markers)
    only create the matching local directory. A failed download is reported
    and does not stop the remaining downloads.

    Args:
    s3_folder (str): The S3 folder path to download from ('/' is appended if missing).
    local_dir (str): The local directory to save the downloaded files.
    bucket_name (str): The S3 bucket name.
    s3_access_key_id (str): AWS access key ID.
    s3_secret_access_key (str): AWS secret access key.
    endpoint_url (str): S3 endpoint URL.
    verify (bool): Whether to verify SSL certificates.
    file_num (int): If given, only the first `file_num` listed keys (in sorted
        order) are processed; None processes all of them.

    Returns:
    None. Progress and errors are printed.
    """
    s3_client = get_client(
        s3_access_key_id=s3_access_key_id,
        s3_secret_access_key=s3_secret_access_key,
        endpoint_url=endpoint_url,
        verify=verify
    )

    # Ensure the S3 folder path ends with a '/'
    if not s3_folder.endswith('/'):
        s3_folder += '/'

    # List with the SAME client used for downloading, so the listing honours
    # the caller's endpoint_url / verify instead of silently using the defaults.
    objects = list_s3_folder_contents(
        boto3_client=s3_client,
        bucket_name=bucket_name,
        folder_prefix=s3_folder,
        recursive=True
    )
    if file_num is not None:
        objects = objects[:file_num]

    failed = []
    for obj in objects:
        # Path of the object relative to the requested folder
        relative_path = obj[len(s3_folder):]
        local_file_path = os.path.join(local_dir, relative_path)

        if obj.endswith('/'):
            # Folder marker: there is no file content to fetch; mirror the directory only.
            os.makedirs(local_file_path, exist_ok=True)
            continue

        # dirname is '' when the target is a bare file name; os.makedirs('') would raise.
        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        try:
            s3_client.download_file(bucket_name, obj, local_file_path)
            print(f"Downloaded: {obj} to {local_file_path}")
        except Exception as e:
            # Best effort: report and continue with the remaining objects.
            failed.append(obj)
            print(f"Error downloading {obj}: {str(e)}")

    if failed:
        print(f"Download finished with {len(failed)} error(s) out of {len(objects)} object(s) from {s3_folder}")
    else:
        print(f"Download complete. All files from {s3_folder} have been downloaded to {local_dir}")


Found 26 items in the folder 'projects/2024_Pin_Detection_End_Test/HG/Datasets/images/':
...
Downloaded: projects/2024_Pin_Detection_End_Test/HG/Datasets/images/1_gen_image_2_VFV4.7.9.5_2024022807271781_ID_00943048112818112132406_In_87_r_1_FRONT_Additional Lead_image2_col_0_0_mean_0.png to n:\homes\hasan\projects\git_data\2023_easy_pin_detection\data\1_gen_image_2_VFV4.7.9.5_2024022807271781_ID_00943048112818112132406_In_87_r_1_FRONT_Additional Lead_image2_col_0_0_mean_0.png
Downloaded: projects/2024_Pin_Detection_End_Test/HG/Datasets/images/1_gen_image_2_VFV4.7.9.5_2024022807271781_ID_00943048112818112132406_In_87_r_1_FRONT_Additional Lead_image2_col_0_0_mean_10.png to n:\homes\hasan\projects\git_data\2023_easy_pin_detection\data\1_gen_image_2_VFV4.7.9.5_2024022807271781_ID_00943048112818112132406_In_87_r_1_FRONT_Additional Lead_image2_col_0_0_mean_10.png
Downloaded: projects/2024_Pin_Detection_End_Test/HG/Datasets/images/1_gen_image_2_VFV4.7.9.5_2024022807271781_ID_009430481128181121

In [None]:
#| hide
#| eval: false
# Example usage:
#download_s3_folder(
     #s3_folder='projects/2024_Pin_Detection_End_Test/HG/Datasets/images',
     #local_dir=data_dir,
     #bucket_name='s3-ai-warstein',
     #file_num=None
 #)

In [None]:
#| export
def upload_to_s3(
    local_path,
    s3_folder,
    bucket_name,
    s3_access_key_id,
    s3_secret_access_key,
    endpoint_url: str = 'https://s3warceph01.infineon.com',
    verify: bool = False,
    ):
    """
    Upload a file or folder to S3 bucket. If the folder doesn't exist, it will be created.

    A directory is uploaded recursively: files directly inside it go to
    `s3_folder`, and each sub-directory goes to `s3_folder/<sub-directory name>/`.
    A single client is created and reused for the whole tree, so `endpoint_url`
    and `verify` apply to every upload. A failed upload is reported and does
    not stop the remaining uploads.

    :param local_path: Path to the local file or directory to upload
    :param s3_folder: S3 folder path where the file/folder will be uploaded ('/' is appended if missing)
    :param bucket_name: Name of the S3 bucket
    :param s3_access_key_id: AWS access key ID
    :param s3_secret_access_key: AWS secret access key
    :param endpoint_url: S3 endpoint URL
    :param verify: Whether to verify SSL certificates
    :return: None. Progress and errors are printed.
    """
    source = Path(local_path)
    if not (source.is_file() or source.is_dir()):
        # Nothing was uploaded, so do not fall through to the completion message.
        print(f"Error: {local_path} is not a valid file or directory")
        return

    # Create the S3 client once; sub-directories reuse it.
    s3_client = get_client(
        s3_access_key_id=s3_access_key_id,
        s3_secret_access_key=s3_secret_access_key,
        endpoint_url=endpoint_url,
        verify=verify,
    )

    # Ensure the S3 folder path ends with a '/'
    if not s3_folder.endswith('/'):
        s3_folder += '/'

    failed = _upload_path_to_s3(s3_client, source, s3_folder, bucket_name)

    if failed:
        print(f"Upload finished with {failed} error(s) while uploading {local_path} to {s3_folder} in bucket {bucket_name}")
    else:
        print(f"Upload complete. All files from {local_path} have been uploaded to {s3_folder} in bucket {bucket_name}")


def _upload_path_to_s3(s3_client, path, s3_folder, bucket_name):
    """Upload one file, or every entry of one directory recursively; return the number of failed uploads."""
    if path.is_file():
        s3_path = s3_folder + path.name
        try:
            # str(): pass a plain string file name regardless of the input type.
            s3_client.upload_file(str(path), bucket_name, s3_path)
            print(f"Uploaded: {path} to {s3_path}")
            return 0
        except Exception as e:
            # Best effort: report and let the caller continue with other files.
            print(f"Error uploading {path}: {str(e)}")
            return 1

    failures = 0
    # Read the directory once; sorting makes the upload order deterministic.
    entries = sorted(path.iterdir())
    for entry in tqdm(entries, total=len(entries)):
        if entry.is_file():
            failures += _upload_path_to_s3(s3_client, entry, s3_folder, bucket_name)
        elif entry.is_dir():
            failures += _upload_path_to_s3(s3_client, entry, s3_folder + entry.name + '/', bucket_name)
    return failures

In [None]:
#| export
@call_parse
def upload_download_s3(
    download:Param(help='whether to download or upload',type=bool, action='store_true' ),
    verify:Param(help='whether to verify ssl certificates',type=bool, action='store_false'),
    local_path:Param(help='local path to the file or folder to upload',type=str)='test',
    s3_folder:Param(help='s3 folder path where the file or folder will be uploaded',type=str)='s_test',
    bucket_name:Param(help='name of the s3 bucket',type=str)='s3-ai-warstein',
):
    """
    Download an S3 folder to a local path, or upload a local file or folder to S3.

    Credentials are read from the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
    environment variables after loading the project's .env file.

    :param download: If true, download `s3_folder` into `local_path`; otherwise upload `local_path` to `s3_folder`
    :param verify: Whether to verify SSL certificates
    :param local_path: Local file or directory to upload, or local directory to download into
    :param s3_folder: S3 folder path to upload to or download from
    :param bucket_name: Name of the S3 bucket
    :raises KeyError: If a required credential environment variable is not set
    """
    # load env variables
    # NOTE(review): hard-coded, machine-specific .env location.
    env_file = '/home/ai_sintercra/homes/hasan/projects/git_data/2023_easy_pin_detection/private_easy_pin_detection/.env'
    load_dotenv(dotenv_path=env_file)

    # Fail with an actionable message (same exception type as a plain
    # os.environ[...] lookup) when the credentials are not available.
    required_vars = ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
    missing_vars = [name for name in required_vars if name not in os.environ]
    if missing_vars:
        raise KeyError(
            f"Missing environment variable(s) {', '.join(missing_vars)}: "
            f"set them in the environment or in {env_file}"
        )

    # get env variables
    s3_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
    s3_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']
    endpoint_url = 'https://s3warceph01.infineon.com'

    if download:
        download_s3_folder(
            s3_folder=s3_folder,
            local_dir=local_path,
            bucket_name=bucket_name,
            s3_access_key_id=s3_access_key_id,
            s3_secret_access_key=s3_secret_access_key,
            endpoint_url=endpoint_url,
            verify=verify
        )
    else:
        upload_to_s3(
            local_path=local_path,
            s3_folder=s3_folder,
            bucket_name=bucket_name,
            s3_access_key_id=s3_access_key_id,
            s3_secret_access_key=s3_secret_access_key,
            endpoint_url=endpoint_url,
            verify=verify
        )
    

In [None]:
#| hide
#| eval: false
# Example (not exported, not evaluated by nbdev): upload a local folder to the
# project's test prefix, reading the credentials from the environment.
# NOTE(review): 'E:/test' is a machine-specific Windows path.
local_dir = Path(r'E:/test')
upload_to_s3(
    local_path=local_dir,
    s3_folder='projects/2024_Pin_Detection_End_Test/HG/Datasets/test/',
    bucket_name='s3-ai-warstein',
    s3_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    s3_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)

Uploaded: E:\test\81618774_VFV4.1.5.2_2023011217422261_ID_00047043795816187742241_In_17_r_1_FRONT_Missing Lead_image1_var_70.png to projects/2024_Pin_Detection_End_Test/HG/Datasets/test/81618774_VFV4.1.5.2_2023011217422261_ID_00047043795816187742241_In_17_r_1_FRONT_Missing Lead_image1_var_70.png
Upload complete. All files from E:\test have been uploaded to projects/2024_Pin_Detection_End_Test/HG/Datasets/test/ in bucket s3-ai-warstein


In [None]:
#| hide
import nbdev; nbdev.nbdev_export('08_data_processing.s3_bucket.ipynb')