Tests COS performance

In [None]:
!pip install ibm-cos-sdk python-dotenv pandas matplotlib

In [1]:
import ibm_boto3
from ibm_botocore.client import Config, ClientError
from dotenv import load_dotenv
import os
import time
from random import randrange
import pandas as pd
import threading

In [2]:
%load_ext dotenv
%dotenv cos_performance_test.credentials
%dotenv cos_performance_test.env

aws_access_key_id=os.environ.get('aws_access_key_id')
aws_secret_access_key=os.environ.get('aws_secret_access_key')
cos_endpoint_url=os.environ.get('cos_endpoint_url')

# block size 4 MB is default (recommended)
block_size=int(os.environ.get('block_size','4194304'))

# number of threads used to penetrate
num_threads=int(os.environ.get('num_threads','10'))

          
bucket=os.environ.get('bucket')
key=os.environ.get('key')
                


In [3]:
cos_client = ibm_boto3.client("s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    endpoint_url=cos_endpoint_url
)

In [4]:
"""
def get_all_s3_objects(s3, **base_kwargs):
    continuation_token = None
    while True:
        list_kwargs = dict(MaxKeys=1000, **base_kwargs)
        if continuation_token:
            list_kwargs['ContinuationToken'] = continuation_token
        response = s3.list_objects_v2(**list_kwargs)
        yield from response.get('Contents', [])
        if not response.get('IsTruncated'):  # At the end of the list?
            break
        continuation_token = response.get('NextContinuationToken')

for file in get_all_s3_objects(cos_client, Bucket='sentinel-1'):
    print(file)
"""

"\ndef get_all_s3_objects(s3, **base_kwargs):\n    continuation_token = None\n    while True:\n        list_kwargs = dict(MaxKeys=1000, **base_kwargs)\n        if continuation_token:\n            list_kwargs['ContinuationToken'] = continuation_token\n        response = s3.list_objects_v2(**list_kwargs)\n        yield from response.get('Contents', [])\n        if not response.get('IsTruncated'):  # At the end of the list?\n            break\n        continuation_token = response.get('NextContinuationToken')\n\nfor file in get_all_s3_objects(cos_client, Bucket='sentinel-1'):\n    print(file)\n"

In [5]:
def penetrate(blocks, bs, result_list) -> pd.DataFrame:
    df = pd.DataFrame(columns=['time1','time2', 'time3'])
    
    for i in range(100):
        start_bytes = randrange(blocks)*bs
        stop_bytes = start_bytes+bs
        range_string = 'bytes={}-{}'.format(start_bytes, stop_bytes)
        #print(range_string)
        start = time.time()
        resp = cos_client.get_object(Bucket=bucket, Key=key, Range=range_string)
        end = time.time()
        time1 = end - start
        start = time.time()
        res = resp['Body'].read()
        end = time.time()
        time2 = end - start
        time3 = time1 + time2
        df.loc[len(df.index)] = [time1, time2, time3]
    return result_list.append(df)
        

In [6]:
def run_test(num_threads, block_size):
    blocks = int(4277006357 / block_size) # TODO obtain object size from COS object
    
    threads = []
    dataframes = []
    
    for i in range(num_threads):
        thread = threading.Thread(target=penetrate, args=(blocks,block_size, dataframes))
        thread.start()
        threads.append(thread)
    
    
    for thread in threads:
        thread.join()
        
    df = pd.concat(dataframes)
    print(f'{num_threads} threads, {block_size} block size, Throughput: {1/df.time3.mean()*block_size/1000000000*num_threads} GB/s')

In [None]:
for num_threads in [1,2,4,8,16,32,64,128]:
    for block_size in [100000, 1000000,2*1000000,4*1000000,8*1000000,16*1000000]:
        run_test(num_threads, block_size)

In [7]:
for num_threads in [16]:
    for block_size in [32*1000000,64*1000000,128*1000000,256*1000000]:
        run_test(num_threads, block_size)

16 threads, 32000000 block size, Throughput: 0.6815147713675632 GB/s
16 threads, 64000000 block size, Throughput: 0.7323492942957063 GB/s
16 threads, 128000000 block size, Throughput: 0.8527259030804601 GB/s
16 threads, 256000000 block size, Throughput: 0.8752263597326885 GB/s


In [None]:
for num_threads in [16]:
    for block_size in [512*1000000,1024*1000000,2048*1000000,4096*1000000]:
        run_test(num_threads, block_size)