# SPLIT

This notebook will help you split the original daily files from the CHILES observation into chunks of N MHz.

The code requires Python 3.6 or above. Run the next cell to check the version your kernel is using.

In [None]:
from platform import python_version

# Report the version of the interpreter running this kernel.
running_version = python_version()
print(running_version)

We keep the last 10 runs of data so you can see what you used last time.

In [None]:
import os

from configobj import ConfigObj

# The settings file lives in the aws-chiles02 directory under the current working directory.
path_dirname = os.getcwd()
config_file_name = '{0}/aws-chiles02/aws-chiles02.settings'.format(path_dirname)

# Load the stored settings when the file is present; otherwise start with an
# empty configuration that is bound to the same file name.
if not os.path.exists(config_file_name):
    config = ConfigObj(indent_type='    ')
    config.filename = config_file_name
else:
    config = ConfigObj(config_file_name, indent_type='    ')

# Show the names of the stored split sections in reverse-sorted order.
previous_runs = [
    name
    for name in sorted(config.keys(), reverse=True)
    if name.startswith('split') and isinstance(config[name], dict)
]
for name in previous_runs:
    print(name)

If you want to reuse the settings from a previous run, replace `KEY` with the key you want to use, then uncomment and set any values you want to override.

In [None]:
# Replace 'KEY' with the name of the stored section you want to reuse.
selected_key = 'KEY'
data_to_use = config[selected_key]

## Override values here
# NOTE(review): values read back from the settings file may come back as strings
# rather than the bool/int/float that was stored - confirm before relying on
# entries such as 'shutdown', 'use_bash' or 'width'.
# data_to_use['create_use_json']
# data_to_use['bucket_name']
# data_to_use['width']
# data_to_use['observation_phase']
# data_to_use['shutdown']
# data_to_use['use_bash']
# data_to_use['volume']
# data_to_use['run_note']
# data_to_use['split_directory']
# data_to_use['casa_version']
# data_to_use['ami']
# data_to_use['spot_price_i3_2xlarge']
# data_to_use['spot_price_i3_4xlarge']
# data_to_use['days_per_node']
# data_to_use['dim']
# data_to_use['nodes']
# data_to_use['parallel_streams']

# Show the settings that will be used.
print(data_to_use)

If you want to set up the values from scratch, edit and run the next cell instead.

In [None]:
from aws_chiles02.settings_file import AWS_AMI_ID

# Settings that apply to every run mode.
data_to_use = dict(
    # Choose between 'create', 'use' and 'json'
    create_use_json='create',

    # The bucket to use
    bucket_name='13b-266',

    # The width of the frequency in MHz
    width=4,

    # The phase - 1, 2 or 3
    observation_phase=1,

    # Should a shutdown drop be added
    shutdown=True,

    # The code can use Docker or bash. Bash is preferred as it is much quicker
    use_bash=True,

    # The directory on the host where the data is put and which is bound to the
    # Docker apps (if Docker is to be used)
    volume='/mnt/daliuge/dlg_root',

    # Notes about this run
    run_note='''
Run note
''',
)

# Where the data is put in AWS; the name is built from the width and the phase
data_to_use['split_directory'] = 'split_{}_{}'.format(
    data_to_use['width'],
    data_to_use['observation_phase'],
)

# The CASA version is only recorded for bash runs. The AWS instance will switch
# to that version. The options are '4.7' and '5.1'
if data_to_use['use_bash']:
    data_to_use['casa_version'] = '5.1'

# Add the settings that are specific to the chosen run mode
run_mode = data_to_use['create_use_json']
if run_mode == 'create':
    data_to_use.update(
        # The AMI to use
        ami=AWS_AMI_ID,
        # Spot price for i3.2xlarge, used for the smaller files
        spot_price_i3_2xlarge=0.6,
        # Spot price for i3.4xlarge, used for the bigger files
        spot_price_i3_4xlarge=1.2,
        # Number of days per node. The more days per node the slower it will run
        days_per_node=100,
    )
elif run_mode == 'use':
    # The IP of the DataIsland Manager
    data_to_use['dim'] = '127.0.0.1'
else:
    data_to_use.update(
        # The number of nodes
        nodes=8,
        # The number of parallel streams
        parallel_streams=4,
    )


Check the values before we call the code

In [None]:
# Show the settings that will be stored and passed to the split code.
print(data_to_use)

Now store the run details so we can reuse them in the future.

In [None]:
from datetime import datetime
# Name the new section after the current date and time.
today = datetime.now()
section_name = 'split_{:%Y_%m_%d_%H_%M_%S}'.format(today)

config[section_name] = data_to_use

# Gather the names of all stored split sections in reverse-sorted order,
# which puts the most recent timestamp first.
split_keys = [
    name
    for name in sorted(config.keys(), reverse=True)
    if name.startswith('split') and isinstance(config[name], dict)
]

# Keep the first ten and delete the rest (the slice is empty when there are
# ten or fewer sections).
for old_key in split_keys[10:]:
    del config[old_key]

config.write()

Make the calls to the code to start the split process

In [None]:
# Run the command
from aws_chiles02.generate_mstransform_graph import create_and_generate, use_and_generate, build_json
from aws_chiles02.settings_file import DIM_PORT

# Pick the entry point from the run mode stored in data_to_use. The mode must be
# read from data_to_use: `config` is the settings file object, whose top-level
# keys are the stored section names, so it has no 'create_use_json' entry.
run_mode = data_to_use['create_use_json']

# NOTE(review): every branch reads data_to_use['casa_version'], but the
# from-scratch cell only sets it when 'use_bash' is truthy - confirm what should
# be passed for a Docker run.
if run_mode == 'create':
    # 'create': pass the AMI, spot prices and days per node with the common settings
    create_and_generate(
        bucket_name=data_to_use['bucket_name'],
        frequency_width=data_to_use['width'],
        ami_id=data_to_use['ami'],
        spot_price1=data_to_use['spot_price_i3_2xlarge'],
        spot_price2=data_to_use['spot_price_i3_4xlarge'],
        volume=data_to_use['volume'],
        days_per_node=data_to_use['days_per_node'],
        add_shutdown=data_to_use['shutdown'],
        use_bash=data_to_use['use_bash'],
        casa_version=data_to_use['casa_version'],
        split_directory=data_to_use['split_directory'],
        observation_phase=data_to_use['observation_phase'],
        run_note=data_to_use['run_note'],
    )
elif run_mode == 'use':
    # 'use': pass the DataIsland Manager host and port with the common settings
    use_and_generate(
        host=data_to_use['dim'],
        port=DIM_PORT,
        bucket_name=data_to_use['bucket_name'],
        frequency_width=data_to_use['width'],
        volume=data_to_use['volume'],
        add_shutdown=data_to_use['shutdown'],
        use_bash=data_to_use['use_bash'],
        casa_version=data_to_use['casa_version'],
        split_directory=data_to_use['split_directory'],
        observation_phase=data_to_use['observation_phase'],
        run_note=data_to_use['run_note'],
    )
else:
    # Any other mode: pass the node and parallel stream counts to build_json
    build_json(
        bucket=data_to_use['bucket_name'],
        width=data_to_use['width'],
        volume=data_to_use['volume'],
        nodes=data_to_use['nodes'],
        parallel_streams=data_to_use['parallel_streams'],
        add_shutdown=data_to_use['shutdown'],
        use_bash=data_to_use['use_bash'],
        casa_version=data_to_use['casa_version'],
        split_directory=data_to_use['split_directory'],
        observation_phase=data_to_use['observation_phase'],
        run_note=data_to_use['run_note'],
    )
