In [2]:
import sys
from time import sleep
from time import time
from pathlib import Path
import paramiko
from datetime import datetime
from tqdm import tqdm
from IPython.display import clear_output
import boto3
import pandas as pd
import seaborn as sns

sys.path.append('../src')
import cb_utils

In [None]:
def get_dev_ssh_client(env='dev'):
    """Open an SSH connection to the SFTP host configured for ``env``.

    Connection details are read from ``cb_utils.get_secrets()`` under the keys
    ``{env}_sftp_hostname``, ``{env}_sftp_username`` and ``{env}_sftp_password``.

    Args:
        env: Prefix selecting which set of secrets to use (default ``'dev'``).

    Returns:
        A connected ``paramiko.SSHClient``. The caller is responsible for
        calling ``close()`` on it.

    Raises:
        KeyError: If one of the expected secret keys is missing.
        Exception: Whatever ``SSHClient.connect`` raises; the client is closed
            before the error propagates.
    """
    secrets = cb_utils.get_secrets()

    client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification, so the server identity is not checked -- confirm this is
    # acceptable for the target environment.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(
            hostname=secrets[f'{env}_sftp_hostname'],
            username=secrets[f'{env}_sftp_username'],
            password=secrets[f'{env}_sftp_password'],
        )
    except BaseException:
        # Release the half-open client on any failure (including an
        # interrupted cell) instead of leaking it, then surface the error.
        client.close()
        raise
    return client

In [None]:
def list_files(client, path='/home/csc-xx-dev/input/'):
    """Return the remote files directly under ``path`` that look settled.

    Runs ``sudo find`` on the remote host, restricted to regular files at
    depth 1 whose status last changed more than 60 minutes ago.

    Args:
        client: Object exposing ``exec_command(command)`` that returns a
            ``(stdin, stdout, stderr)`` triple, such as a connected SSH client.
        path: Remote directory to search.

    Returns:
        list[str]: One entry per line of the command's stdout, with
        surrounding whitespace stripped.
    """
    # Pull files that haven't been changed in the past hour, to avoid picking
    # up a file that is still being written to.
    command = f'sudo find {path} -cmin +60 -type f -maxdepth 1'
    remote_in, remote_out, remote_err = client.exec_command(command)
    return [line.strip() for line in remote_out]

In [None]:
def print_files(i, files, times):
    """Redraw the cell output with one status line per file.

    Entries before index ``i`` are shown as processed together with their
    elapsed time, the entry at index ``i`` as in progress, and later entries
    as still to do.

    Args:
        i: Index of the file currently being processed.
        files: File names, in processing order.
        times: Elapsed seconds per file, parallel to ``files``.
    """
    clear_output(wait=True)
    for position, (name, elapsed) in enumerate(zip(files, times)):
        if position > i:
            print('TODO: ', name)
            continue
        if position == i:
            print('Processing ', name)
            continue
        print(f'Processed: {elapsed:.2f}s {name}')

def pull_files(client, files, client_name='csc-xx-dev', debug=False):
    """Download remote files over SFTP into a timestamped local directory.

    Files are written to ``../data/{client_name}/{YYYYmmdd_HHMMSS}/`` (relative
    to the current working directory), keeping each remote file's base name.
    The directory is only created when there is at least one file to pull.

    Args:
        client: Connected SSH client exposing ``open_sftp()``.
        files: Remote file paths to download.
        client_name: Sub-directory name under ``../data``; also shown in the
            debug output.
        debug: When true, print a header and redraw per-file progress via
            ``print_files``.

    Returns:
        list[Path]: Local paths of the downloaded files, in the order of
        ``files``.

    Raises:
        Exception: Whatever ``sftp.get`` or directory creation raises; the
            SFTP session is closed before the error propagates.
    """
    sftp = client.open_sftp()
    try:
        now = datetime.today().strftime('%Y%m%d_%H%M%S')
        file_dir = Path(f'../data/{client_name}/{now}')
        times = [0 for _ in files]
        n_files = len(times)
        local_files = []

        if debug:
            print(f'Pulling {n_files} files for {client_name}')

        # The target directory is the same for every file, so create it once,
        # and only when there is something to download.
        if n_files:
            file_dir.mkdir(parents=True, exist_ok=True)

        for i, file in enumerate(files):
            started = time()
            if debug:
                print_files(i, files, times)

            local_path = Path(file_dir, Path(file).name)
            sftp.get(str(file), local_path)

            local_files.append(local_path)
            times[i] = time() - started
    finally:
        # Always release the SFTP channel; the SSH client itself stays open
        # for the caller to close.
        sftp.close()

    if debug:
        # Use the file count rather than the loop variable, which is unbound
        # when ``files`` is empty.
        print_files(n_files, files, times)
    return local_files

In [None]:
# Open the SSH connection using the 'dev' set of SFTP secrets.
client = get_dev_ssh_client('dev')

In [None]:
# List candidate remote files under list_files' default input directory.
files = list_files(client)

In [None]:
# Download the listed files; debug=True turns on the per-file progress output.
local_files = pull_files(client, files, debug=True)

In [None]:
# Close the SSH connection now that the files have been pulled.
client.close()

In [None]:
# Create the S3 client used for the uploads below; no explicit credentials or
# region are passed here.
s3 = boto3.client('s3')

In [None]:
# Upload every pulled file to S3, requesting AES256 server-side encryption.
# NOTE(review): the key prefix uses a hard-coded timestamp (20201113_154306)
# rather than the timestamp of the directory the files were pulled into --
# confirm this is intended before re-running.
for file in local_files:
    # upload_file reads the file from its path, so no handle is opened here.
    key = f'csc-xx-dev/inbound/20201113_154306/{file.name}'
    s3.upload_file(
        str(file),
        'cb-analytics-us-east-2-prd',
        key,
        ExtraArgs={'ServerSideEncryption': 'AES256'},
    )