In [2]:
import os
import json
import pandas as pd
from jw_utils import ncbi_datasets_fxs as ndf
from orthofinder_utils import dash_ortho_parser_d as dop
from orthofinder_utils import dash_app_preprocess as dap
from orthofinder_utils import proteomes_for_orthofinder as pfo

ncbi.datasets module not found. To install, run `pip install ncbi-datasets-pylib`.


In [6]:
# Folder scanned for downloaded assemblies (presumably the unpacked NCBI
# Datasets download -- confirm). Every entry whose name starts with 'GC' is
# treated as an accession.
data_folder = './data/ncbi_dataset/data'
accs = [entry for entry in os.listdir(data_folder) if entry.startswith('GC')]

# Accessions listed in ./accessions.txt, one per line. Blank lines (for
# example a trailing empty line at the end of the file) are skipped so they
# do not end up in the list as empty-string accessions.
with open('./accessions.txt', 'r') as handle:
    accs2 = [line.strip() for line in handle if line.strip()]

In [7]:
# Run the jw_utils file check twice over data_folder: once for '.gff' files
# and once for '.faa' files, both restricted to names starting with 'GC'.
# NOTE(review): check_for_file is not defined in this notebook; presumably it
# reports the entries that lack a matching file -- confirm against the helper.
# Only the value of the last expression is displayed by the notebook, so the
# return value of the '.gff' call is discarded and the cell output reflects
# the '.faa' call only.
ndf.check_for_file(data_folder, prefix = 'GC', suffix = '.gff')
ndf.check_for_file(data_folder, prefix = 'GC', suffix = '.faa')

{}

### Copy ./data/Proteomes to an AWS S3 bucket

In [16]:
aws s3 cp ./data/Proteomes/  s3://palmer-baum-dedup-orthofinder/Proteomes/ --recursive

upload: Proteomes/GCA_000899095.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000899095.1.faa
upload: Proteomes/GCA_000410535.2.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000410535.2.faa
upload: Proteomes/GCA_001428975.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_001428975.1.faa
upload: Proteomes/GCA_000432755.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000432755.1.faa
upload: Proteomes/GCA_000436955.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000436955.1.faa
upload: Proteomes/GCA_001010885.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_001010885.1.faa
upload: Proteomes/GCA_000467105.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000467105.1.faa
upload: Proteomes/GCA_000403315.2.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000403315.2.faa
upload: Proteomes/GCA_000158575.1.faa to s3://mukherjee-lab/dash_app_csrA/Proteomes/GCA_000158575.1.faa
upload: Proteomes/GCA_000403435.2.faa to s3://mukherjee-lab/dash

#### Running OrthoFinder on AWS involves several steps. Here is a general outline of the process:  
1. Launch an EC2 instance on AWS.  
2. Configure the instance with the necessary software and dependencies, such as Python and  
OrthoFinder.  
3. Transfer your input data to the instance.  
4. Run OrthoFinder on the instance.  
5. Transfer the output data back to your local machine.  
#### Here are more detailed instructions:  
1. Launch an EC2 instance on AWS:  
    1. Log in to your AWS account.  
    2. Navigate to the EC2 dashboard.  
    3. Click "Launch Instance" to launch a new instance.  
    4. Choose an appropriate Amazon Machine Image (AMI) for your instance. You can use a pre-configured AMI that already has   
OrthoFinder installed or you can create a custom AMI with OrthoFinder installed.  
        1. e.g. Amazon Linux 2023 AMI  
    5. Select an instance type that meets your requirements. OrthoFinder is not very resource-intensive,  
so a small or medium-sized instance should be sufficient for most analyses   
        1. T2 micro is free or very cheap  
    6. Configure any additional settings as necessary (e.g., security groups, key pairs, etc.).
        1. Select or create a key-pair name e.g. jon-orthofinder
    7. Launch the instance.  
        1. There are several ‘connect’ buttons to push here...  
2. Configure the EC2 instance:  
    1. Once your instance is running, connect to it using SSH  
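    2. Configure the AWS CLI in the terminal by running `aws configure` and answering its prompts:
        - `$ aws configure`
        - `AWS Access Key ID [None]: ######`
        - `AWS Secret Access Key [None]: #####`  
        - `Default region name [None]: us-east-2`  
        - `Default output format [None]: json`  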
    3. Install any necessary dependencies, such as Python and OrthoFinder.
        - `$ mkdir ~/orthofinder`  
        - `$ cd ~/orthofinder`
        - `$ wget https://github.com/davidemms/OrthoFinder/releases/latest/download/OrthoFinder.tar.gz`  
        - `$ tar xzvf OrthoFinder.tar.gz`  
        - `$ cd OrthoFinder/`  
        - Check to see that orthofinder is installed and working.
        - `$ ./orthofinder -h`  
    4. You may need to configure your environment variables to point to the correct paths for OrthoFinder and other software  
3. Transfer your input data to the instance:  
    - There are several ways to transfer data to your instance, including SCP, SFTP, or AWS S3.  
    - For OrthoFinder, I transferred data from an AWS S3 bucket:
        1. `$ mkdir ./Proteomes`  
        2. `$ aws s3 cp s3://mukherjee-lab/dash_app_bdelivibrio/Proteomes ./Proteomes --recursive`
4. Run OrthoFinder:  
    - Run OrthoFinder using the command-line interface, specifying the appropriate input and output directories.
    - `$ ./orthofinder -f ./Proteomes`  
    - Monitor the progress of the analysis and ensure that it completes successfully.
5. Transfer the output data back to your local machine:
   Once OrthoFinder has finished running, transfer the output data back to your local machine using the same method used to transfer the input data.

**Note:** These steps are a general outline and may need to be modified based on your specific analysis requirements and the resources available to you.


## Script for running OrthoFinder on AWS

        #!/bin/bash
        # Install OrthoFinder on an EC2 instance, run it on the proteomes stored
        # in S3, then compress the results and upload them back to S3.

        # Stop at the first failing command (or unset variable) so that a failed
        # download is not followed by an OrthoFinder run on an empty directory.
        set -euo pipefail

        ##### Set AWS credentials
        # Replace the placeholders before running and never commit real keys.
        # Attaching an IAM role to the instance avoids storing keys here at all.
        export AWS_ACCESS_KEY_ID=#####
        export AWS_SECRET_ACCESS_KEY=####
        # The AWS CLI reads AWS_DEFAULT_REGION and AWS_DEFAULT_OUTPUT; it does
        # not read DEFAULT_REGION_NAME or DEFAULT_OUTPUT_FORMAT.
        export AWS_DEFAULT_REGION=us-east-2
        export AWS_DEFAULT_OUTPUT=json

        ##### Install OrthoFinder
        # Use the same path for mkdir and cd so the script works from any
        # starting directory; -p keeps mkdir from failing if the directory
        # already exists.
        mkdir -p ~/orthofinder
        cd ~/orthofinder
        wget https://github.com/davidemms/OrthoFinder/releases/latest/download/OrthoFinder.tar.gz
        tar xzvf OrthoFinder.tar.gz
        cd OrthoFinder/
        mkdir -p ./Proteomes

        ##### Download files from S3 bucket
        aws s3 cp s3://mukherjee-lab/dash_app_csrA/Proteomes ./Proteomes --recursive

        ##### Confirm files were downloaded
        ls ./Proteomes

        ##### Run OrthoFinder
        current_date=$(date +%Y-%m-%d)
        output_folder="${current_date}_OF_Results"
        # OrthoFinder's option for a non-default results directory is -o.
        # NOTE(review): OrthoFinder is expected to create this directory itself,
        # so it should not exist beforehand -- confirm against the README.
        ./orthofinder -f ./Proteomes -o "$output_folder"

        ##### install pigz to compress directory using multiple threads and move compressed results to s3 bucket on aws
        sudo yum install pigz -y
        output_folder_compressed="${output_folder}.tar.gz"
        time tar -I pigz -cf "$output_folder_compressed" "$output_folder"
        echo "$output_folder"
        aws s3 cp "$output_folder_compressed" s3://mukherjee-lab/dash_app_csrA/