In [1]:
import os
import pathlib
import socket
import numpy as np
import pandas as pd
import re
# import create_sample_sheet

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
# Several cells below rely on this to display more than one result.
InteractiveShell.ast_node_interactivity = "all"

In [2]:

#import modules 
import argparse
import os
import re
import itertools
import boto3
import logging
from botocore.exceptions import ClientError
import numpy as np
import pandas as pd


def create_sample_sheet(bucket_name, prefix_name, filetype="fastq", samples=None, filename="sample_sheet.txt", write=True):
    """
    Query an S3 bucket, pair the fastq files found directly under a prefix,
    and build a sample sheet with the columns Sample, R1 and R2.

    bucket_name: string. Example: "fh-pi-my-bucket"
    prefix_name: string. Need trailing slash. Example: "SR/myfiles/"
    filetype: not used by this function; kept so existing calls keep working.
    samples: optional string of sample IDs separated by spaces and/or commas.
        Each ID is used as a regular expression searched in the sample name.
        None or an empty string keeps every sample.
    filename: path of the tab-separated file written when write=True.
    write: if True, write the sheet to `filename` and return None;
        if False, return the sheet as a pandas DataFrame.

    Samples that match the filter but have fewer than two read files are
    reported and left out of the sheet.
    """

    #function to parse the object summary from Boto3 for fastqs
    def sample_name(s3_object_summary):
        # Use the file name itself rather than a fixed path position, so the
        # prefix may have any number of levels.
        basename = s3_object_summary.key.rsplit("/", 1)[-1]
        return re.sub(r"[._][Rr][12].+$", "", basename)

    #Connection to S3 Bucket
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    delim = "/"

    #query S3 bucket to list the objects directly under the prefix
    fqs = bucket.objects.filter(Delimiter=delim,
                                Prefix=prefix_name)

    #group the fastq URIs by sample name
    fqs_PE = dict()
    for obj in fqs:
        if re.search(r"[._-][Rr][12]\.(fastq|fq)", obj.key):
            read = '{0}//{1}/{2}'.format("s3:", obj.bucket_name, obj.key)
            fqs_PE.setdefault(sample_name(obj), []).append(read)

    #sort once per sample so R1 comes before R2
    for reads in fqs_PE.values():
        reads.sort()

    n_files = sum(len(reads) for reads in fqs_PE.values())
    print("There are " + str(n_files) + " Fastq files.")

    #split the samples for filtering by space or comma
    #empty tokens (from ", " or a trailing separator) are dropped: an empty
    #alternative in the regex would match every sample.
    #if no sample IDs are provided, the regex will include all files in the S3 bucket/prefix.
    tokens = [token for token in re.split(',| ', samples or "") if token]
    regex = re.compile('|'.join(tokens))

    filtered = []
    unpaired = []
    for sample, fastqs in fqs_PE.items():
        if not regex.search(sample):
            continue
        if len(fastqs) < 2:
            #a lone read file cannot fill both R1 and R2
            unpaired.append(sample)
            continue
        filtered.append({"Sample": sample, "R1": fastqs[0], "R2": fastqs[1]})

    if unpaired:
        print("Skipping " + str(len(unpaired)) + " sample(s) without both R1 and R2: " + ", ".join(unpaired))

    #fixed columns keep the header present even when nothing matched
    sample_sheet = pd.DataFrame(filtered, columns=["Sample", "R1", "R2"])

    #Save the dataframe to file or return the dataframe
    if write:
        sample_sheet.to_csv(path_or_buf=filename,
                    sep="\t", header=True,
                    index=False,quoting=None)

        print("Finished writing " + str(len(sample_sheet.index)) + " records to file: " + filename)
    else:
        return(sample_sheet)
    
    

#Taken directly from https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-presigned-urls.html 
def create_presigned_url(bucket_name, object_name, expiration=3600):
    """Build a presigned GET URL for one S3 object.

    :param bucket_name: string, name of the bucket holding the object
    :param object_name: string, key of the object inside the bucket
    :param expiration: number of seconds the URL stays valid
    :return: the presigned URL as a string, or None when the client raises
        a ClientError (the error is logged).
    """
    request_params = {'Bucket': bucket_name, 'Key': object_name}
    client = boto3.client('s3')

    try:
        presigned_url = client.generate_presigned_url(
            'get_object',
            Params=request_params,
            ExpiresIn=expiration,
        )
    except ClientError as err:
        # Log and signal failure with None rather than propagating.
        logging.error(err)
        presigned_url = None

    return presigned_url

In [3]:
# Pick the project data directory from the name of the machine running the notebook.
# pathlib reference: https://realpython.com/python-pathlib/
host = socket.gethostname()

if re.search("MacBook", host):
    TARGET="/Users/work/fast_drive/workingDir/TARGET/AML_TARGET/"

elif re.search("gizmo|rhino", host):
    TARGET="/fh/fast/meshinchi_s/workingDir/TARGET/AML_TARGET/"

else:
    # Without this branch an unrecognised host only fails later with a NameError on TARGET.
    raise RuntimeError(
        "Unrecognised host '{0}': add its AML_TARGET directory to this cell.".format(host)
    )

TARGET=pathlib.Path(TARGET)
TARGET

PosixPath('/Users/work/fast_drive/workingDir/TARGET/AML_TARGET')

In [4]:
# Show the notebook's working directory (cwd is a classmethod, so no instance is needed).
pathlib.Path.cwd()

PosixPath('/Users/work/home_drive/scripts/STAR-fusion-NF/python_code')

# 1. Read in the Manifest file

In [5]:
# Load the ribodepleted master manifest from the SequencingDataMatrix folder under TARGET.
manifest_file = TARGET / "SequencingDataMatrix/TARGET_AML_Ribodepleted_Master_Manifest_8.5.20.csv"

manifest = pd.read_csv(manifest_file, engine='python')

# Preview the first rows, then the (rows, columns) size.
manifest.head()
manifest.shape

Unnamed: 0.1,Unnamed: 0,Sample,Final_Patient_ID,PATIENT_ID_Original,USI,Reg.,Protocol,AML_Subtype,Group,Batch,Library,Time_point,Tissue,Protocol2,Primary.Fusion.CNV,Additional.Fusions.CNV
0,1,Kasumi.AZA.D11.03A.01R,Kasumi-AZA-D11-03A-01R,Kasumi-AZA-D11-03A-01R,Kasumi,,CellLine,CellLine,CellLine,dx1,A75511,CellLine,CellLine,,,
1,2,Kasumi.AZA.D5.03A.01R,Kasumi-AZA-D5-03A-01R,Kasumi-AZA-D5-03A-01R,Kasumi,,CellLine,CellLine,CellLine,dx1,A75509,CellLine,CellLine,,,
2,3,Kasumi.D1.03A.01R,Kasumi-D1-03A-01R,Kasumi-D1-03A-01R,Kasumi,,CellLine,CellLine,CellLine,dx1,A75507,CellLine,CellLine,,,
3,4,MV4.11.AZA.D11.03A.01R,MV4-11-AZA-D11-03A-01R,MV4-11-AZA-D11-03A-01R,MV4,,CellLine,CellLine,CellLine,dx1,A75512,CellLine,CellLine,,,
4,5,MV4.11.AZA.D5.03A.01R,MV4-11-AZA-D5-03A-01R,MV4-11-AZA-D5-03A-01R,MV4,,CellLine,CellLine,CellLine,dx1,A75510,CellLine,CellLine,,,


(2346, 16)

In [6]:
# Number of manifest rows in each batch.
manifest["Batch"].value_counts()

dx1      1117
dx2       457
rlps2     302
ds1       228
rlps1     198
rlps3      36
rlps4       7
Name: Batch, dtype: int64

In [7]:
# Load the manifest that lists the polyA and RBD fastq files stored on S3.
polyA_RBD_file = TARGET / "SequencingDataMatrix/TARGET_AML_RBD_PolyA_RNAseq_AWS_S3_Fastq_Manifest_8.5.20.csv"
polyA_RBD_manifest = pd.read_csv(polyA_RBD_file)

# (rows, columns) of the loaded table.
polyA_RBD_manifest.shape

(2777, 16)

# 2. Subset for Required Samples

In [8]:
# Replace missing values with empty strings (original note: "Stella is an NA"),
# so the .str filters in later cells never see NaN.
manifest = manifest.fillna(value="")

In [9]:
# Keep the rows whose Batch label contains "ds".
ds_aml = manifest.loc[manifest["Batch"].str.contains("ds")]

ds_aml.shape  # 228 rows

(228, 16)

In [11]:
# & cbf_aml.Time_point.str.contains("diagnostic")
# Diagnostic samples of the two subtypes CBFB-MYH11 and RUNX1-RUNX1T1.
# na=False treats a missing subtype or time point as "no match"; without it a
# NaN in either column makes the boolean mask invalid and the indexing raises.
cbf_aml = polyA_RBD_manifest[
    polyA_RBD_manifest.AML_Subtype.str.contains("CBFB-MYH11|RUNX1-RUNX1T1", na=False)
    & polyA_RBD_manifest.Time_point.str.contains("diagnostic", na=False)
]

cbf_aml.head()
cbf_aml.shape
cbf_aml.AML_Subtype.value_counts()

Unnamed: 0,fastq_sample_filename,Patient_ID,Lib_Prep,Sample,Final_Patient_ID,USI,Reg.,Protocol,AML_Subtype,Group,Batch,Library,Time_point,Tissue,Primary.Fusion.CNV,Additional.Fusions.CNV
18,PAUHXN-03A-01R_withJunctionsOnGenome_dupsFlagged,PAUHXN-03A-01R,RBS,TARGET.20.PAUHXN.03A.01R,TARGET-20-PAUHXN-03A-01R,PAUHXN,814037.0,AAML1031,RUNX1-RUNX1T1,AML,dx1,A74007,diagnostic,peripheral_blood,RUNX1-RUNX1T1,Unknown
21,PAUJCF-09A-01R_withJunctionsOnGenome_dupsFlagged,PAUJCF-09A-01R,RBS,TARGET.20.PAUJCF.09A.01R,TARGET-20-PAUJCF-09A-01R,PAUJCF,814725.0,AAML1031,CBFB-MYH11,AML,dx1,A74010,diagnostic,bone_marrow,CBFB-MYH11,Unknown
22,PAUKDH-09A-01R_withJunctionsOnGenome_dupsFlagged,PAUKDH-09A-01R,RBS,TARGET.20.PAUKDH.09A.01R,TARGET-20-PAUKDH-09A-01R,PAUKDH,815328.0,AAML1031,RUNX1-RUNX1T1,AML,dx1,A74013,diagnostic,bone_marrow,RUNX1-RUNX1T1,Unknown
25,PAULDS-09A-01R_withJunctionsOnGenome_dupsFlagged,PAULDS-09A-01R,RBS,TARGET.20.PAULDS.09A.01R,TARGET-20-PAULDS-09A-01R,PAULDS,815913.0,AAML1031,RUNX1-RUNX1T1,AML,dx1,A74016,diagnostic,bone_marrow,RUNX1-RUNX1T1,Unknown
35,PAUNVK-09A-01R_withJunctionsOnGenome_dupsFlagged,PAUNVK-09A-01R,RBS,TARGET.20.PAUNVK.09A.01R,TARGET-20-PAUNVK-09A-01R,PAUNVK,817444.0,AAML1031,RUNX1-RUNX1T1,AML,dx1,A74042,diagnostic,bone_marrow,RUNX1-RUNX1T1,Unknown


(400, 16)

RUNX1-RUNX1T1    215
CBFB-MYH11       185
Name: AML_Subtype, dtype: int64

In [12]:
# Check the subset: counts per time point, group and library prep.
cbf_aml["Time_point"].value_counts()
cbf_aml["Group"].value_counts()
cbf_aml["Lib_Prep"].value_counts()

diagnostic    400
Name: Time_point, dtype: int64

AML    400
Name: Group, dtype: int64

RBS      283
PolyA    117
Name: Lib_Prep, dtype: int64

# 3. Run the Sample Sheet Script

In [13]:
# S3 bucket and key prefix that are passed to create_sample_sheet below.
bucket = "fh-pi-meshinchi-s"
prefix = "SR/picard_fq2/"

In [None]:
# Space-separated sample IDs for the "ds" batch; empty IDs are left out so
# they cannot turn into an empty pattern in the filter.
samples = " ".join(sample_id for sample_id in ds_aml.Sample.tolist() if sample_id)

# create_sample_sheet is the function defined in this notebook (the module
# import is commented out), so it is called directly rather than as
# create_sample_sheet.create_sample_sheet, which raises AttributeError.
create_sample_sheet(bucket_name=bucket, prefix_name=prefix, samples=samples, filename="test_sheet.txt")

In [None]:
# Build the sample sheet for the CBF subset in memory (write=False returns the DataFrame).
samples_cbf = " ".join(cbf_aml["fastq_sample_filename"].tolist())
forURLs = create_sample_sheet(
    bucket_name=bucket,
    prefix_name=prefix,
    samples=samples_cbf,
    write=False,
)
           


In [18]:
# Confirm the returned object type and preview the R1 URIs.
type(forURLs)
forURLs["R1"].head()

pandas.core.frame.DataFrame

0    s3://fh-pi-meshinchi-s/SR/picard_fq2/PAUHXN-03...
1    s3://fh-pi-meshinchi-s/SR/picard_fq2/PAUJCF-09...
2    s3://fh-pi-meshinchi-s/SR/picard_fq2/PAUKDH-09...
3    s3://fh-pi-meshinchi-s/SR/picard_fq2/PAULDS-09...
4    s3://fh-pi-meshinchi-s/SR/picard_fq2/PAUNVK-09...
Name: R1, dtype: object

In [40]:
# Take one R1 URI (the second row) and strip the "s3://<bucket>/" part to get its object key.
r1 = forURLs.R1.tolist()[1]
type(r1)

# Derive the offset from the actual prefix instead of a hard-coded number, and
# fail early if the URI does not belong to `bucket`.
s3_prefix = "s3://{0}/".format(bucket)
if not r1.startswith(s3_prefix):
    raise ValueError("Expected an URI starting with {0}, got: {1}".format(s3_prefix, r1))
start = len(s3_prefix)
end = len(r1)

expiration = 604800  # 7 days, in seconds
object_name = r1[start:end]
bucket
object_name
expiration

str

'fh-pi-meshinchi-s'

'SR/picard_fq2/PAUJCF-09A-01R_withJunctionsOnGenome_dupsFlagged_r1.fq.gz'

604800

In [42]:
# Generate a presigned URL for the S3 object with the create_presigned_url
# helper defined earlier in this notebook, instead of repeating its body here.
response = create_presigned_url(bucket, object_name, expiration=expiration)
if response is None:
    # The helper logs the ClientError and returns None; stop here so the failure is not missed.
    raise RuntimeError(
        "Could not generate a presigned URL for s3://{0}/{1}".format(bucket, object_name)
    )

# NOTE: the URL embeds the AWS access key ID and grants access until it expires;
# clear this cell's output before sharing or committing the notebook.
response

'https://fh-pi-meshinchi-s.s3.amazonaws.com/SR/picard_fq2/PAUJCF-09A-01R_withJunctionsOnGenome_dupsFlagged_r1.fq.gz?AWSAccessKeyId=AKIA3AANZ4DLPN42E365&Signature=Kzx%2Fyk4msdfzEuJfcs2iGFkRg48%3D&Expires=1601577904'