In [3]:
import pandas as pd
import os
import re
import gzip


In [29]:
def get_sequence_fasta(sequenceFile: str) -> pd.DataFrame:
    '''
    Read a FASTA file into a two-column DataFrame.

    Every non-header, non-empty line becomes one row, paired with the id
    taken from the most recent header line. A record whose sequence is
    wrapped over several lines therefore yields several rows.

    args: sequenceFile@str
          path to a plain-text fasta file
    returns: @pd.DataFrame with columns 'seq_id' and 'seq'
    raises: ValueError when a sequence line appears before any header line
    '''
    sequences = []
    sequenceId = None
    with open(sequenceFile, 'r') as File:
        for line in File:
            if line.startswith('>'):
                # id = the three ':'-separated fields that precede the last
                # field of the first space-delimited token, concatenated
                # without a separator.
                # NOTE(review): dropping the last field and joining with ""
                # may map distinct reads to the same id — confirm against the
                # header format in use.
                sequenceId = "".join(line.split(" ")[0].split(":")[-4:-1])
                continue
            sequenceLine = line.rstrip("\r\n")
            if not sequenceLine:
                # Empty lines (e.g. a trailing newline) carry no sequence.
                continue
            if sequenceId is None:
                raise ValueError(
                    "sequence line found before any '>' header in %s" % sequenceFile
                )
            sequences.append((sequenceId, sequenceLine))
    return pd.DataFrame(sequences, columns=['seq_id', 'seq'])


def reversed_sequence(sequence):
    '''
    Return the reverse complement of a nucleotide sequence.

    The input is upper-cased first; A<->T and G<->C are swapped and N stays N.
    Any other character raises KeyError.
    '''
    complement_of = dict(zip("ATGCN", "TACGN"))
    upper_cased = sequence.upper()
    return ''.join(complement_of[nucleotide] for nucleotide in reversed(upper_cased))


def fastq2fasta(fastqFile, reversedSeq=False):
    '''Extract read ids and sequences from a FASTQ file into a DataFrame.

    The file is consumed as consecutive 4-line records: line 0 of each record
    supplies the id, line 1 the sequence, and the remaining two lines are
    ignored. Gzip-compressed input is detected by its magic number; anything
    else is read as plain text.

    args:
        -fastqFile: path to the raw FASTQ file (plain text or gzip)
        -reversedSeq: when True, each sequence is passed through
                      reversed_sequence() (reverse complement, e.g. for R2);
                      when False, the sequence is returned as read
    returns: pd.DataFrame with columns 'seq_id' and 'seq'
    '''
    # A gzip stream always starts with the two bytes 0x1f 0x8b, so probing them
    # avoids a try/except around a trial read and never leaves a handle open.
    with open(fastqFile, 'rb') as probe:
        isGzip = probe.read(2) == b'\x1f\x8b'
    opener = gzip.open if isGzip else open

    out = []
    with opener(fastqFile, 'rt') as File:
        for index, line in enumerate(File):
            position = index % 4
            if position == 0:
                #! sequence Id
                # id = the three ':'-separated fields that precede the last
                # field of the first space-delimited token, concatenated.
                # NOTE(review): dropping the last field and joining with ""
                # may map distinct reads to the same id — confirm against the
                # header format in use.
                sequenceId = "".join(line.split(" ")[0].split(":")[-4:-1])
            elif position == 1:
                #! sequence line
                sequenceLine = line.strip("\n")
                if reversedSeq:
                    sequenceLine = reversed_sequence(sequenceLine)
                out.append((sequenceId, sequenceLine))
            #! other lines ('+' separator and quality string) are skipped
    return pd.DataFrame(out, columns=['seq_id', 'seq'])


In [30]:
# Load both mates of the paired-end test data; the reverse complement is
# requested for the R2 file only (reversedSeq=True).
a = fastq2fasta(fastqFile="./testData/test_R2.fq", reversedSeq=True)
b = fastq2fasta(fastqFile="./testData/test_R1.fq", reversedSeq=False)

In [34]:
# ---------------------------------------------------------------------------
# Merge sequences: join frame `a` with frame `b` on their shared 'seq_id'
# column (pandas' default inner join). The two clashing 'seq' columns come out
# as 'seq_x' (from a) and 'seq_y' (from b).
# ---------------------------------------------------------------------------
mergeSequence = pd.merge(a, b, on='seq_id')

In [37]:
# Layout the pattern accepts, anchored at the start of the string by .match():
#   optional A/T/C/G prefix, then a captured group made of 9 bases, a fixed
#   105-base sequence, arbitrary characters (.*), a second fixed sequence and
#   6 final bases; optional A/T/C/G characters may follow the group.
barcodePattern = re.compile(r'[ATCG]*([ATCG]{9}TATAAGCGAAAGAAGCATCAGATGGGCAAACAAAGCACCAGTGGTCTAGTGGTAGAATAGTACCCTGCCACGGTACAGACCCGGGTTCGATTCCCGGCTGGTGCA.*TAAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGTTTTTAGCGCGTGCATGCCTGCAGGTCCACAAATTCGGGTC[ATGC]{6})[ATCG]*')

# Row by row: keep the concatenated read (seq_x followed by seq_y) when it
# matches the pattern, otherwise store None.
containBarcodeSequence = mergeSequence.apply(
    lambda row: (
        row['seq_x'] + row['seq_y']
        if barcodePattern.match(row['seq_x'] + row['seq_y'])
        else None
    ),
    axis=1,
)


In [41]:
# Spot-check one entry of barcodeDict: show the value stored under the key
# 'R1-7~R2-2'.
# NOTE(review): barcodeDict is not defined in any cell visible here — confirm
# it is created before this cell when the notebook is run top to bottom.
barcodeDict['R1-7~R2-2']

'GACGTCACGACCATG'

In [38]:
# Pull the barcode and the sgRNA out of every merged read that matched the
# barcode pattern (entries that did not match are None and are skipped).
#   barcode = first 9 bases + first 6 bases of the read's reverse complement
#             (i.e. the reverse complement of its last 6 bases)
#   sgRNA   = the 20 bases at offsets 114..133
# NOTE(review): these fixed offsets assume the matched construct starts at the
# first base and ends at the last base of the read, although barcodePattern
# also tolerates flanking bases — confirm.
barcodesequence = []
for sequence in containBarcodeSequence:
    if sequence:
        barcode = sequence[0:9] + reversed_sequence(sequence)[0:6]
        sgRNA = sequence[114:134]
        barcodesequence.append((barcode, sgRNA))
################################
#!barcode and sgRNA sequence
################################
barcodeData = pd.DataFrame(barcodesequence, columns=['barcode', 'sgRNA'])

# Count sgRNAs per known barcode.
# The lookup cell in this notebook shows barcodeDict['R1-7~R2-2'] ==
# 'GACGTCACGACCATG', i.e. barcodeDict maps barcode ID -> barcode sequence, so
# the *values* are what must be compared with the extracted barcodes and the
# *keys* are the IDs to report.
outRows = []
for barcodeID, barcodeSeq in barcodeDict.items():
    sgRNAData = barcodeData.loc[barcodeData['barcode'] == barcodeSeq]
    if sgRNAData.empty:
        #! without sequence sgRNA
        continue
    #! count the sgRNA number
    totalCount = sgRNAData.shape[0]
    for sgRNAseq, sgRNAcount in sgRNAData['sgRNA'].value_counts().items():
        outRows.append((
            barcodeID,
            barcodeSeq,   # barcode sequence
            totalCount,   # number of reads carrying this barcode
            sgRNAseq,     # sgRNA sequence
            sgRNAcount,   # number of reads with this barcode and sgRNA
        ))

outData = pd.DataFrame(
    outRows,
    columns=['barcodeID', 'barcodesequence', 'barcodeCount', 'sgRNAsequence', 'sgRNACount'],
)
# No path is given, so to_csv() returns the CSV text (shown as the cell
# output) instead of writing a file.
outData.to_csv()





0    None
1    None
2    None
3    None
4    None
dtype: object

In [44]:
# Scratch check of what value_counts() returns: build a 3x2 toy frame and
# count the values in its column labelled 1.
# NOTE(review): this rebinds `a`, which an earlier cell uses for the R2 reads —
# confirm no later cell still expects that frame.
data = pd.DataFrame([[0, 0], [0, 1], [2, 1]])
a = data[1].value_counts()

In [48]:
# Show `a` as a plain dict; presumably `a` is still the value_counts() result
# from the preceding cell (value -> count) — verify the run order.
dict(a)

{1: 2, 0: 1}