In [5]:
# Screening HIV-1 sequences worldwide

import RNA

def DNA_reverse_complement_RNA(DNA):
    complement = {'A': 'U', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement.get(base, base) for base in reversed(DNA))

def single_base_permutation_3end_preserve_forward(DNA):
    permutation_set = []
    for i in range (1, len(DNA)-1):
        permutation = DNA[:i-1] + 'A' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'C' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'G' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'T' + DNA[i:] 
        permutation_set.append(permutation)
    return permutation_set

def single_base_permutation_3end_preserve_reverse(DNA):
    permutation_set = []
    for i in range (4, len(DNA)+1):
        permutation = DNA[:i-1] + 'A' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'C' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'G' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'T' + DNA[i:] 
        permutation_set.append(permutation)
    return permutation_set

def single_base_permutation_crRNA_target(DNA):
    permutation_set = []
    for i in range (1, len(DNA)+1):
        permutation = DNA[:i-1] + 'A' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'C' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'G' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'T' + DNA[i:] 
        permutation_set.append(permutation)
    return permutation_set

def single_base_insertion_3end_preserve_forward(DNA):
    insertion_set = []
    for i in range (1, len(DNA)-1):
        insertion = DNA[:i-1] + 'A' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'C' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'G' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'T' + DNA[i-1:]
        insertion_set.append(insertion)
    return insertion_set
        
def single_base_insertion_3end_preserve_reverse(DNA):
    insertion_set = []
    for i in range (4, len(DNA)+1):
        insertion = DNA[:i-1] + 'A' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'C' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'G' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'T' + DNA[i-1:]
        insertion_set.append(insertion)
    return insertion_set
        
def single_base_insertion_crRNA_target(DNA):
    insertion_set = []
    for i in range (1, len(DNA)+1):
        insertion = DNA[:i-1] + 'A' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'C' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'G' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'T' + DNA[i-1:]
        insertion_set.append(insertion)
    return insertion_set
    
def single_base_deletion_3end_preserve_forward(DNA):
    deletion_set = []
    for i in range (1, len(DNA)-2):
        deletion = DNA[:i-1] + DNA[i:]
    return deletion_set
    
def single_base_deletion_3end_preserve_reverse(DNA):
    deletion_set = []
    for i in range (4, len(DNA)):
        deletion = DNA[:i-1] + DNA[i:]
    return deletion_set
    
def single_base_deletion_crRNA_target(DNA):
    deletion_set = []
    for i in range (1, len(DNA)):
        deletion = DNA[:i-1] + DNA[i:]
    return deletion_set
    

# Main function        
ref_amplicon = 'AATGAGGAAGCTGCAGAATGGGATAGATTGCATCCCGTGCAGGCAGGGCCTGTTGCACCAGGCCAGATAAGA'
ref_amplicon_corrected = 'AATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGG'
forward = 'AAGCAGCCATGCAAATGTTAAAAGAAACCATC'
reverse = 'GATCCAAGGGGAAGTGACATAGCAGGAACTAC'
forward_corrected = 'AAGCAGCCATGCAAATGTTAAAAGAGACCATC'
reverse_corrected = 'GAACCAAGGGGAAGTGACATAGCAGGAACTAC'
As_repeat = 'UAAUUUCUACUCUUGUAGAU'
Lb_repeat = 'UAAUUUCUACUAAGUGUAGAU'
print('forward primer length: ' + str(len(forward_corrected)))
print('reverse primer length: ' + str(len(reverse_corrected)))

permutation_set_forward = single_base_permutation_3end_preserve_forward(forward_corrected)
permutation_set_reverse = single_base_permutation_3end_preserve_reverse(reverse_corrected)
insertion_set_forward = single_base_insertion_3end_preserve_forward(forward_corrected)
insertion_set_reverse = single_base_insertion_3end_preserve_reverse(reverse_corrected)
deletion_set_forward = single_base_deletion_3end_preserve_forward(forward_corrected)
deletion_set_reverse = single_base_deletion_3end_preserve_reverse(reverse_corrected)

file = open('sequences-HIV-1-worldwide.fasta', 'r')
lines = file.readlines()
whole_seq = []
for line in lines:
    if line[0] == '>':
        whole_seq.append('\n')
        whole_seq.append(line)
        continue
    else:
        line = line.strip()
        whole_seq.append(line)
    
whole_seq = ''.join(whole_seq)
whole_seq = whole_seq[1:]

whole_seq = whole_seq.splitlines()
total_seq_num = int(len(whole_seq)/2)
print('Total sequence number in database: ' + str(total_seq_num))

# primer screen, the amplicon sequences with matched primers will be extracted for further processing

amplicon_set = []
primer_match_num = 0

for i in range (0, len(whole_seq)):
    if i%2 == 1:
        start = -1
        end = -1
        for j in range (0, len(whole_seq[i])-32+1):
            if whole_seq[i][j:j+32] in permutation_set_forward:
                start = j+32
                break
        if start == -1:
            for j in range (0, len(whole_seq[i])-33+1):
                if whole_seq[i][j:j+33] in insertion_set_forward:
                    start = j+33
                    break
            if start == -1:
                for j in range (0, len(whole_seq[i])-31+1):
                    if whole_seq[i][j:j+31] in deletion_set_forward:
                        start = j+31
                        break

        for j in range (0, len(whole_seq[i])-32+1):
            if whole_seq[i][j:j+32] in permutation_set_reverse:
                end = j
                break
        if start == -1:
            for j in range (0, len(whole_seq[i])-33+1):
                if whole_seq[i][j:j+33] in insertion_set_reverse:
                    end = j
                    break
            if start == -1:
                for j in range (0, len(whole_seq[i])-31+1):
                    if whole_seq[i][j:j+31] in deletion_set_reverse:
                        end = j
                        break

        if start != -1 and end != -1:
            primer_match_num += 1
            insert = whole_seq[i][start:end+11]
            amplicon_set.append(insert)



print('Matched primer number: ' + str(primer_match_num))        
ratio_1 = primer_match_num/total_seq_num
print('primer match ratio: ' + str(ratio_1))
  
print('----------------------------------------')    
    
# crRNA target screening
for s in range (0, len(ref_amplicon_corrected)-20+1):
    ref_target = ref_amplicon_corrected[s:s+20]

    permutation_set_ref_target = single_base_permutation_crRNA_target(ref_target)
    insertion_set_ref_target = single_base_insertion_crRNA_target(ref_target)
    deletion_set_ref_target = single_base_deletion_crRNA_target(ref_target)

    target_match_num = 0
    '''
    print('Target sequence: ' + str(ref_target))
    '''
    for i in range (0, len(amplicon_set)):
        error_flag = 1
        for k in range (0, len(amplicon_set[i])-20+1):
            if amplicon_set[i][k:k+20] == ref_target:
                error_flag = -1
                break
                        
        if error_flag == -1:
            target_match_num += 1

    '''
    print('Matched target number: ' + str(target_match_num))
    '''
    ratio_2 = target_match_num/primer_match_num
    '''
    print('target match ratio: ' + str(ratio_2))
    '''
    print(ratio_2)
    if ratio_2 >= 0:
        As_crRNA = As_repeat + DNA_reverse_complement_RNA(ref_target)
        Lb_crRNA = Lb_repeat + DNA_reverse_complement_RNA(ref_target)
        
        # Set global switch for unique ML decomposition
        RNA.cvar.uniq_ML = 1

        # create new fold_compound object
        As_fc = RNA.fold_compound(As_crRNA)
        Lb_fc = RNA.fold_compound(Lb_crRNA)

        # compute minimum free energy (mfe) and corresponding structure
        (As_ss, As_mfe) = As_fc.mfe()
        (Lb_ss, Lb_mfe) = Lb_fc.mfe()
        (As_bp_propensity, As_dG) = As_fc.pf() 
        (Lb_bp_propensity, Lb_dG) = Lb_fc.pf() 
        As_dG_crRNA = As_dG + 4.80
        Lb_dG_crRNA = Lb_dG + 4.70
        
        # This following part displays the energy properties of crRNA
        '''
        print('As crRNA sequence: ' + str(As_crRNA))
        print('MFE strcture of As crRNA: ' + str(As_ss))
        print('Ensemble energy of As crRNA: ' + str(As_dG_crRNA))
        print('------------------------')
        print('Lb crRNA sequence: ' + str(Lb_crRNA))
        print('MFE strcture of Lb crRNA: ' + str(Lb_ss))
        print('Ensemble energy of Lb crRNA: ' + str(Lb_dG_crRNA))
        '''


forward primer length: 32
reverse primer length: 32
Total sequence number in database: 1061945
Matched primer number: 43320
primer match ratio: 0.04079307308758928
----------------------------------------
0.5600877192982456
0.5600184672206833
0.5595106186518929
0.5615650969529086
0.5372576177285319
0.5370729455216989
0.5720914127423823
0.5060941828254848
0.09078947368421053
0.12220683287165282
0.09958448753462604
0.09965373961218836
0.10228531855955679
0.09524469067405356
0.09524469067405356
0.09545244690674054
0.09489843028624192
0.0948522622345337
0.09475992613111726
0.08111726685133887
0.08434903047091413
0.08524930747922438
0.0603185595567867
0.06034164358264081
0.06200369344413666
0.06184210526315789
0.06184210526315789
0.06348107109879964
0.33871191135734074
0.3411588180978763
0.46712834718374885
0.439196675900277
0.30814866112650047
0.3203139427516159
0.30964912280701756
0.3090720221606648
0.3124653739612188
0.31119575253924286
0.31121883656509697
0.4361034164358264
0.4324330563

In [4]:
# Screening HIV-1 sequences in North America only

import RNA

def DNA_reverse_complement_RNA(DNA):
    complement = {'A': 'U', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement.get(base, base) for base in reversed(DNA))

def single_base_permutation_3end_preserve_forward(DNA):
    permutation_set = []
    for i in range (1, len(DNA)-1):
        permutation = DNA[:i-1] + 'A' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'C' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'G' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'T' + DNA[i:] 
        permutation_set.append(permutation)
    return permutation_set

def single_base_permutation_3end_preserve_reverse(DNA):
    permutation_set = []
    for i in range (4, len(DNA)+1):
        permutation = DNA[:i-1] + 'A' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'C' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'G' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'T' + DNA[i:] 
        permutation_set.append(permutation)
    return permutation_set

def single_base_permutation_crRNA_target(DNA):
    permutation_set = []
    for i in range (1, len(DNA)+1):
        permutation = DNA[:i-1] + 'A' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'C' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'G' + DNA[i:] 
        permutation_set.append(permutation)
        permutation = DNA[:i-1] + 'T' + DNA[i:] 
        permutation_set.append(permutation)
    return permutation_set

def single_base_insertion_3end_preserve_forward(DNA):
    insertion_set = []
    for i in range (1, len(DNA)-1):
        insertion = DNA[:i-1] + 'A' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'C' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'G' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'T' + DNA[i-1:]
        insertion_set.append(insertion)
    return insertion_set
        
def single_base_insertion_3end_preserve_reverse(DNA):
    insertion_set = []
    for i in range (4, len(DNA)+1):
        insertion = DNA[:i-1] + 'A' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'C' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'G' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'T' + DNA[i-1:]
        insertion_set.append(insertion)
    return insertion_set
        
def single_base_insertion_crRNA_target(DNA):
    insertion_set = []
    for i in range (1, len(DNA)+1):
        insertion = DNA[:i-1] + 'A' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'C' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'G' + DNA[i-1:]
        insertion_set.append(insertion)
        insertion = DNA[:i-1] + 'T' + DNA[i-1:]
        insertion_set.append(insertion)
    return insertion_set
    
def single_base_deletion_3end_preserve_forward(DNA):
    deletion_set = []
    for i in range (1, len(DNA)-2):
        deletion = DNA[:i-1] + DNA[i:]
    return deletion_set
    
def single_base_deletion_3end_preserve_reverse(DNA):
    deletion_set = []
    for i in range (4, len(DNA)):
        deletion = DNA[:i-1] + DNA[i:]
    return deletion_set
    
def single_base_deletion_crRNA_target(DNA):
    deletion_set = []
    for i in range (1, len(DNA)):
        deletion = DNA[:i-1] + DNA[i:]
    return deletion_set
    

# Main function        
ref_amplicon = 'AATGAGGAAGCTGCAGAATGGGATAGATTGCATCCCGTGCAGGCAGGGCCTGTTGCACCAGGCCAGATAAGA'
ref_amplicon_corrected = 'AATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGG'
forward = 'AAGCAGCCATGCAAATGTTAAAAGAAACCATC'
reverse = 'GATCCAAGGGGAAGTGACATAGCAGGAACTAC'
forward_corrected = 'AAGCAGCCATGCAAATGTTAAAAGAGACCATC'
reverse_corrected = 'GAACCAAGGGGAAGTGACATAGCAGGAACTAC'
As_repeat = 'UAAUUUCUACUCUUGUAGAU'
Lb_repeat = 'UAAUUUCUACUAAGUGUAGAU'
print('forward primer length: ' + str(len(forward_corrected)))
print('reverse primer length: ' + str(len(reverse_corrected)))

permutation_set_forward = single_base_permutation_3end_preserve_forward(forward_corrected)
permutation_set_reverse = single_base_permutation_3end_preserve_reverse(reverse_corrected)
insertion_set_forward = single_base_insertion_3end_preserve_forward(forward_corrected)
insertion_set_reverse = single_base_insertion_3end_preserve_reverse(reverse_corrected)
deletion_set_forward = single_base_deletion_3end_preserve_forward(forward_corrected)
deletion_set_reverse = single_base_deletion_3end_preserve_reverse(reverse_corrected)

file = open('sequences-HIV-1-North America.fasta', 'r')
lines = file.readlines()
whole_seq = []
for line in lines:
    if line[0] == '>':
        whole_seq.append('\n')
        whole_seq.append(line)
        continue
    else:
        line = line.strip()
        whole_seq.append(line)
    
whole_seq = ''.join(whole_seq)
whole_seq = whole_seq[1:]

whole_seq = whole_seq.splitlines()
total_seq_num = int(len(whole_seq)/2)
print('Total sequence number in database: ' + str(total_seq_num))

# primer screen, the amplicon sequences with matched primers will be extracted for further processing

amplicon_set = []
primer_match_num = 0

for i in range (0, len(whole_seq)):
    if i%2 == 1:
        start = -1
        end = -1
        for j in range (0, len(whole_seq[i])-32+1):
            if whole_seq[i][j:j+32] in permutation_set_forward:
                start = j+32
                break
        if start == -1:
            for j in range (0, len(whole_seq[i])-33+1):
                if whole_seq[i][j:j+33] in insertion_set_forward:
                    start = j+33
                    break
            if start == -1:
                for j in range (0, len(whole_seq[i])-31+1):
                    if whole_seq[i][j:j+31] in deletion_set_forward:
                        start = j+31
                        break

        for j in range (0, len(whole_seq[i])-32+1):
            if whole_seq[i][j:j+32] in permutation_set_reverse:
                end = j
                break
        if start == -1:
            for j in range (0, len(whole_seq[i])-33+1):
                if whole_seq[i][j:j+33] in insertion_set_reverse:
                    end = j
                    break
            if start == -1:
                for j in range (0, len(whole_seq[i])-31+1):
                    if whole_seq[i][j:j+31] in deletion_set_reverse:
                        end = j
                        break

        if start != -1 and end != -1:
            primer_match_num += 1
            insert = whole_seq[i][start:end+11]
            amplicon_set.append(insert)



print('Matched primer number: ' + str(primer_match_num))        
ratio_1 = primer_match_num/total_seq_num
print('primer match ratio: ' + str(ratio_1))
  
print('----------------------------------------')    
    
# crRNA target screening
for s in range (0, len(ref_amplicon_corrected)-20+1):
    ref_target = ref_amplicon_corrected[s:s+20]

    permutation_set_ref_target = single_base_permutation_crRNA_target(ref_target)
    insertion_set_ref_target = single_base_insertion_crRNA_target(ref_target)
    deletion_set_ref_target = single_base_deletion_crRNA_target(ref_target)

    target_match_num = 0
    '''
    print('Target sequence: ' + str(ref_target))
    '''
    for i in range (0, len(amplicon_set)):
        error_flag = 1
        for k in range (0, len(amplicon_set[i])-20+1):
            if amplicon_set[i][k:k+20] == ref_target:
                error_flag = -1
                break
                        
        if error_flag == -1:
            target_match_num += 1

    '''
    print('Matched target number: ' + str(target_match_num))
    '''
    ratio_2 = target_match_num/primer_match_num
    '''
    print('target match ratio: ' + str(ratio_2))
    '''
    print(ratio_2)
    if ratio_2 >= 0:
        As_crRNA = As_repeat + DNA_reverse_complement_RNA(ref_target)
        Lb_crRNA = Lb_repeat + DNA_reverse_complement_RNA(ref_target)
        
        # Set global switch for unique ML decomposition
        RNA.cvar.uniq_ML = 1

        # create new fold_compound object
        As_fc = RNA.fold_compound(As_crRNA)
        Lb_fc = RNA.fold_compound(Lb_crRNA)

        # compute minimum free energy (mfe) and corresponding structure
        (As_ss, As_mfe) = As_fc.mfe()
        (Lb_ss, Lb_mfe) = Lb_fc.mfe()
        (As_bp_propensity, As_dG) = As_fc.pf() 
        (Lb_bp_propensity, Lb_dG) = Lb_fc.pf() 
        As_dG_crRNA = As_dG + 4.80
        Lb_dG_crRNA = Lb_dG + 4.70
        
        # This following part displays the energy properties of crRNA
        '''
        print('As crRNA sequence: ' + str(As_crRNA))
        print('MFE strcture of As crRNA: ' + str(As_ss))
        print('Ensemble energy of As crRNA: ' + str(As_dG_crRNA))
        print('------------------------')
        print('Lb crRNA sequence: ' + str(Lb_crRNA))
        print('MFE strcture of Lb crRNA: ' + str(Lb_ss))
        print('Ensemble energy of Lb crRNA: ' + str(Lb_dG_crRNA))
        '''


forward primer length: 32
reverse primer length: 32
Total sequence number in database: 301407
Matched primer number: 23290
primer match ratio: 0.07727093265916186
----------------------------------------
0.6598540145985401
0.6594675826534994
0.6592958351223701
0.6623014169171318
0.638643194504079
0.6383426363246029
0.6806784027479605
0.6670674109059682
0.11021897810218978
0.15294117647058825
0.13035637612709317
0.13035637612709317
0.13520824388149422
0.12954057535422928
0.12954057535422928
0.12971232288535853
0.12889652211249464
0.12889652211249464
0.1283812795191069
0.10961786174323744
0.1133104336625161
0.11382567625590383
0.06852726492056677
0.06857020180334908
0.07136109918419922
0.07136109918419922
0.07140403606698154
0.07256333190210391
0.4218548733361958
0.4249463288965221
0.49192786603692573
0.4837269214255045
0.36496350364963503
0.398196650923143
0.3845427221983684
0.38432803778445684
0.3875912408759124
0.38724774581365395
0.38729068269643624
0.5346930012881065
0.5296264491197