In [1]:
import pandas as pd
import re

In [2]:
test_file = 'test3.table'
control_file = 'control_albicans.table'

df_test = pd.read_csv(test_file, sep='\t')
df_control = pd.read_csv(control_file, sep='\t')

df_test = df_test.rename(columns={"NS.2209.003.UDP0161_i7---UDP0161_i5.fRS1_.GT": "sample1.GT", "NS.2209.003.UDP0163_i7---UDP0163_i5.fRS757_.GT": "sample2.GT", "NS.2209.003.UDP0167_i7---UDP0167_i5.A12_.GT" : "sample3.GT"})
df_control = df_control.rename(columns={"NS.2209.003.UDP0161_i7---UDP0161_i5.fRS1_.GT": "sample1.GT", "NS.2209.003.UDP0163_i7---UDP0163_i5.fRS757_.GT": "sample2.GT", "NS.2209.003.UDP0167_i7---UDP0167_i5.A12_.GT" : "sample3.GT"})

df_test.columns

Index(['CHROM', 'POS', 'REF', 'ALT', 'AC', 'AF', 'AN', 'sample1.GT',
       'sample2.GT', 'sample3.GT'],
      dtype='object')

In [3]:
# Only keep albicans in df_test

df_test_filtered = df_test[df_test['CHROM'] == 'Candidaalbicans']

df_test_filtered

Unnamed: 0,CHROM,POS,REF,ALT,AC,AF,AN,sample1.GT,sample2.GT,sample3.GT
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.
2,Candidaalbicans,148,T,C,2,0.500,4,T/C,T/C,./.
3,Candidaalbicans,182,CTA,C,1,0.250,4,CTA/CTA,CTA/C,./.
4,Candidaalbicans,186,G,GGT,1,0.250,4,G/G,G/GGT,./.
5,Candidaalbicans,478,G,A,2,0.500,4,G/A,G/A,./.
...,...,...,...,...,...,...,...,...,...,...
161,Candidaalbicans,51443,TGG,T,2,0.333,6,TGG/TGG,TGG/TGG,T/T
162,Candidaalbicans,51448,G,GAA,2,0.333,6,G/G,G/G,GAA/GAA
163,Candidaalbicans,51460,T,C,2,0.333,6,T/T,T/T,C/C
164,Candidaalbicans,51463,T,C,2,0.333,6,T/T,T/T,C/C


In [4]:
# Merge on REF and ALT

df_merged = df_test_filtered.merge(df_control, how='left', on=['REF', 'ALT', 'sample1.GT', 'sample2.GT'], suffixes=('_test', '_control'))

# df_merged = df_merged[(df_merged['sample1.GT'] != './.' & df_merged['sample2.GT'] != './.')]
#df_merged = df_merged.loc[(df_merged['sample1.GT'] != './.') & (df_merged['sample2.GT'] != './.')]
df_merged #[0:15]

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,CHROM_control,POS_control,AC_control,AF_control,AN_control,sample3.GT_control
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4031.0,2,0.500,4.0,./.
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4042.0,2,0.500,4.0,./.
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4044.0,2,0.500,4.0,./.
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,6656.0,2,0.500,4.0,./.
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,8174.0,2,0.500,4.0,./.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,348856.0,2,0.333,6.0,T/T
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946773.0,2,0.333,6.0,T/T
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946782.0,2,0.333,6.0,T/T
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946812.0,2,0.333,6.0,T/T


In [5]:
# Get gene spans from synthetic genome only albicans ?

synth_gen = '../synth_genome/v4_synthetic_genome.fa'

with open(synth_gen) as genome:
    genome_content = genome.read()

# Removes first line puts a space
#dna = re.sub(">.*", "", genome_content)
genome_string = re.sub("\n", "", genome_content)
albicans_only = re.findall(r">Candidaalbicans\w*", genome_string, flags=re.MULTILINE)[0]

print(len(albicans_only))
print(albicans_only[0:100])

52547
>CandidaalbicansTTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGTTCAGTAGGTAAAACCACCATTGAACTATAATCAGGGTC


In [6]:
# remove header
albicans_only_sequences = re.sub(">Candidaalbicans", "", albicans_only)

print(albicans_only_sequences[0:100])

TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGTTCAGTAGGTAAAACCACCATTGAACTATAATCAGGGTCAGGCACTTTATAACCA


In [7]:
# Gene ids
gene_id_list = ['C5_00660C_A', 'C1_04770C_A', 'C1_02420C_A', 'CR_00850C_A', 'C5_03390C_A',
             'C6_00620W_A', 'C3_05920W_A', 'C3_05220W_A', 'C5_01840C_A', 'C1_08460C_A',
             'C3_04890W_A', 'C3_07860C_A', 'C6_03170C_A', 'C1_08590C_A', 'C3_02220W_A',
             'C1_00800C_A', 'C1_03780C_A', 'C3_06850W_A', 'CR_08780W_A', 'CR_08800W_A',
             'C1_00710C_A', "end"]

In [8]:
pattern = re.compile("[^N]*N*")
matches = pattern.finditer(albicans_only_sequences)

span_list = []
match_list = []

for i, match in enumerate(matches):
    gene = gene_id_list[i]
    span_list.append(match.span())
    #print(f'span {gene} : ', match.span())
    match_list.append(match.group())
    #print(f'match {gene} : ', match.group())

df_albicans_genes = pd.DataFrame()
df_albicans_genes['Gene ID'] = gene_id_list
df_albicans_genes['synth genome positions'] = span_list
df_albicans_genes['match seq'] = match_list

df_albicans_genes

Unnamed: 0,Gene ID,synth genome positions,match seq
0,C5_00660C_A,"(0, 1887)",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,C1_04770C_A,"(1887, 3348)",TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...
2,C1_02420C_A,"(3348, 9342)",TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...
3,CR_00850C_A,"(9342, 14565)",CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...
4,C5_03390C_A,"(14565, 15522)",TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...
5,C6_00620W_A,"(15522, 16345)",ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...
6,C3_05920W_A,"(16345, 19972)",ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...
7,C3_05220W_A,"(19972, 24778)",ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...
8,C5_01840C_A,"(24778, 28024)",TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...
9,C1_08460C_A,"(28024, 30463)",CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...


In [9]:
from Bio.Seq import Seq

In [10]:
def extract_exon_sequences(dna_sequence, exon_positions):
    gene_length = len(dna_sequence)
    exon_ranges = re.findall(r'(\d+)\.\.(\d+)', exon_positions)
    exon_sequences = []
    ## ignore this if all exon start at position 1
    # Calculate the adjustment value based on the start of the first exon
    if int(exon_ranges[-1][1]) > gene_length:
        first_start = int(exon_ranges[0][0])
        adjustment_value = first_start -1
    else:
        adjustment_value = 0
    for start, end in exon_ranges:
        start, end = int(start), int(end)
        adjusted_start = start - adjustment_value
        adjusted_end = end - adjustment_value
        exon_sequence = dna_sequence[adjusted_start -1:adjusted_end-1]
        exon_sequences.append(exon_sequence)
    return ''.join(exon_sequences)

In [11]:
def convert_position(exon_positions, position):
    exon_ranges = re.findall(r'(\d+)\.\.(\d+)', exon_positions)
    introns = []
    # Add an intron before the first exon
    first_exon_start = int(exon_ranges[0][0])
    if first_exon_start > 1:
        introns.append((1, first_exon_start - 1))
    # Iterate through consecutive exons to infer introns
    for i in range(1, len(exon_ranges)):
        previous_exon_end = int(exon_ranges[i - 1][1])
        current_exon_start = int(exon_ranges[i][0])
        # Infer intron between consecutive exons
        intron_start = previous_exon_end + 1
        intron_end = current_exon_start - 1
        # Add inferred intron to the list
        introns.append((intron_start, intron_end))
    adjusted_position = position
    for intron_start, intron_end in introns:
        if intron_end <= adjusted_position:
            # Adjust for intron
            adjusted_position -= (intron_end - intron_start + 1)
    protein_position = (adjusted_position - 1) // 3 + 1
    return protein_position

In [12]:
gene_seq_list = []

for match in match_list:
    gene_seq = re.sub("N", "", match)
    gene_seq_list.append(gene_seq)

df_albicans_genes['dna seq'] = gene_seq_list

df_albicans_genes

Unnamed: 0,Gene ID,synth genome positions,match seq,dna seq
0,C5_00660C_A,"(0, 1887)",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,C1_04770C_A,"(1887, 3348)",TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...,TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...
2,C1_02420C_A,"(3348, 9342)",TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...,TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...
3,CR_00850C_A,"(9342, 14565)",CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...,CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...
4,C5_03390C_A,"(14565, 15522)",TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...,TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...
5,C6_00620W_A,"(15522, 16345)",ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...,ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...
6,C3_05920W_A,"(16345, 19972)",ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...,ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...
7,C3_05220W_A,"(19972, 24778)",ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...,ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...
8,C5_01840C_A,"(24778, 28024)",TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...,TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...
9,C1_08460C_A,"(28024, 30463)",CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...,CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...


In [13]:
### get exons

def getExons(geneID):
    
    gff_file = '../synth_genome/gff_files/C_albicans_SC5314_version_A22-s07-m01-r195.gff'
    
    with open(gff_file) as gff:
        gff_content = gff.read()
    
    # gene range
    y = re.findall(f"^.*\t.*\tgene\t.*?{geneID}", gff_content, flags=re.MULTILINE)
    coordinates = re.findall("[^\\t]+", y[0])
    #print(coordinates)
    gene_begin = int(coordinates[3]) -1 # Minus 1 to get to base 0 
    gene_end = int(coordinates[4]) # Here we do not substract because we want to include the last nucleotide.
    gene_range = gene_end - gene_begin
    #print(gene_range)
    
    # exons ranges
    w = re.findall(f"^.*\t.*\texon\t.*?{geneID}", gff_content, flags=re.MULTILINE)
    exon_relCoord_list = []
    for index, j in enumerate(w):
        exon_coord = re.findall("[^\\t]+", w[index])
        exon_begin = int(exon_coord[3]) - 1 - gene_begin
        exon_end = int(exon_coord[4]) - gene_begin
        exon_relCoord_list.append((exon_begin, exon_end))
    return exon_relCoord_list

#testing
#getExons(df_albicans_genes.iloc[5]['Gene ID'])
#print(len(df_albicans_genes.iloc[5]['dna seq']))

df_albicans_genes['exon coords'] = df_albicans_genes.apply(lambda row: getExons(row['Gene ID']), axis=1)

df_albicans_genes


Unnamed: 0,Gene ID,synth genome positions,match seq,dna seq,exon coords
0,C5_00660C_A,"(0, 1887)",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]"
1,C1_04770C_A,"(1887, 3348)",TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...,TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...,"[(0, 1161)]"
2,C1_02420C_A,"(3348, 9342)",TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...,TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...,"[(0, 5694)]"
3,CR_00850C_A,"(9342, 14565)",CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...,CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...,"[(0, 4923)]"
4,C5_03390C_A,"(14565, 15522)",TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...,TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...,"[(0, 657)]"
5,C6_00620W_A,"(15522, 16345)",ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...,ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...,"[(0, 55), (125, 523)]"
6,C3_05920W_A,"(16345, 19972)",ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...,ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...,"[(0, 3327)]"
7,C3_05220W_A,"(19972, 24778)",ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...,ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...,"[(0, 4506)]"
8,C5_01840C_A,"(24778, 28024)",TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...,TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...,"[(0, 2946)]"
9,C1_08460C_A,"(28024, 30463)",CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...,CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...,"[(0, 2139)]"


In [14]:
def getExonSeq(Exon_coord, dna_seq):
    Exon_seqs_list = []
    for i in range(len(Exon_coord)):
        Exon_seqs_list.append(dna_seq[Exon_coord[0][0]:Exon_coord[0][1]])
    #print(Exon_seqs_list)
    s = ""
    return s.join(Exon_seqs_list)

#getExonSeq(df_albicans_genes.iloc[5]['exon coords'], df_albicans_genes.iloc[5]['dna seq'])

df_albicans_genes['exon seq'] = df_albicans_genes.apply(lambda row: getExonSeq(row['exon coords'], row['dna seq']), axis=1)

df_albicans_genes

Unnamed: 0,Gene ID,synth genome positions,match seq,dna seq,exon coords,exon seq
0,C5_00660C_A,"(0, 1887)",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,C1_04770C_A,"(1887, 3348)",TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...,TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...,"[(0, 1161)]",TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...
2,C1_02420C_A,"(3348, 9342)",TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...,TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...,"[(0, 5694)]",TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...
3,CR_00850C_A,"(9342, 14565)",CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...,CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...,"[(0, 4923)]",CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...
4,C5_03390C_A,"(14565, 15522)",TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...,TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...,"[(0, 657)]",TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...
5,C6_00620W_A,"(15522, 16345)",ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...,ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...,"[(0, 55), (125, 523)]",ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...
6,C3_05920W_A,"(16345, 19972)",ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...,ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...,"[(0, 3327)]",ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...
7,C3_05220W_A,"(19972, 24778)",ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...,ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...,"[(0, 4506)]",ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...
8,C5_01840C_A,"(24778, 28024)",TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...,TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...,"[(0, 2946)]",TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...
9,C1_08460C_A,"(28024, 30463)",CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...,CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...,"[(0, 2139)]",CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...


In [15]:
def getSynthLandmark(mutation_pos):
    for i in range(0,21):
        if ((int(mutation_pos) >= int(df_albicans_genes.iloc[i]['synth genome positions'][0])) & (int(mutation_pos) <= int(df_albicans_genes.iloc[i]['synth genome positions'][1]))):
            #print(df_albicans_genes.iloc[i]['synth genome positions'][0], df_albicans_genes.iloc[i]['synth genome positions'][1])
            return df_albicans_genes.iloc[i]['synth genome positions'][0]

In [16]:
df_merged['synth landmark'] = df_merged.apply(lambda row: getSynthLandmark(row['POS_test']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,CHROM_control,POS_control,AC_control,AF_control,AN_control,sample3.GT_control,synth landmark
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4031.0,2,0.500,4.0,./.,0
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4042.0,2,0.500,4.0,./.,0
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4044.0,2,0.500,4.0,./.,0
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,6656.0,2,0.500,4.0,./.,0
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,8174.0,2,0.500,4.0,./.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,348856.0,2,0.333,6.0,T/T,50502
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946773.0,2,0.333,6.0,T/T,50502
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946782.0,2,0.333,6.0,T/T,50502
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946812.0,2,0.333,6.0,T/T,50502


In [17]:
def getMutationPosition(pos, landmark):
    return (pos - landmark - 1) # base 0 strings

In [18]:
df_merged['Mutation pos in dna seq'] = df_merged.apply(lambda row: getMutationPosition(row['POS_test'], row['synth landmark']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,CHROM_control,POS_control,AC_control,AF_control,AN_control,sample3.GT_control,synth landmark,Mutation pos in dna seq
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4031.0,2,0.500,4.0,./.,0,117
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4042.0,2,0.500,4.0,./.,0,117
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4044.0,2,0.500,4.0,./.,0,117
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,6656.0,2,0.500,4.0,./.,0,117
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,8174.0,2,0.500,4.0,./.,0,117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,348856.0,2,0.333,6.0,T/T,50502,972
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946773.0,2,0.333,6.0,T/T,50502,972
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946782.0,2,0.333,6.0,T/T,50502,972
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946812.0,2,0.333,6.0,T/T,50502,972


In [19]:
def getDNAseq(landmark):
    for i in range(0,21):
        if landmark == df_albicans_genes.iloc[i]['synth genome positions'][0]:
            return df_albicans_genes.iloc[i]['dna seq']

In [21]:
df_merged['dna seq'] = df_merged.apply(lambda row: getDNAseq(row['synth landmark']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,CHROM_control,POS_control,AC_control,AF_control,AN_control,sample3.GT_control,synth landmark,Mutation pos in dna seq,dna seq
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4031.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4042.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4044.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,6656.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,8174.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,348856.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946773.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946782.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946812.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...


In [22]:
def getExonseqForMergedDF(landmark):
    for i in range(0,21):
        if landmark == df_albicans_genes.iloc[i]['synth genome positions'][0]:
            return df_albicans_genes.iloc[i]['exon seq']

In [23]:
df_merged['exon seq'] = df_merged.apply(lambda row: getExonseqForMergedDF(row['synth landmark']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,CHROM_control,POS_control,AC_control,AF_control,AN_control,sample3.GT_control,synth landmark,Mutation pos in dna seq,dna seq,exon seq
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4031.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4042.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4044.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,6656.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,8174.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,348856.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946773.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946782.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946812.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...


In [24]:
def mutate_dna_sequence(dna_sequence, position, alt_aa):
    # Create a Biopython Seq object
    seq = Seq(dna_sequence)
    # Mutate the sequence at the specified position
    #mutated_seq = seq[:position-1] + alt_aa + seq[position:]
    mutated_seq = seq[:position] + alt_aa + seq[position:]
    return str(mutated_seq)

In [26]:
df_merged['mutated dna seq'] = df_merged.apply(lambda row: mutate_dna_sequence(row['dna seq'], row['Mutation pos in dna seq'], row['ALT']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,...,POS_control,AC_control,AF_control,AN_control,sample3.GT_control,synth landmark,Mutation pos in dna seq,dna seq,exon seq,mutated dna seq
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,4031.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,4042.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,4044.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,6656.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,8174.0,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,348856.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,946773.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,946782.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,946812.0,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...


In [28]:
# Proof of mutation
print(df_merged.iloc[0]['dna seq'][117])

print(df_merged.iloc[0]['mutated dna seq'][117])

A
G


In [29]:
### Get mutated exon sequence

# Add exon coords to merged df...
def getExonCoordForMergedDF(landmark):
    for i in range(0,21):
        if landmark == df_albicans_genes.iloc[i]['synth genome positions'][0]:
            return df_albicans_genes.iloc[i]['exon coords']
        
df_merged['exon coords'] = df_merged.apply(lambda row: getExonCoordForMergedDF(row['synth landmark']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,...,AC_control,AF_control,AN_control,sample3.GT_control,synth landmark,Mutation pos in dna seq,dna seq,exon seq,mutated dna seq,exon coords
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]"
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]"
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]"
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]"
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,2,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]"
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]"
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]"
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,2,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]"


In [30]:
# Get mutated exon seq

#df_merged['mutated exon seq'] = df_merged.apply(lambda row: getExonseqForMergedDF(row['dna seq'], row['Mutation pos in dna seq'], row['ALT']), axis=1)
#df_merged

df_merged['mutated exon seq'] = df_merged.apply(lambda row: getExonSeq(row['exon coords'], row['mutated dna seq']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,...,AF_control,AN_control,sample3.GT_control,synth landmark,Mutation pos in dna seq,dna seq,exon seq,mutated dna seq,exon coords,mutated exon seq
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0.500,4.0,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,0.333,6.0,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...


In [31]:
def translate(exon_sequence):
    # Create a Biopython Seq object
    seq = Seq(exon_sequence)
    # Translate the sequence using the specified genetic code
    aa_sequence = seq.translate(table=12)
    return str(aa_sequence)

In [32]:
df_merged['AA seq'] = df_merged.apply(lambda row: translate(row['exon seq']), axis=1)
df_merged['mutated AA seq'] = df_merged.apply(lambda row: translate(row['mutated exon seq']), axis=1)
df_merged



Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,...,sample3.GT_control,synth landmark,Mutation pos in dna seq,dna seq,exon seq,mutated dna seq,exon coords,mutated exon seq,AA seq,mutated AA seq
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,./.,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,T/T,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...


In [33]:
# testing
#list(df_merged.iloc[0]['AA seq'])[0]

'L'

In [34]:
def getDeltaAA(WT, MUT):
    for i, aa in enumerate(list(MUT)):
        if aa != list(WT)[i]:
            return (i, aa)
    
    return 0

In [35]:
df_merged['delta AA'] = df_merged.apply(lambda row: getDeltaAA(row['AA seq'], row['mutated AA seq']), axis=1)

df_merged

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,...,synth landmark,Mutation pos in dna seq,dna seq,exon seq,mutated dna seq,exon coords,mutated exon seq,AA seq,mutated AA seq,delta AA
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...,"(39, D)"
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...,"(39, D)"
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...,"(39, D)"
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...,"(39, D)"
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,...,0,117,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,"[(0, 1587)]",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*IINKSS*...,LKHTSFSFFPNDFCWFSR*NHH*TIIRVRHFITINSPS*DYKQK*L...,"(39, D)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,"(324, C)"
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,"(324, C)"
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,"(324, C)"
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,...,50502,972,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,"[(0, 1302), (1466, 1502), (1717, 1729)]",TTATTCCATGGCGGCATCTTCTAATGGGATTTCATCGGCATATTCT...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,LFHGGIF*WDFIGIF*FFFINTSFLVLLVFTNQIIHIRFSFSKFHF...,"(324, C)"


In [36]:
# df_merged.to_csv("vcf_analysis.csv") the csv is 1.2 gigabytes...