In [28]:
import pandas as pd
import re

In [29]:
# Input tables, both tab-separated. After renaming, the test table's columns
# include CHROM, POS, REF, ALT and one genotype column per sample.
test_file = 'test3.table'
control_file = 'control_albicans.table'

# Long sequencing-run genotype column names -> short per-sample names.
# The same mapping is applied to both tables so their genotype columns
# end up with matching names.
sample_gt_names = {
    "NS.2209.003.UDP0161_i7---UDP0161_i5.fRS1_.GT": "sample1.GT",
    "NS.2209.003.UDP0163_i7---UDP0163_i5.fRS757_.GT": "sample2.GT",
    "NS.2209.003.UDP0167_i7---UDP0167_i5.A12_.GT": "sample3.GT",
}

df_test = pd.read_csv(test_file, sep='\t').rename(columns=sample_gt_names)
df_control = pd.read_csv(control_file, sep='\t').rename(columns=sample_gt_names)

df_test.columns

Index(['CHROM', 'POS', 'REF', 'ALT', 'AC', 'AF', 'AN', 'sample1.GT',
       'sample2.GT', 'sample3.GT'],
      dtype='object')

In [30]:
# Restrict the test table to rows whose CHROM is exactly 'Candidaalbicans';
# rows with any other CHROM value are dropped. The original row index is kept.

is_albicans = df_test['CHROM'].eq('Candidaalbicans')
df_test_filtered = df_test.loc[is_albicans]

df_test_filtered

Unnamed: 0,CHROM,POS,REF,ALT,AC,AF,AN,sample1.GT,sample2.GT,sample3.GT
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.
2,Candidaalbicans,148,T,C,2,0.500,4,T/C,T/C,./.
3,Candidaalbicans,182,CTA,C,1,0.250,4,CTA/CTA,CTA/C,./.
4,Candidaalbicans,186,G,GGT,1,0.250,4,G/G,G/GGT,./.
5,Candidaalbicans,478,G,A,2,0.500,4,G/A,G/A,./.
...,...,...,...,...,...,...,...,...,...,...
161,Candidaalbicans,51443,TGG,T,2,0.333,6,TGG/TGG,TGG/TGG,T/T
162,Candidaalbicans,51448,G,GAA,2,0.333,6,G/G,G/G,GAA/GAA
163,Candidaalbicans,51460,T,C,2,0.333,6,T/T,T/T,C/C
164,Candidaalbicans,51463,T,C,2,0.333,6,T/T,T/T,C/C


In [31]:
# Left-join the filtered test rows to the control table. Rows pair up when
# REF, ALT, sample1.GT and sample2.GT all agree; POS is NOT part of the key,
# so a single test row can match many control rows. Columns that exist in
# both tables but are not join keys get the '_test' / '_control' suffixes.

df_merged = pd.merge(
    df_test_filtered,
    df_control,
    how='left',
    on=['REF', 'ALT', 'sample1.GT', 'sample2.GT'],
    suffixes=('_test', '_control'),
)

df_merged  # full frame; slice (e.g. [0:15]) for a shorter preview

Unnamed: 0,CHROM_test,POS_test,REF,ALT,AC_test,AF_test,AN_test,sample1.GT,sample2.GT,sample3.GT_test,CHROM_control,POS_control,AC_control,AF_control,AN_control,sample3.GT_control
0,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4031.0,2,0.500,4.0,./.
1,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4042.0,2,0.500,4.0,./.
2,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,4044.0,2,0.500,4.0,./.
3,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,6656.0,2,0.500,4.0,./.
4,Candidaalbicans,118,A,G,2,0.500,4,A/G,A/G,./.,Ca22chr1A_C_albicans_SC5314,8174.0,2,0.500,4.0,./.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87277,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,348856.0,2,0.333,6.0,T/T
87278,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946773.0,2,0.333,6.0,T/T
87279,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946782.0,2,0.333,6.0,T/T
87280,Candidaalbicans,51475,G,T,2,0.333,6,G/G,G/G,T/T,Ca22chr5B_C_albicans_SC5314,946812.0,2,0.333,6.0,T/T


In [46]:
# Read the synthetic genome FASTA and pull out the Candidaalbicans record
# as one continuous string (header text followed directly by the sequence).

synth_gen = '../synth_genome/v4_synthetic_genome.fa'

with open(synth_gen) as genome:
    genome_content = genome.read()

# Drop every newline, so each record becomes '>header' immediately followed
# by its sequence with no separator in between.
genome_string = genome_content.replace("\n", "")

# \w* runs from the header through the sequence and stops at the first
# non-word character -- presumably the '>' of the next record, or the end of
# the file. The matched text still begins with '>Candidaalbicans'.
# (MULTILINE has no effect here: the pattern has no ^ or $ anchors.)
albicans_record = re.compile(r">Candidaalbicans\w*", flags=re.MULTILINE)
albicans_only = albicans_record.findall(genome_string)[0]

# Record length (header included) and a preview of its first 100 characters.
print(len(albicans_only), albicans_only[0:100], sep="\n")

52547
>CandidaalbicansTTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGTTCAGTAGGTAAAACCACCATTGAACTATAATCAGGGTC


In [47]:
# Strip the '>Candidaalbicans' header text so only the sequence string remains.
albicans_only_sequences = albicans_only.replace(">Candidaalbicans", "")

# Preview the first 100 characters of the header-free sequence.
print(albicans_only_sequences[:100])

TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGTTCAGTAGGTAAAACCACCATTGAACTATAATCAGGGTCAGGCACTTTATAACCA


In [55]:
# Labels for the segments found in the albicans sequence, listed in the same
# order the segments are matched. The final entry, "end", is not a gene id:
# it appears to label the zero-length match that the segment regex
# ("[^N]*N*") yields once the whole sequence has been consumed.
gene_list = [
    "C5_00660C_A", "C1_04770C_A", "C1_02420C_A",
    "CR_00850C_A", "C5_03390C_A", "C6_00620W_A",
    "C3_05920W_A", "C3_05220W_A", "C5_01840C_A",
    "C1_08460C_A", "C3_04890W_A", "C3_07860C_A",
    "C6_03170C_A", "C1_08590C_A", "C3_02220W_A",
    "C1_00800C_A", "C1_03780C_A", "C3_06850W_A",
    "CR_08780W_A", "CR_08800W_A", "C1_00710C_A",
    "end",
]

In [59]:
# Split the albicans sequence into consecutive segments and tabulate them.
# Each match is a run of non-'N' characters plus the run of 'N's that follows
# it, so a segment's span and text include its trailing 'N' block. Because
# the pattern can match the empty string, finditer also yields one final
# zero-length match at the end of the sequence.
pattern = re.compile("[^N]*N*")
matches = pattern.finditer(albicans_only_sequences)

span_list = []   # (start, end) offsets of each segment within the sequence
match_list = []  # matched text of each segment

for match in matches:
    span_list.append(match.span())
    match_list.append(match.group())

# Segments are labelled positionally from gene_list, so the counts must agree.
# Fail with an explicit message instead of an opaque indexing/length error.
if len(span_list) != len(gene_list):
    raise ValueError(
        f"found {len(span_list)} segments in the albicans sequence "
        f"but gene_list has {len(gene_list)} labels"
    )

# One row per segment: label, (start, end) span, and sequence text.
df_albicans_genes = pd.DataFrame(
    {
        'ID': gene_list,
        'positions': span_list,
        'seq': match_list,
    }
)

df_albicans_genes

Unnamed: 0,ID,positions,seq
0,C5_00660C_A,"(0, 1887)",TTAAAACATACAAGTTTCTCTTTTTTCCCAAATGATTTCTGCTGGT...
1,C1_04770C_A,"(1887, 3348)",TCATTGTTCAACATATTCTCTATCGTCAACTTTACCTTCAACTTCT...
2,C1_02420C_A,"(3348, 9342)",TTAAACTCTGAATGGATTTGTAGAATAAGGGGTGGTAGTTCCAGTA...
3,CR_00850C_A,"(9342, 14565)",CTAGTAATGCTTAATTTCTAATTTTTTCTTTAATCCTTTCAGATTT...
4,C5_03390C_A,"(14565, 15522)",TTAAATACAGTAATATCTATCACCGAAATCACCTAGACCTGGAACA...
5,C6_00620W_A,"(15522, 16345)",ATGACGTTTGACGACAAAAAAGGTTTACAAGTTGCTCTTGATCAAG...
6,C3_05920W_A,"(16345, 19972)",ATGTCAATTGCCACCACCCCTATAGAAACACCTAAAAGTCCTAAAA...
7,C3_05220W_A,"(19972, 24778)",ATGTCAGATTCTAAGATGTCGTCGCAAGATGAATCTAAATTAGAAA...
8,C5_01840C_A,"(24778, 28024)",TTAAATCCCCAAATTATTGTCAAAGAAAAAATTGGGTAAATTATTA...
9,C1_08460C_A,"(28024, 30463)",CTATTTCATATTCATAAACCCATTATCACCTTGCATTAAGTCTAAA...
