# Make Synthetic Genome
Author : Mathieu Giguere \
Date : 02/07/2024 \
Brief : Strings together the genes of interest of some species into a singular synthetic genome. \
Dependencies : 

## Plan

### 1. Read genome files (fasta) -> big string
### 2. Read gffs files -> then use regex to extract the coordinates of the genes of interest
### 3. For each genome, for each gene of interest, extract its dna sequence in the appropriate genome file
### 4. Append each string into 1 genome with spacers
### 5. Write synthetic genome file in fasta format

In [1]:
import re
import pandas as pd

In [2]:
#with open("genes-species.xlsx") as xlsx:
#    xlsx_content = xlsx.read()

#csv = re.sub("\\t", ",", xlsx_content)


#new_csv = open("gene-species.csv", "w")
#new_csv.write(csv)
#new_csv.close()

In [3]:
# Make dataframe that links the species to their genes of interest

df = pd.read_csv("gene-species.csv")

df = df.groupby('Species')['Gene ID'].apply(list).reset_index()

df

Unnamed: 0,Species,Gene ID
0,Aspergillus fumigatus,"[AFUA_4G06890, AFUA_6G05140, AFUA_2G00320, AFU..."
1,Candida albicans,"[C5_00660C_A, C1_04770C_A, C1_02420C_A, CR_008..."
2,Candida auris,"[B9J08_001448, B9J08_003737, B9J08_000964, B9J..."
3,Candida parapsilosis,"[CPAR2_303740, CPAR2_105550, CPAR2_106400, CPA..."
4,Candida tropicalis,"[CTRG_05283, CTRG_04480, CTRG_04661, CTRG_0268..."
5,Cryptococcus neoformans,"[CNA00300, CNN02320, CNE02100, CNA05950, CNF04..."
6,Nasakeomyces glabrata,"[CAGL0E04334g, CAGL0F01793g, CAGL0G01034g, CAG..."
7,Pichia kudriavzevii,"[JL09_g2508, JL09_g3074, JL09_g1956, JL09_g200..."


In [4]:

species_genome_file_list = ["fasta_files/unzipped/Aspergillus_fumigatus.ASM265v1.dna.toplevel.fa",
                           "fasta_files/unzipped/C_albicans_SC5314_version_A22-s07-m01-r195_chromosomes.fasta",
                           "fasta_files/unzipped/C_auris_B8441_version_s01-m03-r08_chromosomes.fasta",
                           "fasta_files/unzipped/C_glabrata_CBS138_version_s05-m03-r06_chromosomes.fasta",
                           "fasta_files/unzipped/C_parapsilosis_CDC317_version_s01-m06-r03_chromosomes.fasta",
                           "fasta_files/unzipped/Candida_tropicalis.GCA000006335v3.dna.toplevel.fa",
                           "fasta_files/unzipped/Cryptococcus_neoformans.ASM9104v1.dna.toplevel.fa",
                           "fasta_files/unzipped/Pichia_kudriavzevii_gca_000764455.ASM76445v1.dna.toplevel.fa"]

species_list = ["Aspergillus fumigatus", "Candida albicans", "Candida auris", "Nasakeomyces glabrata", "Candida parapsilosis", "Candida tropicalis", "Cryptococcus neoformans", "Pichia kudriavzevii"]

gff_files_list = ["gff_files/Aspergillus_fumigatus.ASM265v1.59.gff3",
                 "gff_files/C_albicans_SC5314_version_A22-s07-m01-r195.gff",
                 "gff_files/C_auris_B8441_version_s01-m03-r08.gff",
                 "gff_files/C_glabrata_CBS138_version_s05-m03-r06.gff",
                 "gff_files/C_parapsilosis_CDC317_version_s01-m06-r03.gff",
                 "gff_files/Candida_tropicalis.GCA000006335v3.59.gff3",
                 "gff_files/Cryptococcus_neoformans.ASM9104v1.59.gff3",
                 "gff_files/Pichia_kudriavzevii_gca_000764455.ASM76445v1.59.gff3"]

In [5]:
synth = ""
spacer = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"

for i, file in enumerate(species_genome_file_list):
    species = species_list[i]
    print(species)
    with open(file) as genome:
        genome_content = genome.read()
    
    # Removes first line puts a space
    dna = re.sub(">.*", "", genome_content)
    dna = re.sub("\n", "", dna)
    
    gff_file = gff_files_list[i]
    with open(gff_file) as gff:
        gff_content = gff.read()
    
    gene_list = df['Gene ID'][df['Species'] == species].values[0]
    
    for g in gene_list:
        print(f'Gene ID : {g}')
        
        # find gene coordinates
        y = re.findall(f"gene\\t.*?{g};", gff_content)
        coordinates = re.findall("[0-9+-]+", y[0])
        print(f'Coordinates : {coordinates[0:3]}')
        begin = int(coordinates[0])
        end = int(coordinates[1])
        strand = coordinates[2]
        
        gene_seq = dna[begin-1:end]
        #if strand == '-':
        #    gene_seq = dna[begin-1:end][::-1]
        #else:
        #    gene_seq = dna[begin-1:end]
        
        print(f'Gene length : {len(gene_seq)}')
        #print(gene_seq)
        synth += gene_seq + spacer * 5
    
    synth += spacer * 5

Aspergillus fumigatus
Gene ID : AFUA_4G06890
Coordinates : ['1780204', '1781822', '-']
Gene length : 1619
Gene ID : AFUA_6G05140
Coordinates : ['1228802', '1229934', '-']
Gene length : 1133
Gene ID : AFUA_2G00320
Coordinates : ['61020', '62153', '+']
Gene length : 1134
Gene ID : AFUA_6G12400
Coordinates : ['3126474', '3132292', '-']
Gene length : 5819
Gene ID : AFUA_5G05460
Coordinates : ['1268053', '1268981', '-']
Gene length : 929
Gene ID : AFUA_1G05050
Coordinates : ['1449702', '1450353', '+']
Gene length : 652
Gene ID : AFUA_2G15130
Coordinates : ['3997636', '4002389', '-']
Gene length : 4754
Gene ID : AFUA_4G12560
Coordinates : ['3289801', '3292477', '-']
Gene length : 2677
Gene ID : AFUA_2G03700
Coordinates : ['984934', '988503', '+']
Gene length : 3570
Gene ID : AFUA_3G05760
Coordinates : ['1406209', '1407279', '-']
Gene length : 1071
Gene ID : AFUA_5G06070
Coordinates : ['1446973', '1451166', '+']
Gene length : 4194
Gene ID : AFUA_1G10910
Coordinates : ['2848155', '2850137', '-

# Fasta file creation

In [6]:
synth_w_newlines = re.sub("(.{60})", "\\1\n", synth, 0, re.DOTALL)

fasta_header = ">synthetic genome, author GiguereM\n"

my_synth_genome = open("my_synthetic_genome.txt", "w")
my_synth_genome.write(fasta_header + synth_w_newlines)
my_synth_genome.close()

### Is it finding the right sequences.... I'm not sure. Large NNN sequence in Cryptococcus ?? 

https://fungi.ensembl.org/Pichia_kudriavzevii_gca_000764455/Transcript/Exons?db=core;g=JL09_g2004;r=scaffold00015:63910-67645;t=KGK38844 according to this I'm really not finding the right sequences. For gene JL09_g2004

For gene JL09_g66 the gene seems to be on the other DNA strand and in reverse.https://fungi.ensembl.org/Pichia_kudriavzevii_gca_000764455/Transcript/Exons?db=core;g=JL09_g66;r=scaffold00001:170857-171309;t=KGK40609

However, for some genes such as : AFUA_1G05050, https://fungi.ensembl.org/Aspergillus_fumigatus/Transcript/Exons?db=core;g=AFUA_1G05050;r=1:1449702-1450353;t=EAL88216 it seems very good !

same for CPAR2_602820 with http://www.candidagenome.org/cgi-bin/getSeq?seq=CPAR2_602820&flankl=0&flankr=0&map=a3map&seq_source=C.%20parapsilosis%20CDC317

CNA05950 : reverse strand. https://fungi.ensembl.org/Cryptococcus_neoformans/Transcript/Exons?db=core;g=CNA05950;r=1:1593426-1594383;t=AAW41129

Problem of many >headers in genome fasta files... and with added ' ' not good.

# Troubleshooting REGEX

In [7]:
# Gene C5_03390C_A
genome_content = ""
with open('fasta_files/unzipped/C_parapsilosis_CDC317_version_s01-m06-r03_chromosomes.fasta') as genome:
        genome_content = genome.read()

# Removes first line puts a space
print(len(genome_content))
dna = re.sub(">.*", "", genome_content)
print(len(dna))
dna = re.sub("[^A-Z]*", "", dna)
print(len(dna))
res = re.search(r'TTATATGCAGTAATATCTGTCACCAAAATCTCCCAAACCAGGAACAATGTACTTGTCTTCATCAAGCTTCTCATCTATTCCTCCAGTGATGATTGTGACTTCAGGGTACTTGTCATGGAACGCTTTTATCCCCTCAGGTGCTGCCAACAAGTTCAAAAAGAATATTCTTTCCATCTTGACTCCTCTCGCAAGCAACACTTCCACTGCCATCATTGCTGAGCCACCAGTTGCCAACATTGGATCTAACAAAAACACAAAACGTTCACTAATGTCTTCTGGCAATTTTTCGTAAAACAATTTAGGCAAAGCAGTCTCTTCATCTCGTTGAATCAAAATTTTGCCTATTCTCACCGATCTACAGCAATCTCTTAAACCCATTTCCATGGATTCACCAGCTCTAACAATGGAAACACCGCAGATTTTGCCCAAGAATTTAGTCCCCTTGTATTTATGACCTCCATGGCATTCAATAATGCATTCTTCCACTGGTAGTTGATTCAACCCTTCTTCAACTAAAAGACGAATAATACGATCAGAATAGAATACAAAGTCGCCTCTTTTGGTACTTTGATCACGAATGATACTGTAAAGGCCAATTAGTTGATTTGTTTGTGGAAGCAAGATCACGTTCTTGTCTACTTGTTCGGGGACAGACAT', dna)

print(res.span())

## Result: Not good, the genome only has the 5' to 3' sequence even if the gene is located on the 3' to 5' strand. And it isn't at the right coordinates according to the gff.

13247856
13247356
13030174
(10705754, 10706411)


In [8]:
# end of code
genome_content = ""
with open('fasta_files/unzipped/C_parapsilosis_CDC317_version_s01-m06-r03_chromosomes.fasta') as genome:
        genome_content = genome.read()

# Removes first line puts a space
print(len(genome_content))
dna = re.sub(">.*", "", genome_content)
print(len(dna))
dna = re.sub("\n", "", dna)
print(len(dna))
res = re.search(r'ATGTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCATTTGACGATAAAAAGGGTATGCAAATTGCTCTTGAACAAGCTAAAAAATCATTTGCTGAAGGTGGTATCCCCATTGGCGGTTGCTTAATCAAATCAGATGGTACTTTATTATCCACAGGACATAACCAAAGAGTACAAAAAGGGTCAGCTATATTGCATGGTGAGATGTCAGTCTTGGAACATGCTGGTAGATTACCGGCATCAACTTATCGTGATTGTACAATGTACACTACGTTATCGCCTTGCTCAATGTGCACTGGTGCCATATTGCTATATGGAATCAAGCGAGTCGTTGTTGGTGAAAACGAAACGTTCATGGGTGGTGAAGCATTGTTGAAGCAAAATGGAGTTGAAGTTGTTAATTTGGACGATGAAGGGTGTAAAGAAATTTTGCAAAAATTTATTAGTGAGAAGCCACATTGTTGGAATGAGGATATAGGTGTATAA', dna)
print(res.span())
print(res.group())

## Result: Good 

13247856
13247356
13030174
(674660, 675143)
ATGTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCATTTGACGATAAAAAGGGTATGCAAATTGCTCTTGAACAAGCTAAAAAATCATTTGCTGAAGGTGGTATCCCCATTGGCGGTTGCTTAATCAAATCAGATGGTACTTTATTATCCACAGGACATAACCAAAGAGTACAAAAAGGGTCAGCTATATTGCATGGTGAGATGTCAGTCTTGGAACATGCTGGTAGATTACCGGCATCAACTTATCGTGATTGTACAATGTACACTACGTTATCGCCTTGCTCAATGTGCACTGGTGCCATATTGCTATATGGAATCAAGCGAGTCGTTGTTGGTGAAAACGAAACGTTCATGGGTGGTGAAGCATTGTTGAAGCAAAATGGAGTTGAAGTTGTTAATTTGGACGATGAAGGGTGTAAAGAAATTTTGCAAAAATTTATTAGTGAGAAGCCACATTGTTGGAATGAGGATATAGGTGTATAA


In [9]:
# end of code
genome_content = ""
with open('fasta_files/unzipped/Aspergillus_fumigatus.ASM265v1.dna.toplevel.fa') as genome:
        genome_content = genome.read()


res = re.compile('>.*')
matches = res.finditer(genome_content)

for i, match in enumerate(matches):
    print(f'match {i} : ', match.group())
    print(f'span {i} : ', match.span())

match 0 :  >1 dna:chromosome chromosome:ASM265v1:1:1:4918979:1 REF
span 0 :  (0, 55)
match 1 :  >2 dna:chromosome chromosome:ASM265v1:2:1:4844472:1 REF
span 1 :  (5001018, 5001073)
match 2 :  >3 dna:chromosome chromosome:ASM265v1:3:1:4079167:1 REF
span 2 :  (9926288, 9926343)
match 3 :  >4 dna:chromosome chromosome:ASM265v1:4:1:3923705:1 REF
span 3 :  (14073498, 14073553)
match 4 :  >5 dna:chromosome chromosome:ASM265v1:5:1:3948441:1 REF
span 4 :  (18062655, 18062710)
match 5 :  >6 dna:chromosome chromosome:ASM265v1:6:1:3778736:1 REF
span 5 :  (22076960, 22077015)
match 6 :  >7 dna:chromosome chromosome:ASM265v1:7:1:2058334:1 REF
span 6 :  (25918731, 25918786)
match 7 :  >8 dna:chromosome chromosome:ASM265v1:8:1:1833124:1 REF
span 7 :  (28011427, 28011482)


In [10]:
# end of code
genome_content = ""
with open('fasta_files/unzipped/C_parapsilosis_CDC317_version_s01-m06-r03_chromosomes.fasta') as genome:
        genome_content = genome.read()


res = re.compile('>.*')
matches = res.finditer(genome_content)

for i, match in enumerate(matches):
    print(f'match {i} : ', match.group())
    print(f'span {i} : ', match.span())

match 0 :  >Contig005504_C_parapsilosis_CDC317 (898305 nucleotides)
span 0 :  (0, 56)
match 1 :  >Contig005569_C_parapsilosis_CDC317 (2235583 nucleotides)
span 1 :  (913334, 913391)
match 2 :  >Contig005806_C_parapsilosis_CDC317 (1039767 nucleotides)
span 2 :  (3186235, 3186292)
match 3 :  >Contig005807_C_parapsilosis_CDC317 (2091826 nucleotides)
span 3 :  (4243390, 4243447)
match 4 :  >Contig005809_C_parapsilosis_CDC317 (3023470 nucleotides)
span 4 :  (6370138, 6370195)
match 5 :  >Contig006110_C_parapsilosis_CDC317 (957321 nucleotides)
span 5 :  (9444058, 9444114)
match 6 :  >Contig006139_C_parapsilosis_CDC317 (962442 nucleotides)
span 6 :  (10417392, 10417448)
match 7 :  >Contig006372_C_parapsilosis_CDC317 (1789679 nucleotides)
span 7 :  (11395932, 11395989)
match 8 :  >mito_C_parapsilosis_CDC317 (31781 nucleotides)
span 8 :  (13215497, 13215544)


In [11]:
# end of code
genome_content = ""
with open('fasta_files/unzipped/C_albicans_SC5314_version_A22-s07-m01-r195_chromosomes.fasta') as genome:
        genome_content = genome.read()


res = re.compile('>.*')
matches = res.finditer(genome_content)

for i, match in enumerate(matches):
    print(f'match {i} : ', match.group())
    print(f'span {i} : ', match.span())

match 0 :  >Ca22chr1A_C_albicans_SC5314 (3188341 nucleotides)
span 0 :  (0, 50)
match 1 :  >Ca22chr1B_C_albicans_SC5314 (3188396 nucleotides)
span 1 :  (3241532, 3241582)
match 2 :  >Ca22chr2A_C_albicans_SC5314 (2231883 nucleotides)
span 2 :  (6483119, 6483169)
match 3 :  >Ca22chr2B_C_albicans_SC5314 (2231750 nucleotides)
span 3 :  (8752252, 8752302)
match 4 :  >Ca22chr3A_C_albicans_SC5314 (1799298 nucleotides)
span 4 :  (11021249, 11021299)
match 5 :  >Ca22chr3B_C_albicans_SC5314 (1799271 nucleotides)
span 5 :  (12850587, 12850637)
match 6 :  >Ca22chr4A_C_albicans_SC5314 (1603259 nucleotides)
span 6 :  (14679897, 14679947)
match 7 :  >Ca22chr4B_C_albicans_SC5314 (1603311 nucleotides)
span 7 :  (16309928, 16309978)
match 8 :  >Ca22chr5A_C_albicans_SC5314 (1190869 nucleotides)
span 8 :  (17940012, 17940062)
match 9 :  >Ca22chr5B_C_albicans_SC5314 (1190991 nucleotides)
span 9 :  (19150780, 19150830)
match 10 :  >Ca22chr6A_C_albicans_SC5314 (1033292 nucleotides)
span 10 :  (20361672, 2036