# Make Synthetic Genome
Author : Mathieu Giguere \
Date : 02/07/2024 \
Brief : Strings together the genes of interest of some species into a singular synthetic genome. \
Dependencies : 

## Plan

### 1. Read genome files (fasta) -> big string
### 2. Read gffs files -> then use regex to extract the coordinates of the genes of interest
### 3. For each genome, for each gene of interest, extract its dna sequence in the appropriate genome file
### 4. Append each string into 1 genome with spacers
### 5. Write synthetic genome file in fasta format

In [1]:
import re
import pandas as pd

In [2]:
species_genome_file_list = ["fasta_files/unzipped/Aspergillus_fumigatus.ASM265v1.dna.toplevel.fa",
                           "fasta_files/unzipped/C_albicans_SC5314_version_A22-s07-m01-r195_chromosomes.fasta",
                           "fasta_files/unzipped/C_auris_B8441_version_s01-m03-r08_chromosomes.fasta",
                           "fasta_files/unzipped/C_glabrata_CBS138_version_s05-m03-r06_chromosomes.fasta",
                           "fasta_files/unzipped/C_parapsilosis_CDC317_version_s01-m06-r03_chromosomes.fasta",
                           "fasta_files/unzipped/Candida_tropicalis.GCA000006335v3.dna.toplevel.fa",
                           "fasta_files/unzipped/Cryptococcus_neoformans.ASM9104v1.dna.toplevel.fa",
                           "fasta_files/unzipped/Pichia_kudriavzevii_gca_000764455.ASM76445v1.dna.toplevel.fa"]

species_list = ["Aspergillus fumigatus", "Candida albicans", "Candida auris", "Nasakeomyces glabrata", "Candida parapsilosis", "Candida tropicalis", "Cryptococcus neoformans", "Pichia kudriavzevii"]

gff_files_list = ["gff_files/Aspergillus_fumigatus.ASM265v1.59.gff3",
                 "gff_files/C_albicans_SC5314_version_A22-s07-m01-r195.gff",
                 "gff_files/C_auris_B8441_version_s01-m03-r08.gff",
                 "gff_files/C_glabrata_CBS138_version_s05-m03-r06.gff",
                 "gff_files/C_parapsilosis_CDC317_version_s01-m06-r03.gff",
                 "gff_files/Candida_tropicalis.GCA000006335v3.59.gff3",
                 "gff_files/Cryptococcus_neoformans.ASM9104v1.59.gff3",
                 "gff_files/Pichia_kudriavzevii_gca_000764455.ASM76445v1.59.gff3"]

In [3]:
# Make dataframe that links the species to their genes of interest

df = pd.read_csv("gene-species.csv")

df = df.groupby('Species')['Gene ID'].apply(list).reset_index()

df

Unnamed: 0,Species,Gene ID
0,Aspergillus fumigatus,"[AFUA_4G06890, AFUA_6G05140, AFUA_2G00320, AFU..."
1,Candida albicans,"[C5_00660C_A, C1_04770C_A, C1_02420C_A, CR_008..."
2,Candida auris,"[B9J08_001448, B9J08_003737, B9J08_000964, B9J..."
3,Candida parapsilosis,"[CPAR2_303740, CPAR2_105550, CPAR2_106400, CPA..."
4,Candida tropicalis,"[CTRG_05283, CTRG_04480, CTRG_04661, CTRG_0268..."
5,Cryptococcus neoformans,"[CNA00300, CNN02320, CNE02100, CNA05950, CNF04..."
6,Nasakeomyces glabrata,"[CAGL0E04334g, CAGL0F01793g, CAGL0G01034g, CAG..."
7,Pichia kudriavzevii,"[JL09_g2508, JL09_g3074, JL09_g1956, JL09_g200..."


In [18]:
synth = ""
spacer = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"

for i, file in enumerate(species_genome_file_list):
    species = species_list[i]
    print(species)
    with open(file) as genome:
        genome_content = genome.read()
    
    # Removes first line puts a space
    dna = re.sub(">.*", "", genome_content)
    dna = re.sub("\n", "", dna)
    
    gff_file = gff_files_list[i]
    with open(gff_file) as gff:
        gff_content = gff.read()
    
    gene_list = df['Gene ID'][df['Species'] == species].values[0]
    
    for g in gene_list:
        print(f'Gene ID : {g}')
        
        # find gene coordinates
        #y = re.findall(f"^.*\\tgene\\t.*?{g};", gff_content)
        y = re.findall(f"^.*\\t.*\\tgene\\t.*?{g}", gff_content, flags=re.MULTILINE)
        print(y)
        #coordinates = re.sub("\t", "", y[0])
        coordinates = re.findall("[0-9+-]+", y[0])
        print(f'Coordinates : {coordinates[0:4]}')
        #begin = int(coordinates[0])
        #end = int(coordinates[1])
        #strand = coordinates[2]
        
        #gene_seq = dna[begin-1:end]
        
        #print(f'Gene length : {len(gene_seq)}')
        #print(gene_seq)
        #synth += gene_seq + spacer * 5
    
    #synth += spacer * 5

Aspergillus fumigatus
Gene ID : AFUA_4G06890
['4\tena\tgene\t1780204\t1781822\t.\t-\t.\tID=gene:AFUA_4G06890']
Coordinates : ['4', '1780204', '1781822', '-']
Gene ID : AFUA_6G05140
['6\tena\tgene\t1228802\t1229934\t.\t-\t.\tID=gene:AFUA_6G05140']
Coordinates : ['6', '1228802', '1229934', '-']
Gene ID : AFUA_2G00320
['2\tena\tgene\t61020\t62153\t.\t+\t.\tID=gene:AFUA_2G00320']
Coordinates : ['2', '61020', '62153', '+']
Gene ID : AFUA_6G12400
['6\tena\tgene\t3126474\t3132292\t.\t-\t.\tID=gene:AFUA_6G12400']
Coordinates : ['6', '3126474', '3132292', '-']
Gene ID : AFUA_5G05460
['5\tena\tgene\t1268053\t1268981\t.\t-\t.\tID=gene:AFUA_5G05460']
Coordinates : ['5', '1268053', '1268981', '-']
Gene ID : AFUA_1G05050
['1\tena\tgene\t1449702\t1450353\t.\t+\t.\tID=gene:AFUA_1G05050']
Coordinates : ['1', '1449702', '1450353', '+']
Gene ID : AFUA_2G15130
['2\tena\tgene\t3997636\t4002389\t.\t-\t.\tID=gene:AFUA_2G15130']
Coordinates : ['2', '3997636', '4002389', '-']
Gene ID : AFUA_4G12560
['4\tena\tg

# Fasta file creation

In [None]:
synth_w_newlines = re.sub("(.{60})", "\\1\n", synth, 0, re.DOTALL)

fasta_header = ">synthetic genome, author GiguereM\n"

my_synth_genome = open("my_synthetic_genome.txt", "w")
my_synth_genome.write(fasta_header + synth_w_newlines)
my_synth_genome.close()

# Update !!

### In GFF, the start and end coordinates of the feature are given in positive 1-based integer coordinates, relative to the **landmark given in column one**.

### So I will implement a regex to get the landmark of a gene, and add it to its sequence coordinates !!!