In [1]:
import re
import pandas as pd
from BCBio import GFF

In [101]:
species_genome_file_list = ["fasta_files/unzipped/Aspergillus_fumigatus.ASM265v1.dna.toplevel.fa",
                           "fasta_files/unzipped/C_albicans_SC5314_version_A22-s07-m01-r195_chromosomes.fasta",
                           "fasta_files/unzipped/C_auris_B8441_version_s01-m03-r08_chromosomes.fasta",
                           "fasta_files/unzipped/C_glabrata_CBS138_version_s05-m03-r06_chromosomes.fasta",
                           "fasta_files/unzipped/C_parapsilosis_CDC317_version_s01-m06-r03_chromosomes.fasta",
                           "fasta_files/unzipped/Candida_tropicalis.GCA000006335v3.dna.toplevel.fa",
                           "fasta_files/unzipped/Cryptococcus_neoformans.ASM9104v1.dna.toplevel.fa",
                           "fasta_files/unzipped/Pichia_kudriavzevii_gca_000764455.ASM76445v1.dna.toplevel.fa"]

species_list = ["Aspergillus fumigatus", "Candida albicans", "Candida auris", "Nasakeomyces glabrata", "Candida parapsilosis", "Candida tropicalis", "Cryptococcus neoformans", "Pichia kudriavzevii"]

gff_files_list = ["gff_files/Aspergillus_fumigatus.ASM265v1.59.gff3",
                 "gff_files/C_albicans_SC5314_version_A22-s07-m01-r195.gff",
                 "gff_files/C_auris_B8441_version_s01-m03-r08.gff",
                 "gff_files/C_parapsilosis_CDC317_version_s01-m06-r03.gff",
                 "gff_files/Candida_tropicalis.GCA000006335v3.59.gff3",
                 "gff_files/Cryptococcus_neoformans.ASM9104v1.59.gff3",
                  "gff_files/C_glabrata_CBS138_version_s05-m03-r06.gff",
                 "gff_files/Pichia_kudriavzevii_gca_000764455.ASM76445v1.59.gff3"]

In [102]:
# Make dataframe that links the species to their genes of interest

df = pd.read_csv("gene-species.csv")

df = df.groupby('Species')['Gene ID'].apply(list).reset_index()

df

Unnamed: 0,Species,Gene ID
0,Aspergillus fumigatus,"[AFUA_4G06890, AFUA_6G05140, AFUA_2G00320, AFU..."
1,Candida albicans,"[C5_00660C_A, C1_04770C_A, C1_02420C_A, CR_008..."
2,Candida auris,"[B9J08_001448, B9J08_003737, B9J08_000964, B9J..."
3,Candida parapsilosis,"[CPAR2_303740, CPAR2_105550, CPAR2_106400, CPA..."
4,Candida tropicalis,"[CTRG_05283, CTRG_04480, CTRG_04661, CTRG_0268..."
5,Cryptococcus neoformans,"[CNA00300, CNN02320, CNE02100, CNA05950, CNF04..."
6,Nasakeomyces glabrata,"[CAGL0E04334g, CAGL0F01793g, CAGL0G01034g, CAG..."
7,Pichia kudriavzevii,"[JL09_g2508, JL09_g3074, JL09_g1956, JL09_g200..."


In [103]:
gff_candid = ["gff_files/C_albicans_SC5314_version_A22-s07-m01-r195.gff",
              "gff_files/C_auris_B8441_version_s01-m03-r08.gff",
              "gff_files/C_glabrata_CBS138_version_s05-m03-r06.gff",
              "gff_files/C_parapsilosis_CDC317_version_s01-m06-r03.gff"]

gff_ensembl = ["gff_files/Aspergillus_fumigatus.ASM265v1.59.gff3",
               "gff_files/Candida_tropicalis.GCA000006335v3.59.gff3",
               "gff_files/Cryptococcus_neoformans.ASM9104v1.59.gff3",
               "gff_files/Pichia_kudriavzevii_gca_000764455.ASM76445v1.59.gff3"]

In [104]:
list1 = []

for gff_file in gff_files_list:
    
    with open(gff_file) as gff:
        gff_content = gff.read()

    landmark_dict = {}
    landmark = 0
    
    if gff_file in gff_ensembl:
        gff_landmarks = re.findall("##sequence-region.*\\n", gff_content)
        
        for i in gff_landmarks:
            index = re.findall("  .+? ", i)[0]
            index = re.sub(" ", "", index)
            coord = re.sub(".*? ", "", i)
            landmark += int(coord) - 1
            landmark_dict[index] = landmark
        
        list1.append(landmark_dict)
        
    else:
        gff_landmarks = re.findall(r"^.*CGD\tchromosome.*|^.*CGD\tcontig.*", gff_content, flags=re.MULTILINE)
        for i in gff_landmarks:
            #print(i)
            index = re.findall(r"(?:^|(?:[.!?]\s))(\w+)", i)[0]
            coord = re.findall("\t[0-9]{4}.+?\t", i)[0]
            coord = re.sub("[^0-9]", "", coord)
            #print(coord)
            landmark += int(coord) - 1
            landmark_dict[index] = landmark

        list1.append(landmark_dict)
    
#print(list1)

df["Landmarks"] = list1 

df

Unnamed: 0,Species,Gene ID,Landmarks
0,Aspergillus fumigatus,"[AFUA_4G06890, AFUA_6G05140, AFUA_2G00320, AFU...","{'1': 4918978, '2': 9763449, '3': 13842615, '4..."
1,Candida albicans,"[C5_00660C_A, C1_04770C_A, C1_02420C_A, CR_008...","{'Ca22chr1A_C_albicans_SC5314': 3188340, 'Ca22..."
2,Candida auris,"[B9J08_001448, B9J08_003737, B9J08_000964, B9J...","{'PEKT02000001_C_auris_B8441': 1083521, 'PEKT0..."
3,Candida parapsilosis,"[CPAR2_303740, CPAR2_105550, CPAR2_106400, CPA...","{'Contig005504_C_parapsilosis_CDC317': 898304,..."
4,Candida tropicalis,"[CTRG_05283, CTRG_04480, CTRG_04661, CTRG_0268...","{'GG692395': 2474447, 'GG692396': 4783116, 'GG..."
5,Cryptococcus neoformans,"[CNA00300, CNN02320, CNE02100, CNA05950, CNF04...","{'1': 2300532, '10': 3386251, '11': 4406096, '..."
6,Nasakeomyces glabrata,"[CAGL0E04334g, CAGL0F01793g, CAGL0G01034g, CAG...","{'ChrA_C_glabrata_CBS138': 527799, 'ChrB_C_gla..."
7,Pichia kudriavzevii,"[JL09_g2508, JL09_g3074, JL09_g1956, JL09_g200...","{'contig03002': 1995, 'contig03003': 3981, 'co..."


In [105]:
df['Landmarks'][df['Species'] == 'Nasakeomyces glabrata'].values[0]

{'ChrA_C_glabrata_CBS138': 527799,
 'ChrB_C_glabrata_CBS138': 1040453,
 'ChrC_C_glabrata_CBS138': 1634978,
 'ChrD_C_glabrata_CBS138': 2302877,
 'ChrE_C_glabrata_CBS138': 3010299,
 'ChrF_C_glabrata_CBS138': 3967813,
 'ChrG_C_glabrata_CBS138': 4978104,
 'ChrH_C_glabrata_CBS138': 6035491,
 'ChrI_C_glabrata_CBS138': 7178015,
 'ChrJ_C_glabrata_CBS138': 8425577,
 'ChrK_C_glabrata_CBS138': 9732666,
 'ChrL_C_glabrata_CBS138': 11260929,
 'ChrM_C_glabrata_CBS138': 12726200,
 'mito_C_glabrata_CBS138': 12746262}

In [49]:
for i, file in enumerate(species_genome_file_list):
    species = species_list[i]
    print(species)
    with open(file) as genome:
        genome_content = genome.read()
    
    # Removes first line puts a space
    dna = re.sub(">.*", "", genome_content)
    dna = re.sub("\n", "", dna)

    gff_file = gff_files_list[i]
    in_handle = open(gff_file)

    gene_list = df['Gene ID'][df['Species'] == species].values[0]

    for rec in GFF.parse(in_handle):
        #print(rec.id)
        #print(rec)
        for feature in rec.features:
            #print(feature.type)
            if feature.type == "gene":
                #print(feature.qualifiers)
                id_gene = re.sub(".*:", "", feature.qualifiers.get("ID")[0])
                #print(id_gene)
                if id_gene in gene_list:
                    print(id_gene)
                    print(feature.location)
                    print(rec.id)
                    print(rec.annotations.get("sequence-region"))
                    for n in range(0, int(rec.id)):
                        print("hello")
                    #print(rec.annotations.get("sequence-region")[0][2])
                    #print(feature.location.start)
                    #print(feature.location.end)
                    #print(feature.coordinates)

    in_handle.close()

Aspergillus fumigatus
AFUA_1G05050
[1449701:1450353](+)
1
[('1', 0, 4918979), ('2', 0, 4844472), ('3', 0, 4079167), ('4', 0, 3923705), ('5', 0, 3948441), ('6', 0, 3778736), ('7', 0, 2058334), ('8', 0, 1833124)]
hello
AFUA_1G10910
[2848154:2850137](-)
1
[('1', 0, 4918979), ('2', 0, 4844472), ('3', 0, 4079167), ('4', 0, 3923705), ('5', 0, 3948441), ('6', 0, 3778736), ('7', 0, 2058334), ('8', 0, 1833124)]
hello
AFUA_2G00320
[61019:62153](+)
2
[('1', 0, 4918979), ('2', 0, 4844472), ('3', 0, 4079167), ('4', 0, 3923705), ('5', 0, 3948441), ('6', 0, 3778736), ('7', 0, 2058334), ('8', 0, 1833124)]
hello
hello
AFUA_2G03700
[984933:988503](+)
2
[('1', 0, 4918979), ('2', 0, 4844472), ('3', 0, 4079167), ('4', 0, 3923705), ('5', 0, 3948441), ('6', 0, 3778736), ('7', 0, 2058334), ('8', 0, 1833124)]
hello
hello
AFUA_2G15130
[3997635:4002389](-)
2
[('1', 0, 4918979), ('2', 0, 4844472), ('3', 0, 4079167), ('4', 0, 3923705), ('5', 0, 3948441), ('6', 0, 3778736), ('7', 0, 2058334), ('8', 0, 1833124)]
hel

ValueError: invalid literal for int() with base 10: 'Ca22chr1A_C_albicans_SC5314'

In [34]:
in_file = "gff_files/C_albicans_SC5314_version_A22-s07-m01-r195.gff"

in_handle = open(in_file)

species = "Candida albicans"

gene_list = df['Gene ID'][df['Species'] == species].values[0]

for rec in GFF.parse(in_handle):
    #print(rec.id)
    #print(rec)
    for feature in rec.features:
        #print(feature.type)
        if feature.type == "gene":
            #print(feature.qualifiers)
            id_gene = re.sub(".*:", "", feature.qualifiers.get("ID")[0])
            #print(id_gene)
            if id_gene in gene_list:
                print(id_gene)
                print(feature.location)
                print(rec)
                #print(feature.location.start)
                #print(feature.location.end)
                #print(feature.coordinates)

in_handle.close()

C1_00710C_A
[123868:125597](-)
ID: Ca22chr1A_C_albicans_SC5314
Name: <unknown name>
Description: <unknown description>
Number of features: 1464
/gff-version=['3']
Undefined sequence of length 3188341
C1_00800C_A
[154632:155286](-)
ID: Ca22chr1A_C_albicans_SC5314
Name: <unknown name>
Description: <unknown description>
Number of features: 1464
/gff-version=['3']
Undefined sequence of length 3188341
C1_02420C_A
[505968:511662](-)
ID: Ca22chr1A_C_albicans_SC5314
Name: <unknown name>
Description: <unknown description>
Number of features: 1464
/gff-version=['3']
Undefined sequence of length 3188341
C1_03780C_A
[790363:793585](-)
ID: Ca22chr1A_C_albicans_SC5314
Name: <unknown name>
Description: <unknown description>
Number of features: 1464
/gff-version=['3']
Undefined sequence of length 3188341
C1_04770C_A
[991621:992782](-)
ID: Ca22chr1A_C_albicans_SC5314
Name: <unknown name>
Description: <unknown description>
Number of features: 1464
/gff-version=['3']
Undefined sequence of length 3188341


In [28]:
# Seems to have a minus 1 position... so might be in 0 base.

In [35]:
dir(GFF)

['DiscoGFFParser',
 'GFF3Writer',
 'GFFExaminer',
 'GFFOutput',
 'GFFParser',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'parse',
 'parse_simple',
 'write']

In [37]:
import pprint
from BCBio.GFF import GFFExaminer

in_file = "gff_files/Aspergillus_fumigatus.ASM265v1.59.gff3"
examiner = GFFExaminer()
in_handle = open(in_file)
pprint.pprint(examiner.available_limits(in_handle))
in_handle.close()


{'gff_id': {('1',): 12998,
            ('2',): 12919,
            ('3',): 10734,
            ('4',): 9769,
            ('5',): 10864,
            ('6',): 9985,
            ('7',): 4790,
            ('8',): 4752},
 'gff_source': {('ASM265v1',): 8,
                ('ena',): 76439,
                ('ena_assembly_gap',): 11,
                ('ena_mobile_element',): 353},
 'gff_source_type': {('ASM265v1', 'chromosome'): 8,
                     ('ena', 'CDS'): 28229,
                     ('ena', 'exon'): 28488,
                     ('ena', 'five_prime_UTR'): 4,
                     ('ena', 'gene'): 9623,
                     ('ena', 'mRNA'): 9623,
                     ('ena', 'ncRNA_gene'): 229,
                     ('ena', 'pseudogene'): 7,
                     ('ena', 'pseudogenic_transcript'): 7,
                     ('ena', 'tRNA'): 229,
                     ('ena_assembly_gap', 'biological_region'): 11,
                     ('ena_mobile_element', 'biological_region'): 353},
 'gff_type':

In [2]:
import re

with open("my_synthetic_genome.txt") as genome:
    genome_content = genome.read()

# Removes first line puts a space
dna = re.sub(">.*", "", genome_content)
dna = re.sub("\n", "", dna)

len(dna)

298411