In [None]:
#Setting up the code, importing all necessary packages and data

import pandas as pd
import numpy as np
from Bio import Entrez
from Bio import SeqIO
import os
import time
Entrez.email = "spatuzzi@stud.uni-heidelberg.de"

In [33]:
# code(find_product_name)
#
# @description: This function read an mxn csv file that contains gene IDs from the NCBI nucleotide database for n genes and m organisms 
# (not all cells need to be filled out) and returns a dictionary of lists of product names sorted by most frequent for each gene.
#
# @argument ID: path to the csv file
# @argument limit: the number of IDs for each gene to examine
# @argument top: the maximum number of product names sorted by 
# frequency (in descending order) to return in the lists of the 
# result dictionary 
# @argument bad_IDs: boolean, if TRUE the dictionary will have an 
# extra index called "badIDs" that contains another dictionary 
# with a list of IDs for each gene for the which (for many possible 
# reasons) no gene product could be retrieved 
#
# @value: returns a dictionary with either n or n+1 indexes 
# depending on the value of "bad_IDs"
#
# @author Matteo Spatuzzi (2022)
#
# @import pandas as pd
# @import numpy as np
# @from Bio import Entrez
# @from Bio import SeqIO
# @import os
# @import time
#

def find_product_name(file, limit = 40, top = 1, bad_IDs = True):
    
    ID_data = pd.read_csv(file, index_col = 0, encoding='cp1252')
    
    import collections
    
    product_dict = {}
    bad_IDs_dict = {}
    
    for gene in ID_data.columns:
        id_list = ID_data.loc[:,gene].dropna()[0:limit]
        product_names = []
        bad_ids = []

        for ID in id_list:

            try:
                handle = Entrez.efetch(db="nucleotide", id= ID, rettype="gb", retmode="text")
                gb_record = SeqIO.read(handle, "gb")
                features = gb_record.features

                for feature in features:
                    if "product" in feature.qualifiers :
                        product_names.append(feature.qualifiers["product"][0])
                        print("Gene: "+gene+"; ID: " +ID+"; Product: "+ feature.qualifiers["product"][0] )
            except:
                bad_ids.append(ID)
                pass 


        counter = collections.Counter(product_names)

        max_values = sorted(counter.values(), reverse = True)  # maximum value
        max_keys = [k for k, v in counter.items() if v in max_values[0:(top-1)]] # getting all keys containing the `maximum`

        product_dict[gene] = max_keys 
        bad_IDs_dict[gene] = bad_ids
    
    if bad_IDs: 
        product_dict["badIDs"] = bad_IDs_dict
        return(product_dict)
    else:
        return(product_dict)

            


In [36]:
# code(search_new_sequences)
#
# @description: This function reads an mxn csv file that contains 
# gene sequences or IDs from the NCBI nucleotide database for n 
# genes and m organisms and cross-references it with a dictionary 
# of product names for the genes to try to find sequences to fill 
# out the empty cells
#
# @argument file: path to the csv file, empty cells should be 
# empty strings
# @argument product_dict: a dictionary with indexes identical 
# to the columns names of the csv
# @argument max_tries: numerical, since the internet connection can 
# falter and thei function can run for hours every search attempt 
# is run up to this argument's value times if it cannot connect 
# with the internet before moving to the next gene-organism combination
# @argument add: boolean, if True the fucntion returns the input csv with
# the addtional sequences, if False (default) it returns an new dataframe 
# of equal dimensions that only contains the new seqeunces
#
# @value: returns a dataframe of equal dimensions of the imported csv 
# file full of newly downloaded seqeunces
#
# @author Matteo Spatuzzi (2022)
#
# @import pandas as pd
# @import numpy as np
# @from Bio import Entrez
# @from Bio import SeqIO
# @import os
# @import time
#

def search_new_sequences(file, product_dict, max_tries, add = False):
    
    phy_dat = pd.read_csv(file, index_col = 0, encoding='cp1252')
    
    if(add):
        new_seq_df = phy_dat.copy()
    else: 
        new_seq_df = pd.DataFrame( np.empty((len(phy_dat.index), len(phy_dat.columns)),dtype=pd.Timestamp), index = phy_dat.index, columns = phy_dat.columns)
    
    for name in phy_dat.index:
        for gene in phy_dat.columns:
            if(phy_dat.loc[name, gene].count("-") == len(phy_dat.loc[name, gene])):
                phy_dat.loc[name,gene] = ""
   
    #Iterate over ever species
    for name in phy_dat.index:

        #Iterate over every Gene
        for gene in phy_dat.columns[1:19]:

            #Check if there is already a sequence in this position by removing all "gaps"
            #If it's only gaps, continue

            if(phy_dat.loc[name, gene] == ""):

                #To keep better track in the console, announce every time gene and organism
                print("Name:" + name + ", Gene:" + gene )

                #Concatenate the term for the Entrez function and prepare empty list for the results

                term = name + "[Orgn] AND " + product_dict[gene][0] + "[Prd]"
                ID_list = []

                #This iteration can easily be interrupted by a temporary connection issue at this step
                #The following chunk of code ensures that the code is run multiple times (up to 100 times) to attempt to download entrez IDs 

                max_tries
                for i in range(max_tries):
                    try:
                        time.sleep(0.1) 

                        #Find IDs that match the "product" string

                        handle = Entrez.esearch(db="nucleotide", term = term, idtype="acc")
                        record = Entrez.read(handle)
                        ID_list = record["IdList"]
                        print(ID_list)
                        break
                        
                    except Exception:
                        print("retry")
                        continue

                if(len(ID_list) > 0):

                    #Take sequence for each ID with Entrez.efetch function
                    #Edit the downloaded string object to only contain the coding sequence

                    seq_list = []
                    for ID in ID_list: 

                        handle = Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text")
                        gb_record = SeqIO.read(handle, "gb")
                        features = gb_record.features
                        for feature in features:
                            print(feature.qualifiers)
                            if "product" in feature.qualifiers :
                                if feature.type == 'CDS' and feature.qualifiers["product"] == [product_dict[gene][0]]:
                                    sequence=feature.extract(gb_record)
                                    seq_list.append(str(sequence.seq))

                    #"Empty" result lists still contain a warning string, in which case they must not be appended to the result

                    if(len(seq_list) > 0 and seq_list != ['EDIDPARAMETERISEMPTY.']):
                        new_seq_df.loc[name, gene] = seq_list
                        print(" Replaced:" + gene + " for " + name)
                        
    return(new_seq_df)

In [None]:
# code(fasta_from_pd_df)
#
# @description: This function reads an mxn dataframe that contains 
# gene sequences for n  genes and m organisms, creates a new directory and writes fasta
# files for each gene in the chosen directory
#
# @argument df: dataframe that contains gene sequences, row names 
# and column names should be organism and gene names respectively
# @argument writing_dir: the path where the new directory with the 
# fasta files will be created
# @argument dir_name: the name of the new directory
#
# @value: writes fasta files in the directory but returns no object
#
# @author Matteo Spatuzzi (2022)
#
# @import pandas as pd
# @import numpy as np
# @from Bio import Entrez
# @from Bio import SeqIO
# @import os
# @import time
#

def fasta_from_pd_df(df, writing_dir, dir_name):
    
    #Create new directory
    os.system("mkdir " + writing_dir)
    os.chdir(writing_dir + "/" + dir_name)

    #Iterate over every gene, for each write a file with each new sequence for each organism. Organisms that have multiple sequences for the same gene 
    # will have an  additional cypher (1,2,3...etc.) to differentiate them.
    
    for gene in df.columns:

        with open('New_alignment_sequences_'+ gene, 'w') as f:
            seq_str = ""
            for name in df.index:
                if(df.loc[name, gene] != None):

                    line_list = ["> " + name + " "+ str(i)+ "\n" + df.loc[name, gene][i] for i in range(len(df.loc[name, gene]))]

                    line =("\n".join(line_list))
                    f.write(line + "\n")


    
    

In [None]:
#Edit the Dataframes

#We change to row names to the scientific names in order to use the .loc method, but do not get rid of the "Scientific name" 
#column because we will revert to a numbered index when we have multiple sequences for some gene-organism match
#Replace empty values with "None"

Product_dict = find_product_name("GenBank_amph.csv", 20, top = 2, bad_IDs = False)


In [None]:

new_seq_df = search_new_sequences("Results/Phy_Genes_1.csv", Product_dict, 20, add = False)


Name:Acanthixalus_spinosus, Gene:CXCR4
[]
Name:Acanthixalus_spinosus, Gene:H3A
[]
Name:Acanthixalus_spinosus, Gene:NCX1
[]
Name:Acanthixalus_spinosus, Gene:ND1
['AF215427.1', 'AJ437002.1', 'AF465438.1']
OrderedDict([('organism', ['Acanthixalus spinosus']), ('organelle', ['mitochondrion']), ('mol_type', ['genomic DNA']), ('isolate', ['16SAcanthixalus_spinosus']), ('db_xref', ['taxon:143455'])])
OrderedDict([('product', ['16S ribosomal RNA'])])
OrderedDict([('organism', ['Acanthixalus spinosus']), ('organelle', ['mitochondrion']), ('mol_type', ['genomic DNA']), ('db_xref', ['taxon:143455'])])
OrderedDict([('gene', ['16S rRNA'])])
OrderedDict([('gene', ['16S rRNA']), ('product', ['16S ribosomal RNA'])])
OrderedDict([('organism', ['Acanthixalus spinosus']), ('organelle', ['mitochondrion']), ('mol_type', ['genomic DNA']), ('specimen_voucher', ['ZFMK 72000']), ('db_xref', ['taxon:143455']), ('sex', ['female']), ('country', ['Cameroon: Mt. Kupe'])])
OrderedDict([('product', ['16S ribosomal RN

In [None]:
fasta_from_pd_df(new_seq_df, "Results", "Trial_py")