In [None]:
#Setting up the code, importing all necessary packages and data

import pandas as pd
import numpy as np
from Bio import Entrez
from Bio import SeqIO
import os
import time
Entrez.email = "spatuzzi@stud.uni-heidelberg.de"

In [None]:
os.getcwd()

In [None]:
def find_product_name(ID, limit = 40, top = 1, bad_IDs = True):
    
    ID_data = pd.read_csv(ID, index_col = 0, encoding='cp1252')
    
    import collections
    
    product_dict = {}
    bad_IDs_dict = {}
    
    for gene in ID_data.columns:
        id_list = ID_data.loc[:,gene].dropna()[0:limit]
        product_names = []
        bad_ids = []

        for ID in id_list:

            try:
                handle = Entrez.efetch(db="nucleotide", id= ID, rettype="gb", retmode="text")
                gb_record = SeqIO.read(handle, "gb")
                features = gb_record.features

                for feature in features:
                    if "product" in feature.qualifiers :
                        product_names.append(feature.qualifiers["product"][0])
                        print("Gene: "+gene+"; ID: " +ID+"; Product: "+ feature.qualifiers["product"][0] )
            except:
                bad_ids.append(ID)
                pass 


        counter = collections.Counter(product_names)

        max_values = sorted(counter.values(), reverse = True)  # maximum value
        max_keys = [k for k, v in counter.items() if v in max_values[0:top]] # getting all keys containing the `maximum`

        product_dict[gene] = max_keys 
        bad_IDs_dict[gene] = bad_ids
    
    if bad_IDs: 
        product_dict["badIDs"] = bad_IDs_dict
        return(product_dict)
    else:
        return(product_dict)

            


In [60]:
# Code for iteration

def search_new_sequences(phy):
    
    phy_dat = phy
    
    #Iterate over ever species
    for name in phy_dat.index:

        #Iterate over every Gene
        for gene in phy_dat.columns[1:19]:

            #Check if there is already a sequence in this position by removing all "gaps"
            #If it's only gaps, continue

            if(phy_dat.loc[name, gene] == None):

                #To keep better track in the console, announce every time gene and organism
                print("Name:" + name + ", Gene:" + gene )

                #Concatenate the term for the Entrez function and prepare empty list for the results

                term = name + "[Orgn] AND " + Product_dict[gene][0] + "[Prd]"
                ID_list = []

                #This iteration can easily be interrupted by a temporary connection issue at this step
                #The following chunk of code ensures that the code is run multiple times (up to 100 times) to attempt to download entrez IDs 

                max_tries = 100
                for i in range(max_tries):
                    try:
                        time.sleep(0.1) 

                        #Find IDs that match the "product" string

                        handle = Entrez.esearch(db="nucleotide", term = term, idtype="acc")
                        record = Entrez.read(handle)
                        ID_list = record["IdList"]
                        print(ID_list)
                        break
                        
                    except Exception:
                        print("retry")
                        continue

                if(len(ID_list) > 0):

                    #Take sequence for each ID with Entrez.efetch function
                    #Edit the downloaded string object to only contain the coding sequence

                    seq_list = []
                    for ID in ID_list: 

                        handle = Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text")
                        gb_record = SeqIO.read(handle, "gb")
                        features = gb_record.features
                        for feature in features:
                            print(feature.qualifiers)
                            if "product" in feature.qualifiers :
                                if feature.type == 'CDS' and feature.qualifiers["product"] == [Product_dict[gene][0]]:
                                    sequence=feature.extract(gb_record)
                        seq_list.append(str(sequence.seq))

                    #"Empty" result lists still contain a warning string, in which case they must not be appended to the result

                    if(len(seq_list) > 0 and seq_list != ['EDIDPARAMETERISEMPTY.']):
                        new_seq_df.loc[name, gene] = seq_list
                        print(" Replaced:" + gene + " for " + name)
    return(new_seq_df)

In [None]:
# Write files for MAFFT

def fasta_from_pd_df(df, writing_dir):
    
    #Create new directory
    os.system("mkdir " + writing_dir)
    os.chdir(WorkingDirectory + "/" + writing_dir)

    #Iterate over every gene, for each write a file with each new sequence for each organism. Organisms that have multiple sequences for the same gene 
    # will have an  additional cypher (1,2,3...etc.) to differentiate them.
    
    for gene in df.columns:

        with open('New_alignment_sequences_'+ gene, 'w') as f:
            seq_str = ""
            for name in df.index:
                if(df.loc[name, gene] != None):

                    line_list = ["> " + name + " "+ str(i)+ "\n" + df.loc[name, gene][i] for i in range(len(df.loc[name, gene]))]

                    line =("\n".join(line_list))
                    f.write(line + "\n")


    
    

In [None]:
#Edit the Dataframes

#We change to row names to the scientific names in order to use the .loc method, but do not get rid of the "Scientific name" 
#column because we will revert to a numbered index when we have multiple sequences for some gene-organism match
#Replace empty values with "None"

Product_dict = find_product_name("GenBank_matrix_sqm.csv", 20, top = 2, bad_IDs = False)
Phy_genes.columns = ['Scientific name'] + ([*Product_dict])
Phy_genes.index = Phy_genes.iloc[:,0].values

for name in Phy_genes.index:
    for gene in Phy_genes.columns:
        if(Phy_genes.loc[name, gene].count("-") == len(Phy_genes.loc[name, gene])):
                Phy_genes.loc[name,gene] = None


In [None]:
#Create new Dataframes to fill out with new results
#Phy_genes_update: A copy of Phy_genes with the addition of new data
#new_seq_df: Datframe containing ONLY the new data

Phy_genes_update = Phy_genes.copy()
new_seq_df = pd.DataFrame( np.empty((len(Phy_genes.index), len(Phy_genes.columns)),dtype=pd.Timestamp), index = Phy_genes.index, columns = Phy_genes.columns)


In [None]:
new_seq_df = search_new_sequences(Phy_genes)

Name:Ablepharus_budaki, Gene:X12S
[]
Name:Ablepharus_budaki, Gene:AMEL
[]
Name:Ablepharus_budaki, Gene:BDNF
[]
Name:Ablepharus_budaki, Gene:BMP2
[]
Name:Ablepharus_budaki, Gene:CMOS
['MN418807.1']
OrderedDict([('organism', ['Ablepharus budaki']), ('mol_type', ['genomic DNA']), ('isolate', ['C4']), ('db_xref', ['taxon:283347'])])
OrderedDict([('gene', ['C-mos'])])
OrderedDict([('gene', ['C-mos']), ('product', ['oocyte maturation factor Mos'])])
OrderedDict([('gene', ['C-mos']), ('codon_start', ['2']), ('product', ['oocyte maturation factor Mos']), ('protein_id', ['QIH46181.1']), ('translation', ['AVKQVKKCSKNRLASRQSFWAELNVARLSHNNVVRIIAASACSPTSQNSLGTIIMEYVGNSTLHHVIYGTESMLTKRKDNGLGCGFEPLSITQSLSYSCDIAAGLVFLHSQLTVHLDLKPANIFITEQNVCKIGDFGSS'])])
 Replaced:CMOS for Ablepharus_budaki
Name:Ablepharus_budaki, Gene:COI
[]
Name:Ablepharus_budaki, Gene:ND1
[]
Name:Ablepharus_budaki, Gene:ND2
[]
Name:Ablepharus_budaki, Gene:ND4
[]
Name:Ablepharus_budaki, Gene:NT3
[]
Name:Ablepharus_budaki, Gene:PDC
[]

In [None]:
fasta_from_pd_df(new_seq_df, "Trial_py")