In [33]:
import os

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

from Bio import Entrez

In [34]:
# Run configuration for the Gene-ID based lookup in the next cell.

# OAuth scope requested when authorising the Google Sheets client.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
# Spreadsheet that receives one appended row per protein record.
SPREADSHEET_ID = "1UZ6ZI3MWDeSO_w1joJPHb8_mgzRuL4OSAgpRmkjFjyo"
# Name of the sheet (tab); combined with "!A:Q" to form the append range.
RANGE_NAME = "Dataset"
# NCBI Gene ID: used as the "<id>[GeneID]" protein search term and written into each row.
GENE_ID = "103574560"
# Gene label written into each appended row; toggle by (un)commenting.
#Gene = "Timeless"
Gene = "Timeout"

In [32]:
# Look up every protein linked to GENE_ID, collect its taxonomy, and append one
# row per protein to the Google Sheet.
#
# Names left in the kernel after the loop (from the last processed record):
# protein_accession, protein_record, taxonomy_record, taxon_id, order,
# common_name, species, protein_id, protein_sequence (header-free sequence).

# Contact address Biopython sends to NCBI with each request (placeholder value).
Entrez.email = "your_email@example.com"

# Authorise against Google once, up front. The token is the same for every row,
# so loading it inside the per-protein loop was loop-invariant work.
credentials = None
if os.path.exists("token.json"):
    credentials = Credentials.from_authorized_user_file("token.json", SCOPES)
if not credentials or not credentials.valid:
    # `refresh_token` is the public property; avoid the private `_refresh_token`.
    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("credits.json", SCOPES)
        credentials = flow.run_local_server(port=0)
    # Persist the new or refreshed token for the next run.
    with open("token.json", "w") as token:
        token.write(credentials.to_json())
sheets = None  # Sheets client, created on first use inside the try block below.

# Step 1: Find the protein records linked to the Gene ID (at most 10).
search_handle = Entrez.esearch(db="protein", term="{}[GeneID]".format(GENE_ID), retmax=10)
search_results = Entrez.read(search_handle)
search_handle.close()

protein_ids = search_results["IdList"]
if not protein_ids:
    print(f"No protein records found for Gene ID {GENE_ID}")

# Every hit is processed (not only the first one); each yields one sheet row.
for protein_accession in protein_ids:
    print(f"Protein Accession: {protein_accession}")

    # Step 2: Fetch the protein sequence as FASTA text.
    fetch_handle = Entrez.efetch(db="protein", id=protein_accession, rettype="fasta", retmode="text")
    fasta_text = fetch_handle.read()
    fetch_handle.close()

    print(fasta_text)

    # Step 3: Fetch the full protein record to extract taxonomic information.
    fetch_handle = Entrez.efetch(db="protein", id=protein_accession, rettype="gb", retmode="xml")
    protein_record = Entrez.read(fetch_handle)
    fetch_handle.close()

    # Reset per record so a value from a previous protein can never leak into
    # this row; stop at the first "taxon:<id>" cross-reference.
    taxon_id = None
    for feature in protein_record[0].get('GBSeq_feature-table', []):
        # Not every feature is guaranteed to carry qualifiers.
        for qualifier in feature.get('GBFeature_quals', []):
            qualifier_value = qualifier.get('GBQualifier_value', '')
            if qualifier['GBQualifier_name'] == 'db_xref' and qualifier_value.startswith('taxon:'):
                taxon_id = qualifier_value.split(':', 1)[1]
                break
        if taxon_id is not None:
            break
    if taxon_id is None:
        print(f"No taxon ID found for protein {protein_accession}; skipping")
        continue
    print(f"Extracted Taxon ID: {taxon_id}")

    # Step 4: Fetch the taxonomic information using the taxon ID.
    fetch_handle = Entrez.efetch(db="taxonomy", id=taxon_id, retmode="xml")
    taxonomy_record = Entrez.read(fetch_handle)
    fetch_handle.close()
    if not taxonomy_record:
        print(f"No taxonomy record returned for taxon {taxon_id}; skipping")
        continue
    taxon = taxonomy_record[0]

    # Some lineages have no rank named "order"; fall back to 'Unknown' (the
    # same default used for the common name) instead of raising IndexError.
    order = next(
        (item['ScientificName'] for item in taxon.get('LineageEx', []) if item.get('Rank') == 'order'),
        'Unknown',
    )
    print(f"Order: {order}")

    common_name = taxon.get('OtherNames', {}).get('GenbankCommonName', taxon.get('CommonName', 'Unknown'))

    # Split the FASTA text into its header line and the sequence lines.
    header, _, sequence_body = fasta_text.lstrip().partition('\n')

    # The organism is the LAST bracketed group of the header; protein names can
    # contain brackets themselves, so search from the right.
    open_pos = header.rfind('[')
    close_pos = header.rfind(']')
    if 0 <= open_pos < close_pos:
        species = header[open_pos + 1:close_pos]
    else:
        # Header without an organism tag: use the taxonomy record instead.
        species = taxon.get('ScientificName', 'Unknown')

    # The versioned accession is the first whitespace-delimited token after '>'.
    header_fields = header.lstrip('>').split(None, 1)
    protein_id = header_fields[0] if header_fields else str(protein_accession)

    # Join the sequence lines into a single string without any whitespace.
    protein_sequence = ''.join(sequence_body.split())

    # Row layout for columns A..Q (17 values); blanks are columns not filled here.
    new_row = [[common_name, species, Gene, order, GENE_ID, protein_id, "", "", taxon_id,
                "", "", "", "", "", "", "", protein_sequence]]

    try:
        if sheets is None:
            sheets = build("sheets", "v4", credentials=credentials).spreadsheets()

        # Append the new row to the sheet.
        response = sheets.values().append(
            spreadsheetId=SPREADSHEET_ID,
            range=f"{RANGE_NAME}!A:Q",
            valueInputOption="RAW",
            body={'values': new_row},
        ).execute()

        print(f"Row appended: {response}")
    except HttpError as error:
        # Best effort: report the failure and carry on with the next protein.
        print(error)


Protein Accession: 665807903
>XP_008552252.1 protein timeless homolog [Microplitis demolitor]
MADYISAELAATCDALGYYDGATYHLDSDALNVIKDLIKYLKRDDDTHTVRRYLGQAKLLETDLIQILIQ
HSNKSELWDVLLRLMINLTSPALMFYNEELPAERTNRNYYLQLVSYLQGYKKALTDDRLWTVVSNRLGKI
LKIDTSERGEENELIIERILTLIRNVLQVPPHDNDKRADNDATVHDEVLFALHASGIVDLLLFIASNSSE
QQFHVQIIEIIALMLREQNASKLAVVGLERTAEEKAREEAKLLAVRQKEITEKMEKMKKYSGSRHSRFGG
TFVVQNMKAIGDNQMICHKPFEKIEALEFSRDKGKMKKPKNRVFVEPSVEERMSALSVRLFLKEFCVEFL
IGAYNPVMRYAKSFIIGDSADKSDGIHYFWALRFFMEFNRHYKFQVKYVSETVSTETFYLVQRQMEQYYE
LLMADKKKPAFWLRRLHEALKAYQELLHTLGAMDKALDKGVRDSSKVIKSNIFYVPEYRETILGQLLSYN
GLKMSRNYLVDLITTVHIFLKMLEHFCAQSRNVMVAKSKAKRRKSTKKKNKPEKEKEQTTAIQKSLDERW
DDAGPELSAVMQDGTIPDVIPFDATLDTPIEDQKSDAMKRIQKLLRKKEFEEAVGLLRASRAVWPENDCF
GKPDIPVEEEFLALREIFFADLGVQEEAEKQNEDVESFLNNEGGEIEDENDEENEEEDNVVEWEETKFDF
KEFIHRFANVKVVKAVTVLLKTFEKNSMELNRYIIKMLHRIAWDCKMPGMIFQASIFRVFQRILDSKHHE
HKELQKFASFIIRQFAEVAQKNRKAYMELLFWKNTREATEMVDGYDAANAENKKISRAVWSEAEEDELRT
LFMEHQTNKYPQDLIDWLLENIISENRTRRGIIKKLKEMYLIVNSKEVRNEVQK

In [112]:
## Short Gene IDs: fetch each isoform directly by its protein ID

In [35]:
# Run configuration for the isoform-based lookup in the next cell.

# OAuth scope requested when authorising the Google Sheets client.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
# Spreadsheet that receives one appended row per protein record.
SPREADSHEET_ID = "1UZ6ZI3MWDeSO_w1joJPHb8_mgzRuL4OSAgpRmkjFjyo"
# Name of the sheet (tab); combined with "!A:Q" to form the append range.
RANGE_NAME = "Dataset"
# Gene ID written into each appended row; the next cell does not search with it.
GENE_ID = "8914"
# Protein IDs to look up; each one is used as a search term in the protein database.
iso_list = ["222136585","1057867276"]
# Gene label written into each appended row; toggle by (un)commenting.
#Gene = "Timeless"
Gene = "Timeout" 

In [36]:
# For every protein ID in iso_list: fetch the sequence and taxonomy from NCBI
# and append one row per matching record to the Google Sheet.
#
# Names left in the kernel after the loops (from the last processed record):
# protein_accession, protein_record, taxonomy_record, taxon_id, order,
# common_name, species, protein_id, protein_sequence (header-free sequence).

# Contact address Biopython sends to NCBI with each request (placeholder value).
Entrez.email = "your_email@example.com"

# Authorise against Google once, up front. The token is the same for every row,
# so loading it inside the per-protein loop was loop-invariant work.
credentials = None
if os.path.exists("token.json"):
    credentials = Credentials.from_authorized_user_file("token.json", SCOPES)
if not credentials or not credentials.valid:
    # `refresh_token` is the public property; avoid the private `_refresh_token`.
    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("credits.json", SCOPES)
        credentials = flow.run_local_server(port=0)
    # Persist the new or refreshed token for the next run.
    with open("token.json", "w") as token:
        token.write(credentials.to_json())
sheets = None  # Sheets client, created on first use inside the try block below.

for isoform in iso_list:
    # Step 1: Search the protein database with the isoform's protein ID.
    # GENE_ID is not used for searching here; it is only written into the row.
    print(isoform)
    search_handle = Entrez.esearch(db="protein", term="{}".format(isoform), retmax=10)
    search_results = Entrez.read(search_handle)
    search_handle.close()

    print(search_results["IdList"])
    if not search_results["IdList"]:
        print(f"No protein records found for {isoform}")

    # Every hit is processed (not only the first one); each yields one sheet row.
    for protein_accession in search_results["IdList"]:
        print(f"Protein Accession: {protein_accession}")

        # Step 2: Fetch the protein sequence as FASTA text.
        fetch_handle = Entrez.efetch(db="protein", id=protein_accession, rettype="fasta", retmode="text")
        fasta_text = fetch_handle.read()
        fetch_handle.close()

        print(fasta_text)

        # Step 3: Fetch the full protein record to extract taxonomic information.
        fetch_handle = Entrez.efetch(db="protein", id=protein_accession, rettype="gb", retmode="xml")
        protein_record = Entrez.read(fetch_handle)
        fetch_handle.close()

        # Reset per record so a value from a previous protein can never leak
        # into this row; stop at the first "taxon:<id>" cross-reference.
        taxon_id = None
        for feature in protein_record[0].get('GBSeq_feature-table', []):
            # Not every feature is guaranteed to carry qualifiers.
            for qualifier in feature.get('GBFeature_quals', []):
                qualifier_value = qualifier.get('GBQualifier_value', '')
                if qualifier['GBQualifier_name'] == 'db_xref' and qualifier_value.startswith('taxon:'):
                    taxon_id = qualifier_value.split(':', 1)[1]
                    break
            if taxon_id is not None:
                break
        if taxon_id is None:
            print(f"No taxon ID found for protein {protein_accession}; skipping")
            continue
        print(f"Extracted Taxon ID: {taxon_id}")

        # Step 4: Fetch the taxonomic information using the taxon ID.
        fetch_handle = Entrez.efetch(db="taxonomy", id=taxon_id, retmode="xml")
        taxonomy_record = Entrez.read(fetch_handle)
        fetch_handle.close()
        if not taxonomy_record:
            print(f"No taxonomy record returned for taxon {taxon_id}; skipping")
            continue
        taxon = taxonomy_record[0]

        # Some lineages have no rank named "order"; fall back to 'Unknown' (the
        # same default used for the common name) instead of raising IndexError.
        order = next(
            (item['ScientificName'] for item in taxon.get('LineageEx', []) if item.get('Rank') == 'order'),
            'Unknown',
        )
        print(f"Order: {order}")

        common_name = taxon.get('OtherNames', {}).get('GenbankCommonName', taxon.get('CommonName', 'Unknown'))

        # Split the FASTA text into its header line and the sequence lines.
        header, _, sequence_body = fasta_text.lstrip().partition('\n')

        # The organism is the LAST bracketed group of the header; protein names
        # can contain brackets themselves, so search from the right.
        open_pos = header.rfind('[')
        close_pos = header.rfind(']')
        if 0 <= open_pos < close_pos:
            species = header[open_pos + 1:close_pos]
        else:
            # Header without an organism tag: use the taxonomy record instead.
            species = taxon.get('ScientificName', 'Unknown')

        # The versioned accession is the first whitespace-delimited token after '>'.
        header_fields = header.lstrip('>').split(None, 1)
        protein_id = header_fields[0] if header_fields else str(protein_accession)

        # Join the sequence lines into a single string without any whitespace.
        protein_sequence = ''.join(sequence_body.split())

        # Row layout for columns A..Q (17 values); blanks are columns not filled here.
        new_row = [[common_name, species, Gene, order, GENE_ID, protein_id, "", "", taxon_id,
                    "", "", "", "", "", "", "", protein_sequence]]

        try:
            if sheets is None:
                sheets = build("sheets", "v4", credentials=credentials).spreadsheets()

            # Append the new row to the sheet.
            response = sheets.values().append(
                spreadsheetId=SPREADSHEET_ID,
                range=f"{RANGE_NAME}!A:Q",
                valueInputOption="RAW",
                body={'values': new_row},
            ).execute()

            print(f"Row appended: {response}")
        except HttpError as error:
            # Best effort: report the failure and carry on with the next record.
            print(error)


222136585
['222136585']
Protein Accession: 222136585
>NP_003911.2 protein timeless homolog isoform 1 [Homo sapiens]
MDLHMMNCELLATCSALGYLEGDTYHKEPDCLESVKDLIRYLRHEDETRDVRQQLGAAQILQSDLLPILT
QHHQDKPLFDAVIRLMVNLTQPALLCFGNLPKEPSFRHHFLQVLTYLQAYKEAFASEKAFGVLSETLYEL
LQLGWEERQEEDNLLIERILLLVRNILHVPADLDQEKKIDDDASAHDQLLWAIHLSGLDDLLLFLASSSA
EEQWSLHVLEIVSLMFRDQNPEQLAGVGQGRLAQERSADFAELEVLRQREMAEKKTRALQRGNRHSRFGG
SYIVQGLKSIGERDLIFHKGLHNLRNYSSDLGKQPKKVPKRRQAARELSIQRRSALNVRLFLRDFCSEFL
ENCYNRLMGSVKDHLLREKAQQHDETYYMWALAFFMAFNRAASFRPGLVSETLSVRTFHFIEQNLTNYYE
MMLTDRKEAASWARRMHLALKAYQELLATVNEMDISPDEAVRESSRIIKNNIFYVMEYRELFLALFRKFD
ERCQPRSFLRDLVETTHLFLKMLERFCRSRGNLVVQNKQKKRRKKKKKVLDQAIVSGNVPSSPEEVEAVW
PALAEQLQCCAQNSELSMDSVVPFDAASEVPVEEQRAEAMVRIQDCLLAGQAPQALTLLRSAREVWPEGD
VFGSQDISPEEEIQLLKQILSAPLPRQQGPEERGAEEEEEEEEEEEEELQVVQVSEKEFNFLDYLKRFAC
STVVRAYVLLLRSYQQNSAHTNHCIVKMLHRLAHDLKMEALLFQLSVFCLFNRLLSDPAAGAYKELVTFA
KYILGKFFALAAVNQKAFVELLFWKNTAVVREMTEGYGSLDDRSSSRRAPTWSPEEEAHLRELYLANKDV
EGQDVVEAILAHLNTVPRTRKQIIHHLVQMGL

In [72]:
# Take the text between the first '[' and the first ']' of `protein_sequence`
# and show it. NOTE(review): assumes `protein_sequence` still holds raw FASTA
# text with its header line (e.g. "... [Homo sapiens]") — confirm before running.
bracket_open = protein_sequence.find('[')
bracket_close = protein_sequence.find(']')
species = protein_sequence[bracket_open + 1:bracket_close]
print(species)

Centruroides sculpturatus


In [73]:
# Take the text between the first '>' and the first space of `protein_sequence`
# and show it. NOTE(review): assumes `protein_sequence` still holds raw FASTA
# text with its header line (">ACCESSION description ...") — confirm before running.
marker_idx = protein_sequence.find('>')
first_space_idx = protein_sequence.find(' ')
protein_id = protein_sequence[marker_idx + 1:first_space_idx]
print(protein_id)

XP_023244207.1


In [74]:
# Drop the first line (the header) and glue the remaining lines into one
# string. This overwrites `protein_sequence`, so running the cell a second
# time discards the sequence as well.
_header_line, *sequence_lines = protein_sequence.split('\n')
protein_sequence = ''.join(sequence_lines)

print(protein_sequence)

MPYKEQPYSETFLKMMEDFNSAISAEIVATCGAIGYNNKGKYVKDPDCSECLKDLIRFLRNDDESHSIRRQLARIGVLKTDLIPLLKYYPKDKTLFDINIRLLINLTNPVMLLYHEELPEEKLTRNYYMEILSYQQECKQSFTDPDVWKELANRLRELLQLDWEHCQEDDRIMIERILILIRNILHIPSDSVSEKRTDDDVSIHDQIVWSFHISGMDDLIIYLAVTDEHKQFCFHVLEIISLLLREQTPQQLASTGQERFTLEKQRDNQELLKLRKQEFLRKQASVKLQTSRHSRFGGTFEVKNMKSVSDNNMLYHHKLQNITSLTYNQKEKKRKPKRNVPMQHVEVKRKSALTVRLCLRDFCKEFLDTAYNLFMKLIKEYLMGNQIQANDETYYFWAIKFFMEFNRLSGSAMSLITETMSIQSFHYLQTQIEYYYEMTINDKKKIVQWSKRMHLALKAYQELLLTLYAMDESSDREIIESAKIIKGNIFYVMEYRELLISLLMNYDEVKFSQNYLKDLIETTHVFMKMLEKFCKRQSHLIVQQKKVKRKKKNKPKKTLNNENKEDYEKLWSDLSENLSTALQEKIEEAESNTSIFDPVSDHTIDQQRINVIIKIKETLQMKNVKEAICLLRAARNIWPENDVFGIPTISPENEFIVLREIYFSDVQNLGDNANQKDVQEFEDNMTEEEQEERIVQSYEKEFDFKGFVSRFANSKIVRAYAMLFKNFNKNSDYTNNCILKMLHRISWDCKMVALLFQASVFQTMQKILDFPEYSSMIKTH


In [75]:
# Append one row to the Google Sheet from values already in the kernel:
# species, Gene, order, GENE_ID, protein_id, taxon_id and protein_sequence.

# Reuse the cached OAuth token when present; refresh it or run the browser flow otherwise.
credentials = None
if os.path.exists("token.json"):
    credentials = Credentials.from_authorized_user_file("token.json", SCOPES)
if not credentials or not credentials.valid:
    # `refresh_token` is the public property; avoid the private `_refresh_token`.
    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file("credits.json", SCOPES)
        credentials = flow.run_local_server(port=0)
    # Persist the new or refreshed token for the next run.
    with open("token.json", "w") as token:
        token.write(credentials.to_json())

try:
    service = build("sheets", "v4", credentials=credentials)
    sheets = service.spreadsheets()

    # Row layout for columns A..Q (17 values), matching the rows written by the
    # loop cells: the Gene label sits in column C. The row previously had only
    # 16 values and omitted Gene, which shifted every field after the species
    # one column to the left. Column A (common name) is left blank here.
    new_row = [["", species, Gene, order, GENE_ID, protein_id, "", "", taxon_id,
                "", "", "", "", "", "", "", protein_sequence]]

    # Append the new row to the sheet.
    request_body = {
        'values': new_row
    }
    response = sheets.values().append(
        spreadsheetId=SPREADSHEET_ID,
        range=f"{RANGE_NAME}!A:Q",
        valueInputOption="RAW",
        body=request_body
    ).execute()

    print(f"Row appended: {response}")
except HttpError as error:
    # Report the API failure without raising.
    print(error)


['common name', 'Species', 'order']
['Bark scorpion', 'Centruroides sculpturatus', ' ']
Row appended: {'spreadsheetId': '1UZ6ZI3MWDeSO_w1joJPHb8_mgzRuL4OSAgpRmkjFjyo', 'tableRange': 'Timeout!A1:R2', 'updates': {'spreadsheetId': '1UZ6ZI3MWDeSO_w1joJPHb8_mgzRuL4OSAgpRmkjFjyo', 'updatedRange': 'Timeout!A3:O3', 'updatedRows': 1, 'updatedColumns': 15, 'updatedCells': 15}}


In [37]:
# Sanity check: length of a hand-pasted sequence string (the output recorded
# below this cell is 1208). The string starts with the same residues as the
# NP_003911.2 record printed earlier in this notebook.
# NOTE(review): where the full string was copied from is not recorded — confirm.
len('MDLHMMNCELLATCSALGYLEGDTYHKEPDCLESVKDLIRYLRHEDETRDVRQQLGAAQILQSDLLPILTQHHQDKPLFDAVIRLMVNLTQPALLCFGNLPKEPSFRHHFLQVLTYLQAYKEAFASEKAFGVLSETLYELLQLGWEERQEEDNLLIERILLLVRNILHVPADLDQEKKIDDDASAHDQLLWAIHLSGLDDLLLFLASSSAEEQWSLHVLEIVSLMFRDQNPEQLAGVGQGRLAQERSADFAELEVLRQREMAEKKTRALQRGNRHSRFGGSYIVQGLKSIGERDLIFHKGLHNLRNYSSDLGKQPKKVPKRRQAARELSIQRRSALNVRLFLRDFCSEFLENCYNRLMGSVKDHLLREKAQQHDETYYMWALAFFMAFNRAASFRPGLVSETLSVRTFHFIEQNLTNYYEMMLTDRKEAASWARRMHLALKAYQELLATVNEMDISPDEAVRESSRIIKNNIFYVMEYRELFLALFRKFDERCQPRSFLRDLVETTHLFLKMLERFCRSRGNLVVQNKQKKRRKKKKKVLDQAIVSGNVPSSPEEVEAVWPALAEQLQCCAQNSELSMDSVVPFDAASEVPVEEQRAEAMVRIQDCLLAGQAPQALTLLRSAREVWPEGDVFGSQDISPEEEIQLLKQILSAPLPRQQGPEERGAEEEEEEEEEEEEELQVVQVSEKEFNFLDYLKRFACSTVVRAYVLLLRSYQQNSAHTNHCIVKMLHRLAHDLKMEALLFQLSVFCLFNRLLSDPAAGAYKELVTFAKYILGKFFALAAVNQKAFVELLFWKNTAVVREMTEGYGSLDDRSSSRRAPTWSPEEEAHLRELYLANKDVEGQDVVEAILAHLNTVPRTRKQIIHHLVQMGLADSVKDFQRKGTHIVLWTGDQELELQRLFEEFRDSDDVLGHIMKNITAKRSRARIVDKLLALGLVAERRELYKKRQKKLASSILPNGAESLKDFCQEDLEEEENLPEEDSEEEEEGGSEAEQVQGSLVLSNENLGQSLHQEGFSIPLLWLQNCLIRAADDREEDGCSQAVPLVPLTEENEEAMENEQFQQLLRKLGVRPPASGQETFWRIPAKLSPTQLRRAAASLSQPEEEQKLQPELQPKVPGEQGSDEEHCKEHRAQALRALLLAHKKKAGLASPEEEDAVGKEPLKAAPKKRQLLDSDEEQEEDEGRNRAPELGAPGIQKKKRYQIEDDEDD')

1208

In [38]:
# Sanity check: length of a second hand-pasted sequence string (the output
# recorded below this cell is 1207). It contains "...DLDQEKIDDD..." with a
# single K where the NP_003911.2 record printed earlier in this notebook shows
# "...DLDQEKKIDDD...".
# NOTE(review): this appears to be the only difference from the string in the
# preceding length check, but that was compared by eye — confirm.
len('MDLHMMNCELLATCSALGYLEGDTYHKEPDCLESVKDLIRYLRHEDETRDVRQQLGAAQILQSDLLPILTQHHQDKPLFDAVIRLMVNLTQPALLCFGNLPKEPSFRHHFLQVLTYLQAYKEAFASEKAFGVLSETLYELLQLGWEERQEEDNLLIERILLLVRNILHVPADLDQEKIDDDASAHDQLLWAIHLSGLDDLLLFLASSSAEEQWSLHVLEIVSLMFRDQNPEQLAGVGQGRLAQERSADFAELEVLRQREMAEKKTRALQRGNRHSRFGGSYIVQGLKSIGERDLIFHKGLHNLRNYSSDLGKQPKKVPKRRQAARELSIQRRSALNVRLFLRDFCSEFLENCYNRLMGSVKDHLLREKAQQHDETYYMWALAFFMAFNRAASFRPGLVSETLSVRTFHFIEQNLTNYYEMMLTDRKEAASWARRMHLALKAYQELLATVNEMDISPDEAVRESSRIIKNNIFYVMEYRELFLALFRKFDERCQPRSFLRDLVETTHLFLKMLERFCRSRGNLVVQNKQKKRRKKKKKVLDQAIVSGNVPSSPEEVEAVWPALAEQLQCCAQNSELSMDSVVPFDAASEVPVEEQRAEAMVRIQDCLLAGQAPQALTLLRSAREVWPEGDVFGSQDISPEEEIQLLKQILSAPLPRQQGPEERGAEEEEEEEEEEEEELQVVQVSEKEFNFLDYLKRFACSTVVRAYVLLLRSYQQNSAHTNHCIVKMLHRLAHDLKMEALLFQLSVFCLFNRLLSDPAAGAYKELVTFAKYILGKFFALAAVNQKAFVELLFWKNTAVVREMTEGYGSLDDRSSSRRAPTWSPEEEAHLRELYLANKDVEGQDVVEAILAHLNTVPRTRKQIIHHLVQMGLADSVKDFQRKGTHIVLWTGDQELELQRLFEEFRDSDDVLGHIMKNITAKRSRARIVDKLLALGLVAERRELYKKRQKKLASSILPNGAESLKDFCQEDLEEEENLPEEDSEEEEEGGSEAEQVQGSLVLSNENLGQSLHQEGFSIPLLWLQNCLIRAADDREEDGCSQAVPLVPLTEENEEAMENEQFQQLLRKLGVRPPASGQETFWRIPAKLSPTQLRRAAASLSQPEEEQKLQPELQPKVPGEQGSDEEHCKEHRAQALRALLLAHKKKAGLASPEEEDAVGKEPLKAAPKKRQLLDSDEEQEEDEGRNRAPELGAPGIQKKKRYQIEDDEDD')

1207