Canonical transcript fetching, and p. (p-dot / HGVSp notation) matching and markup

In [None]:
# Fetch canonical transcripts from the Ensembl REST API by gene symbol
# Requires a text file of gene symbols (one per line) and an output file path

#!/usr/bin/env python

import sys
import json
import time
import pandas as pd

try:
    from urllib.parse import urlencode
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError
except ImportError:
    from urlparse import urlparse
    from urllib import urlencode
    from urllib2 import urlopen, Request, HTTPError

class EnsemblRestClient(object):
    """Small Ensembl REST client with client-side rate limiting.

    Successful requests are counted; once ``reqs_per_sec`` of them have been
    made, the next call pauses so that the batch spans at least one second.
    HTTP 429 responses are retried after the server-supplied ``Retry-After``
    delay, at most ``max_retries`` times per call.
    """

    def __init__(self, server='http://rest.ensembl.org', reqs_per_sec=15, max_retries=5):
        """Store the base URL, the request budget and the 429 retry limit."""
        self.server = server
        self.reqs_per_sec = reqs_per_sec
        self.max_retries = max_retries
        self.req_count = 0
        self.last_req = 0

    @staticmethod
    def _retry_delay(headers):
        """Return the ``Retry-After`` delay in seconds, or None if unusable.

        The value is parsed as a float because the header may carry a
        fractional number of seconds (e.g. "40.0"), which ``int()`` rejects.
        """
        value = headers.get('Retry-After') if headers is not None else None
        if value is None:
            return None
        try:
            delay = float(value)
        except (TypeError, ValueError):
            return None
        # Reject NaN, negative and infinite values: time.sleep() cannot use them.
        if not (0 <= delay < float('inf')):
            return None
        return delay

    def _throttle(self):
        """Sleep if the per-second request budget has been used up."""
        if self.req_count >= self.reqs_per_sec:
            delta = time.time() - self.last_req
            if delta < 1:
                time.sleep(1 - delta)
            self.last_req = time.time()
            self.req_count = 0

    def perform_rest_action(self, endpoint, hdrs=None, params=None):
        """Perform a GET request with automatic rate limiting.

        Args:
            endpoint: Path appended to ``self.server`` (e.g. '/lookup/id/X').
            hdrs: Optional request headers; the caller's dict is not modified.
            params: Optional mapping encoded into the query string.

        Returns:
            The decoded JSON body, or None when the body is empty or the
            request failed (failures are reported on stderr).
        """
        # Work on a copy so the caller's dict never gains a Content-Type key.
        headers = dict(hdrs) if hdrs else {}
        headers.setdefault('Content-Type', 'application/json')

        # Build the query string exactly once; retries reuse this URL instead
        # of appending the parameters to the endpoint a second time.
        target = endpoint
        if params:
            target += '?' + urlencode(params)
        url = self.server + target

        retries_left = self.max_retries
        while True:
            self._throttle()
            try:
                with urlopen(Request(url, headers=headers)) as response:
                    content = response.read()
            except HTTPError as e:
                retry = self._retry_delay(e.headers) if e.code == 429 else None
                if retry is None or retries_left <= 0:
                    # Non-429 error, unusable Retry-After, or retries exhausted.
                    sys.stderr.write(f"Request failed for {target}: {e.code} {e.reason}\n")
                    return None
                retries_left -= 1
                print(f"Rate limited. Retrying after {retry} seconds...")
                time.sleep(retry)
                continue

            data = json.loads(content) if content else None
            self.req_count += 1  # Only successful requests count towards the budget
            return data

    def get_canonical_transcript(self, species, symbol):
        """Fetch the canonical transcript for a gene symbol.

        Returns the "canonical_transcript" field of the lookup response,
        "Not available" when the response lacks that field, or
        "No data found" when the lookup returned nothing.
        """
        gene_info = self.perform_rest_action(
            endpoint=f'/lookup/symbol/{species}/{symbol}',
            params={}
        )
        if gene_info:
            return gene_info.get("canonical_transcript", "Not available")
        return "No data found"

def fetch_canonical_transcripts(gene_list_file, output_file, species="homo_sapiens"):
    """Look up the canonical transcript of every listed gene and save a table.

    Reads non-blank lines from ``gene_list_file`` as gene symbols, queries an
    ``EnsemblRestClient`` for each one, and writes a two-column Excel sheet
    ("Gene Symbol", "Canonical Transcript") to ``output_file``. Exits with
    status 1 if the gene list file does not exist.
    """
    try:
        with open(gene_list_file, 'r') as handle:
            stripped = (raw.strip() for raw in handle)
            gene_symbols = [symbol for symbol in stripped if symbol]
    except FileNotFoundError:
        print(f"Error: File {gene_list_file} not found.")
        sys.exit(1)

    client = EnsemblRestClient()
    rows = []

    for gene in gene_symbols:
        print(f"Fetching canonical transcript for {gene}...")
        transcript = client.get_canonical_transcript(species, gene)
        rows.append({"Gene Symbol": gene, "Canonical Transcript": transcript})

    # Write the collected lookups out as a single Excel sheet
    pd.DataFrame(rows).to_excel(output_file, index=False)
    print(f"\nResults saved to {output_file}")

if __name__ == "__main__":
    # Script entry point: input gene list and output workbook.
    # Update the input path with the actual file path before running.
    gene_list_file = 'genes_abs_ind.txt'
    output_file = 'abs_ind_gene_canonical_transcripts.xlsx'
    fetch_canonical_transcripts(gene_list_file=gene_list_file, output_file=output_file)


Fetching canonical transcript for CFTR...
Fetching canonical transcript for CTNS...
Fetching canonical transcript for PEX12...
Fetching canonical transcript for BMPR2...
Fetching canonical transcript for FYCO1...
Fetching canonical transcript for ALDH7A1...
Fetching canonical transcript for NRAS...
Fetching canonical transcript for VWF...
Fetching canonical transcript for FBN1...
Fetching canonical transcript for MAP3K1...
Fetching canonical transcript for FARS2...
Fetching canonical transcript for SOD1...
Fetching canonical transcript for SDCCAG8...
Fetching canonical transcript for HBB...
Fetching canonical transcript for TP53...
Fetching canonical transcript for HMBS...
Fetching canonical transcript for GJB2...
Fetching canonical transcript for ARHGAP11A...
Fetching canonical transcript for POLK...
Fetching canonical transcript for SDHC...
Fetching canonical transcript for BCHE...
Fetching canonical transcript for GDF6...
Fetching canonical transcript for PKLR...
Fetching canonical 

Request failed for /lookup/symbol/homo_sapiens/CNPY3-GNMT: 400 Bad Request


Fetching canonical transcript for CRYGA...
Fetching canonical transcript for ROS1...
Fetching canonical transcript for ANKS6...
Fetching canonical transcript for LDLR...
Fetching canonical transcript for PIGH...
Fetching canonical transcript for STRA6...
Fetching canonical transcript for ABCB6...
Fetching canonical transcript for LOC102724788...


Request failed for /lookup/symbol/homo_sapiens/LOC102724788: 400 Bad Request


Fetching canonical transcript for PEPD...
Fetching canonical transcript for RYR1...
Fetching canonical transcript for UGT1A1...
Fetching canonical transcript for AGL...
Fetching canonical transcript for LBR...
Fetching canonical transcript for CD3G...
Fetching canonical transcript for KMT2D...
Fetching canonical transcript for CEP290...
Fetching canonical transcript for BRCA2...
Fetching canonical transcript for CREBBP...
Fetching canonical transcript for PAFAH1B1...
Fetching canonical transcript for KMT2B...
Fetching canonical transcript for BAX...
Fetching canonical transcript for MSH6...
Fetching canonical transcript for SLC4A11...
Fetching canonical transcript for LZTR1...
Fetching canonical transcript for DEPDC5...
Fetching canonical transcript for SDHA...
Fetching canonical transcript for APC...
Fetching canonical transcript for HSD17B3...
Fetching canonical transcript for PCDH19...
Fetching canonical transcript for FUT2...
Fetching canonical transcript for NCF4...
Fetching canon

Request failed for /lookup/symbol/homo_sapiens/TAZ: 400 Bad Request


Fetching canonical transcript for POLR3A...
Fetching canonical transcript for OTOF...
Fetching canonical transcript for GCH1...
Fetching canonical transcript for TSHR...
Fetching canonical transcript for RECQL4...
Fetching canonical transcript for TENM4...
Fetching canonical transcript for GNPTG...
Fetching canonical transcript for SYNGAP1...
Fetching canonical transcript for F11...
Fetching canonical transcript for TMC1...
Fetching canonical transcript for PALB2...
Fetching canonical transcript for AGRN...
Fetching canonical transcript for CRB1...
Fetching canonical transcript for ROM1...
Fetching canonical transcript for CPOX...
Fetching canonical transcript for DDB2...
Fetching canonical transcript for SMPD1...
Fetching canonical transcript for AMT...
Fetching canonical transcript for RNF43...
Fetching canonical transcript for FLCN...
Fetching canonical transcript for SCN5A...
Fetching canonical transcript for CNTN2...
Fetching canonical transcript for CFHR5...
Fetching canonical tr

Request failed for /lookup/symbol/homo_sapiens/H3F3A: 400 Bad Request


Fetching canonical transcript for PROC...
Fetching canonical transcript for TGM1...
Fetching canonical transcript for SBDS...
Fetching canonical transcript for ASPM...
Fetching canonical transcript for IDH1...
Fetching canonical transcript for RRAS2...
Fetching canonical transcript for ROBO4...
Fetching canonical transcript for COL9A1...
Fetching canonical transcript for PITX2...
Fetching canonical transcript for ALG6...
Fetching canonical transcript for CSF1R...
Fetching canonical transcript for ERF...
Fetching canonical transcript for CDKL5...
Fetching canonical transcript for ADA2...
Fetching canonical transcript for NIPBL...
Fetching canonical transcript for SOHLH1...
Fetching canonical transcript for TSEN2...
Fetching canonical transcript for CD19...
Fetching canonical transcript for TJP2...
Fetching canonical transcript for SLC36A2...
Fetching canonical transcript for RPGRIP1L...
Fetching canonical transcript for RAB3GAP2...
Fetching canonical transcript for RASA1...
Fetching can

Request failed for /lookup/symbol/homo_sapiens/LRRC6: 400 Bad Request


Fetching canonical transcript for HBD...
Fetching canonical transcript for EGR2...
Fetching canonical transcript for GFM1...
Fetching canonical transcript for ADGRG1...
Fetching canonical transcript for GNPTAB...
Fetching canonical transcript for SCO1...
Fetching canonical transcript for COL1A1...
Fetching canonical transcript for FGFR2...
Fetching canonical transcript for KAT6B...
Fetching canonical transcript for CC2D2A...
Fetching canonical transcript for AIFM1...
Fetching canonical transcript for MAPK7...
Fetching canonical transcript for CYP1B1...
Fetching canonical transcript for MYOM1...
Fetching canonical transcript for HNF1A...
Fetching canonical transcript for ARMC9...
Fetching canonical transcript for SPART...
Fetching canonical transcript for KATNB1...
Fetching canonical transcript for COG4...
Fetching canonical transcript for ADA...
Fetching canonical transcript for GHRHR...
Fetching canonical transcript for RAX2...
Fetching canonical transcript for SLC25A13...
Fetching ca

Request failed for /lookup/symbol/homo_sapiens/ADPRHL2: 400 Bad Request


Fetching canonical transcript for PYGM...
Fetching canonical transcript for DNAI1...
Fetching canonical transcript for ALK...
Fetching canonical transcript for SPTBN4...
Fetching canonical transcript for SRD5A3...
Fetching canonical transcript for RSPO4...
Fetching canonical transcript for SPATA5...


Request failed for /lookup/symbol/homo_sapiens/SPATA5: 400 Bad Request


Fetching canonical transcript for ATP6V0A4...
Fetching canonical transcript for DYNC2H1...
Fetching canonical transcript for NTF4...
Fetching canonical transcript for AQP1...
Fetching canonical transcript for KAT6A...
Fetching canonical transcript for BBS10...
Fetching canonical transcript for KANSL1...
Fetching canonical transcript for SLC12A1...
Fetching canonical transcript for NECTIN3...
Fetching canonical transcript for TPR...
Fetching canonical transcript for ALDH6A1...
Fetching canonical transcript for AARS2...
Fetching canonical transcript for BLM...
Fetching canonical transcript for FCSK...
Fetching canonical transcript for NPHP3...
Fetching canonical transcript for ASB10...
Fetching canonical transcript for PCCB...
Fetching canonical transcript for CUL7...
Fetching canonical transcript for CARS2...
Fetching canonical transcript for ACO2...
Fetching canonical transcript for IL17RD...
Fetching canonical transcript for MRPS22...
Fetching canonical transcript for FSHR...
Fetching

Request failed for /lookup/symbol/homo_sapiens/DARS: 400 Bad Request


Fetching canonical transcript for CANT1...
Fetching canonical transcript for CYP24A1...
Fetching canonical transcript for EXT1...
Fetching canonical transcript for CYP7B1...
Fetching canonical transcript for MCCC2...
Fetching canonical transcript for SCN4A...
Fetching canonical transcript for SBF1...
Fetching canonical transcript for TUBB2A...
Fetching canonical transcript for ADAMTS13...
Fetching canonical transcript for MTSS2...
Fetching canonical transcript for RAD51D...
Fetching canonical transcript for ALOXE3...
Fetching canonical transcript for MPO...
Fetching canonical transcript for KCNJ13...
Fetching canonical transcript for MPV17...
Fetching canonical transcript for ACAD8...
Fetching canonical transcript for AHSG...
Fetching canonical transcript for EDAR...
Fetching canonical transcript for SLC3A1...
Fetching canonical transcript for COL4A4...
Fetching canonical transcript for SLC34A1...
Fetching canonical transcript for CA12...
Fetching canonical transcript for IMPDH1...
Fet

Request failed for /lookup/symbol/homo_sapiens/GBA: 400 Bad Request


Fetching canonical transcript for PCCA...
Fetching canonical transcript for RARS2...
Fetching canonical transcript for PHKG2...
Fetching canonical transcript for NEK1...
Fetching canonical transcript for LPAR6...
Fetching canonical transcript for RORA...
Fetching canonical transcript for COL18A1...
Fetching canonical transcript for IQCE...
Fetching canonical transcript for UNC13D...
Fetching canonical transcript for MTO1...
Fetching canonical transcript for PEX10...
Fetching canonical transcript for NOBOX...
Fetching canonical transcript for SLC26A8...
Fetching canonical transcript for ATP6V1B1...
Fetching canonical transcript for AP4B1...
Fetching canonical transcript for KCTD7...
Fetching canonical transcript for SLC34A3...
Fetching canonical transcript for NBAS...
Fetching canonical transcript for RP9...
Fetching canonical transcript for MMP9...
Fetching canonical transcript for RHD...
Fetching canonical transcript for TRDN...
Fetching canonical transcript for TH...
Fetching canonic

Request failed for /lookup/symbol/homo_sapiens/TTC37: 400 Bad Request


Fetching canonical transcript for CNGB1...
Fetching canonical transcript for LAMA3...
Fetching canonical transcript for MAT1A...
Fetching canonical transcript for MRI1...
Fetching canonical transcript for TGFBI...
Fetching canonical transcript for MSRB3...
Fetching canonical transcript for APOA1...
Fetching canonical transcript for KCNQ3...
Fetching canonical transcript for MAPK1...
Fetching canonical transcript for P2RY12...
Fetching canonical transcript for TREX1...
Fetching canonical transcript for PIGW...
Fetching canonical transcript for STXBP2...
Fetching canonical transcript for PHYKPL...
Fetching canonical transcript for MTTP...
Fetching canonical transcript for RNASEH2C...
Fetching canonical transcript for RTEL1...
Fetching canonical transcript for CPT1A...
Fetching canonical transcript for PIK3R2...
Fetching canonical transcript for CASQ2...
Fetching canonical transcript for PPP1CB...
Fetching canonical transcript for HGD...
Fetching canonical transcript for ALDH18A1...
Fetch

Request failed for /lookup/symbol/homo_sapiens/IARS: 400 Bad Request


Fetching canonical transcript for CNNM4...
Fetching canonical transcript for CACNB2...
Fetching canonical transcript for PUF60...
Fetching canonical transcript for EEF1A2...
Fetching canonical transcript for SDR9C7...
Fetching canonical transcript for LTBP2...
Fetching canonical transcript for RDH5...
Fetching canonical transcript for TBCE...
Fetching canonical transcript for PCDH15...
Fetching canonical transcript for FMO3...
Fetching canonical transcript for MYOC...
Fetching canonical transcript for SPARC...
Fetching canonical transcript for ELP1...
Fetching canonical transcript for SLC7A9...
Fetching canonical transcript for ABL1...
Fetching canonical transcript for RET...
Fetching canonical transcript for DOCK6...
Fetching canonical transcript for AR...
Fetching canonical transcript for TGM6...
Fetching canonical transcript for HEXB...
Fetching canonical transcript for STIL...
Fetching canonical transcript for NDUFS1...
Fetching canonical transcript for PYROXD1...
Fetching canonica

Request failed for /lookup/symbol/homo_sapiens/WDR34: 400 Bad Request


Fetching canonical transcript for PHKA2...
Fetching canonical transcript for LRBA...
Fetching canonical transcript for DNAJC21...
Fetching canonical transcript for PKP2...
Fetching canonical transcript for ZDHHC9...
Fetching canonical transcript for LRP6...
Fetching canonical transcript for TSC2...
Fetching canonical transcript for F5...
Fetching canonical transcript for TANGO2...
Fetching canonical transcript for TAF4B...
Fetching canonical transcript for P3H1...
Fetching canonical transcript for MEIS2...
Fetching canonical transcript for ACSF3...
Fetching canonical transcript for B9D1...
Fetching canonical transcript for FLNB...
Fetching canonical transcript for PRODH...
Fetching canonical transcript for CDKN1C...
Fetching canonical transcript for SERPINC1...
Fetching canonical transcript for SDHAF2...
Fetching canonical transcript for RNF213...
Fetching canonical transcript for FANCF...
Fetching canonical transcript for CNTNAP1...
Fetching canonical transcript for PYGL...
Fetching c

Request failed for /lookup/symbol/homo_sapiens/U2AF1L5: 400 Bad Request


Fetching canonical transcript for SAMHD1...
Fetching canonical transcript for CCN6...
Fetching canonical transcript for CHD8...
Fetching canonical transcript for IL7R...
Fetching canonical transcript for GALNT14...
Fetching canonical transcript for CASP8...
Fetching canonical transcript for TGFBR1...
Fetching canonical transcript for LRRK2...
Fetching canonical transcript for PAX2...
Fetching canonical transcript for GATA6...
Fetching canonical transcript for TP63...
Fetching canonical transcript for CDK4...
Fetching canonical transcript for SMARCA4...
Fetching canonical transcript for ERCC3...
Fetching canonical transcript for LIFR...
Fetching canonical transcript for VRK1...
Fetching canonical transcript for NDE1...
Fetching canonical transcript for SOX9...
Fetching canonical transcript for KDM6A...
Fetching canonical transcript for FH...
Fetching canonical transcript for PRDM16...
Fetching canonical transcript for GNMT...
Fetching canonical transcript for DOCK8...
Fetching canonical

In [6]:
# Fetch canonical transcripts from the Ensembl REST API by ENSG ID
# Requires a text file of ENSG IDs (one per line) and an output file path

#!/usr/bin/env python

import sys
import json
import time
import pandas as pd
from tqdm import tqdm

try:
    from urllib.parse import urlencode
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError
except ImportError:
    from urlparse import urlparse
    from urllib import urlencode
    from urllib2 import urlopen, Request, HTTPError

class EnsemblRestClient(object):
    """Small Ensembl REST client with client-side rate limiting.

    Successful requests are counted; once ``reqs_per_sec`` of them have been
    made, the next call pauses so that the batch spans at least one second.
    HTTP 429 responses are retried after the server-supplied ``Retry-After``
    delay, at most ``max_retries`` times per call.
    """

    def __init__(self, server='http://rest.ensembl.org', reqs_per_sec=15, max_retries=5):
        """Store the base URL, the request budget and the 429 retry limit."""
        self.server = server
        self.reqs_per_sec = reqs_per_sec
        self.max_retries = max_retries
        self.req_count = 0
        self.last_req = 0

    @staticmethod
    def _retry_delay(headers):
        """Return the ``Retry-After`` delay in seconds, or None if unusable.

        The value is parsed as a float because the header may carry a
        fractional number of seconds (e.g. "40.0"), which ``int()`` rejects.
        """
        value = headers.get('Retry-After') if headers is not None else None
        if value is None:
            return None
        try:
            delay = float(value)
        except (TypeError, ValueError):
            return None
        # Reject NaN, negative and infinite values: time.sleep() cannot use them.
        if not (0 <= delay < float('inf')):
            return None
        return delay

    def _throttle(self):
        """Sleep if the per-second request budget has been used up."""
        if self.req_count >= self.reqs_per_sec:
            delta = time.time() - self.last_req
            if delta < 1:
                time.sleep(1 - delta)
            self.last_req = time.time()
            self.req_count = 0

    def perform_rest_action(self, endpoint, hdrs=None, params=None):
        """Perform a GET request with automatic rate limiting.

        Args:
            endpoint: Path appended to ``self.server`` (e.g. '/lookup/id/X').
            hdrs: Optional request headers; the caller's dict is not modified.
            params: Optional mapping encoded into the query string.

        Returns:
            The decoded JSON body, or None when the body is empty or the
            request failed (failures are reported on stderr).
        """
        # Work on a copy so the caller's dict never gains a Content-Type key.
        headers = dict(hdrs) if hdrs else {}
        headers.setdefault('Content-Type', 'application/json')

        # Build the query string exactly once; retries reuse this URL instead
        # of appending the parameters to the endpoint a second time.
        target = endpoint
        if params:
            target += '?' + urlencode(params)
        url = self.server + target

        retries_left = self.max_retries
        while True:
            self._throttle()
            try:
                with urlopen(Request(url, headers=headers)) as response:
                    content = response.read()
            except HTTPError as e:
                retry = self._retry_delay(e.headers) if e.code == 429 else None
                if retry is None or retries_left <= 0:
                    # Non-429 error, unusable Retry-After, or retries exhausted.
                    sys.stderr.write(f"Request failed for {target}: {e.code} {e.reason}\n")
                    return None
                retries_left -= 1
                print(f"Rate limited. Retrying after {retry} seconds...")
                time.sleep(retry)
                continue

            data = json.loads(content) if content else None
            self.req_count += 1  # Only successful requests count towards the budget
            return data

    def get_canonical_transcript(self, ensg_id):
        """Fetch the canonical transcript for an ENSG ID.

        Returns the "canonical_transcript" field of the lookup response,
        "Not available" when the response lacks that field, or
        "No data found" when the lookup returned nothing.
        """
        gene_info = self.perform_rest_action(
            endpoint=f'/lookup/id/{ensg_id}',
            params={}
        )
        if gene_info:
            return gene_info.get("canonical_transcript", "Not available")
        return "No data found"

def fetch_canonical_transcripts(ensg_id_list_file, output_file):
    """Look up the canonical transcript of every listed ENSG ID and save a table.

    Reads non-blank lines from ``ensg_id_list_file`` as ENSG IDs, queries an
    ``EnsemblRestClient`` for each one behind a progress bar, and writes a
    two-column Excel sheet ("ENSG ID", "Canonical Transcript") to
    ``output_file``. Exits with status 1 if the ID list file does not exist.
    """
    try:
        with open(ensg_id_list_file, 'r') as handle:
            stripped = (raw.strip() for raw in handle)
            ensg_ids = [identifier for identifier in stripped if identifier]
    except FileNotFoundError:
        print(f"Error: File {ensg_id_list_file} not found.")
        sys.exit(1)

    client = EnsemblRestClient()
    rows = []

    progress = tqdm(ensg_ids, desc="Fetching ENSG ID data", unit="ensg_id")
    for ensg_id in progress:
        transcript = client.get_canonical_transcript(ensg_id)
        rows.append({"ENSG ID": ensg_id, "Canonical Transcript": transcript})

    # Write the collected lookups out as a single Excel sheet
    pd.DataFrame(rows).to_excel(output_file, index=False)
    print(f"\nResults saved to {output_file}")

if __name__ == "__main__":
    # Script entry point: input ENSG ID list and output workbook.
    # Update the input path with the actual file path before running.
    ensg_id_list_file = 'ensgs_abs_ind.txt'
    output_file = 'abs_ind_ensg_canonical_transcripts.xlsx'
    fetch_canonical_transcripts(ensg_id_list_file=ensg_id_list_file, output_file=output_file)


Fetching ENSG ID data: 100%|██████████| 11/11 [00:05<00:00,  2.06ensg_id/s]


Results saved to abs_ind_ensg_canonical_transcripts.xlsx





In [11]:
# Add a p. (HGVSp) column by matching each gene's canonical transcript against the AAChange.ensGene column
# Requires the Excel file of fetched canonical transcripts and the annotation Excel file to which p. will be added (it MUST have an AAChange.ensGene column)

import pandas as pd
import re

# Load the annotation sheet and the gene symbol -> canonical transcript table
annot_df = pd.read_excel(r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\absolute_indie_patho\COMBINED DATA (Absolute plus Indiegene) Table DATASETS for Abstract - Copy.xlsx", sheet_name= "Patho only df")
canonical_df = pd.read_excel(r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\code\abs_ind_gene_canonical_transcripts.xlsx")

# Merge the dataframes on 'Ref.Gene' from annot_df and 'Gene Symbol' from canonical_df.
# The lookup table is de-duplicated on 'Gene Symbol' first: a gene that appears
# more than once in it would otherwise duplicate every matching annotation row
# in the left merge.
merged_df = annot_df.merge(
    canonical_df.drop_duplicates(subset='Gene Symbol'),
    how='left',
    left_on='Ref.Gene',
    right_on='Gene Symbol',
)

# Drop the redundant 'Gene Symbol' column after merge
merged_df.drop(columns=['Gene Symbol'], inplace=True)

# Function to extract p. value for the canonical transcript
# A p. token runs up to the next field/entry delimiter (':', ',', ';' or
# whitespace), so ranges, stop codons and frameshift suffixes such as
# 'p.E746_A750del', 'p.R213*' or 'p.L747Sfs*12' are kept whole instead of
# being cut at the first character that is not a letter or digit.
_PDOT_PATTERN = re.compile(r'(?<![A-Za-z0-9])p\.[^:,;\s]+')


def extract_pdot(row):
    """Return the p. notation annotated for the row's canonical transcript.

    Args:
        row: Mapping/Series with a 'Canonical Transcript' value (transcript ID,
            optionally with a '.version' suffix) and an 'AAChange.ensGene'
            value (comma-separated annotation entries).

    Returns:
        The p. token of the first entry that mentions the canonical
        transcript ID and carries one, or None when either value is missing
        or no entry matches.
    """
    canonical = row['Canonical Transcript']
    # Skip if Canonical Transcript is NaN
    if pd.isnull(canonical):
        return None

    aa_changes = row['AAChange.ensGene']
    if pd.isnull(aa_changes):
        return None

    # Compare on the unversioned ID so 'ENST....9' still matches 'ENST...'
    canonical_transcript_id = str(canonical).split('.')[0]
    if not canonical_transcript_id:
        # An empty ID is a substring of every entry and would match anything.
        return None

    for change in str(aa_changes).split(','):
        if canonical_transcript_id in change:
            match = _PDOT_PATTERN.search(change)
            if match:
                return match.group()
    return None

# Apply extract_pdot row by row (axis=1) to create the 'Canonical pdots' column;
# rows for which it returns None get no value in that column
merged_df['Canonical pdots'] = merged_df.apply(extract_pdot, axis=1)

# Save the updated dataframe to a new Excel file (row index is not written)
output_file = r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\absolute_indie_patho\updated_absolute_patho_data.xlsx"
merged_df.to_excel(output_file, index=False)

# Report where the annotated workbook was written
print(f"Updated file saved at: {output_file}")


Updated file saved at: D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\absolute_indie_patho\updated_absolute_patho_data.xlsx
