In [1]:
import polars as pl
import numpy as np
import requests
from zipfile import ZipFile
import os 

import psutil

In [2]:


# Report total and currently available RAM.
# A single virtual_memory() snapshot is taken so that both figures describe the
# same instant (two separate calls could observe different states).
BYTES_PER_GB = 1024 ** 3  # divisor used to convert bytes to the "GB" printed below
memory_snapshot = psutil.virtual_memory()
total_memory = memory_snapshot.total / BYTES_PER_GB
available_memory = memory_snapshot.available / BYTES_PER_GB

print(f"Total memory: {total_memory:.2f} GB")
print(f"Available memory: {available_memory:.2f} GB")

Total memory: 1007.09 GB
Available memory: 984.04 GB


In [3]:
# Settings for downloading the compressed expression file anew (a full download
# took 13 m 12 s). The download loop below is commented out, so running this cell
# only defines the three variables.
# NOTE(review): `filename` is a local Windows path while the inputs read later in
# the notebook live under /zhome, and the URL points at the RSEM isoform TPM file
# while `in_file` below names a Kallisto TPM file - confirm both before re-enabling.
# Number of bytes per chunk handed to iter_content() in the disabled loop.
chunk_size = 4096
# Destination of the downloaded .gz file.
filename = "C:/Users/Mette/OneDrive - Danmarks Tekniske Universitet/11. Semester Speciale/Data/Xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_tpm.gz"
# Source of the download.
document_url = "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform_tpm.gz"
# Disabled: streams the response to `filename` chunk by chunk.
#with requests.get(document_url, stream=True) as r:
#        with open(filename, 'wb') as f:
#            for chunk in r.iter_content(chunk_size): 
#                if chunk:
#                    f.write(chunk)

In [84]:
# Input file locations. Former/alternative locations are kept as notes above each path.

# Tab-separated expression table; its "sample" column holds the transcript ids.
# Alternatives used earlier: TCGA_PANCAN_transcript_expression_RNAseq_RSEM_tpm.tsv, or
# "C:/Users/Mette/OneDrive - Danmarks Tekniske Universitet/11. Semester Speciale/Data/Xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_tpm.gz"
in_file = "/zhome/94/f/147417/tma_thesis/data/xena/tcga_Kallisto_tpm.tsv"
# Tab-separated clinical table; read for its "sample" and "cancer type abbreviation" columns.
# Local alternative: "C:/Users/Mette/OneDrive - Danmarks Tekniske Universitet/11. Semester Speciale/Data/Xena/TCGA_PANCAN_curated_clinical_data.tsv"
in_pheno2 = "/zhome/94/f/147417/tma_thesis/data/xena/TCGA_PANCAN_curated_clinical_data.tsv"
# Disabled: separate transcript id list (only referenced by commented-out code).
# Local alternative: "C:/Users/Mette/OneDrive - Danmarks Tekniske Universitet/11. Semester Speciale/Data/Xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_transcript_list.txt"
#in_transcript_id = "/zhome/94/f/147417/tma_thesis/data/xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_transcript_list.txt"
# CSV of the genes of interest; read for its "converted_alias" column.
# Local alternative: "C:/Users/Mette/OneDrive - Danmarks Tekniske Universitet/11. Semester Speciale/Data/targets_ensg.csv"
in_genenames = "/zhome/94/f/147417/tma_thesis/data/targets_ensg.csv"

In [85]:
# Lazily scan the tab-separated expression table and, in the same expression,
# strip the trailing ".<digits>" from the transcript ids in "sample" by keeping
# only the text before the first ".".
lf_data = pl.scan_csv(in_file, separator="\t").with_columns(
    pl.col("sample").str.split(".").list.get(0)
)

# Disabled alternative: take the transcript ids from a separate list file and
# attach them to lf_data as a "trans_id" column.
#with open(in_transcript_id) as f:  
#    transcript_id_list = f.read().splitlines()
#transcript_id_list.pop(0) # Remove "column name"s
#lf_data = lf_data.with_columns(trans_id = transcript_id_list)

# Gene table of interest, read eagerly.
df_genenames = pl.read_csv(in_genenames)

In [86]:
# Define a function to fetch transcript IDs from Ensembl
def fetch_transcript_ids_from_ensembl(gene_id, timeout=30):
    """Return the transcript IDs that Ensembl lists for one identifier.

    Queries the Ensembl REST ``lookup/id`` endpoint with ``expand=1`` and collects
    the ``id`` of every entry under the ``Transcript`` key of the JSON response.

    Args:
        gene_id: Identifier inserted into the lookup URL (presumably an Ensembl
            gene ID taken from the ``converted_alias`` column - verify).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        A list of transcript ID strings. The list is empty when the response
        status is not OK or the response has no ``Transcript`` entries.

    Raises:
        requests.RequestException: On connection errors or when ``timeout`` expires.
    """
    url = f"https://rest.ensembl.org/lookup/id/{gene_id}?expand=1"
    response = requests.get(
        url,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # Best effort: a failed lookup contributes no transcripts instead of aborting.
    if not response.ok:
        return []
    payload = response.json()
    return [transcript['id'] for transcript in payload.get('Transcript', [])]
    
# Define a function to fetch the Ensembl display name for a transcript ID
def fetch_gene_ids_from_ensembl(trans_id, timeout=30):
    """Return the ``display_name`` Ensembl reports for one identifier.

    Queries the Ensembl REST ``lookup/id`` endpoint and reads the ``display_name``
    field of the JSON response. For the transcript IDs used in this notebook the
    stored output shows values such as ``ARG1-201``.

    Args:
        trans_id: Identifier inserted into the lookup URL (a transcript ID in
            this notebook).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The display name as a string, or ``None`` when the response status is not
        OK or the response has no ``display_name``. ``None`` (rather than an empty
        list) keeps the result usable as a value in a string column.

    Raises:
        requests.RequestException: On connection errors or when ``timeout`` expires.
    """
    url = f"https://rest.ensembl.org/lookup/id/{trans_id}?expand=1"
    response = requests.get(
        url,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # Best effort: a failed lookup yields a missing value instead of aborting.
    if not response.ok:
        return None
    return response.json().get('display_name')

In [87]:
# From the Ensembl API, get the transcript IDs for all genes of interest by
# applying the lookup function to every value in "converted_alias".
# return_dtype is stated explicitly so the new column is a list-of-strings column
# even when lookups return empty lists, instead of relying on dtype inference.
df_genenames = df_genenames.with_columns(
    pl.col("converted_alias")
    .map_elements(fetch_transcript_ids_from_ensembl, return_dtype=pl.List(pl.Utf8))
    .alias("transcript_ids")
)

# Get a flat list of all the transcript IDs related to any of the genes of interest.
# Exploding an empty list yields a null, so nulls are dropped to keep only real IDs.
trans_id_of_interest = (
    df_genenames
    .select(pl.col("transcript_ids").explode().drop_nulls())
    .to_series()
    .to_list()
)

def make_cancer_specific_tsv(cancer_type_abb):
    """Build a lazy subset of the expression table for one cancer type.

    Reads the module-level ``in_pheno2``, ``lf_data`` and ``trans_id_of_interest``.
    Despite the name, nothing is written to disk here; a LazyFrame is returned.

    Args:
        cancer_type_abb: Value matched against the "cancer type abbreviation"
            column of the clinical table (e.g. "PAAD").

    Returns:
        A polars LazyFrame with the "sample" column first, followed by the sample
        columns of the requested cancer type that exist in ``lf_data`` (sorted by
        name so the column order is the same on every run), restricted to rows
        whose "sample" value is in ``trans_id_of_interest``.
    """
    # Sample ids annotated with the requested cancer type.
    df_pheno = pl.read_csv(in_pheno2, separator="\t")
    pheno_samples = set(
        df_pheno
        .filter(pl.col("cancer type abbreviation") == cancer_type_abb)
        .get_column("sample")
        .to_list()
    )

    # Column names actually present in the expression table.
    data_columns = lf_data.limit(1).collect().columns

    # Keep only samples present in both sources. A sorted list (not a set) is used
    # because set iteration order for strings can differ between interpreter runs,
    # which would shuffle the output columns.
    sample_columns = sorted(
        name for name in data_columns
        if name in pheno_samples and name != "sample"
    )

    # Subset lazyframe to include only samples from cancer type of interest,
    # then keep only the transcripts of interest.
    return (
        lf_data
        .select(["sample"] + sample_columns)
        .filter(pl.col("sample").is_in(trans_id_of_interest))
    )


def add_gene_id(in_lazy, write_file, cancer_type_abb):
    """Add a "gene_id" column, reorder the columns and optionally write a TSV.

    For every value in "sample" the Ensembl lookup ``fetch_gene_ids_from_ensembl``
    is called (one HTTP request per row) and its result stored in "gene_id".
    "sample" is renamed to "transcript_id", both id columns are moved to the
    front, and the rows are sorted by "gene_id".

    Args:
        in_lazy: polars LazyFrame with a "sample" column.
        write_file: When truthy, the result is written as a tab-separated file;
            an existing file at the target path is overwritten.
        cancer_type_abb: Inserted into the output file name.

    Returns:
        A polars LazyFrame with the transformed data. When ``write_file`` is
        truthy the query is executed exactly once and the returned LazyFrame wraps
        the already materialised result, so a later ``.collect()`` by the caller
        does not repeat the file scan or the Ensembl requests.
    """
    id_columns = ["transcript_id", "gene_id"]

    lf = (
        in_lazy
        # return_dtype is explicit so the column type does not depend on inference
        # from the first returned value.
        .with_columns(
            pl.col("sample")
            .map_elements(fetch_gene_ids_from_ensembl, return_dtype=pl.Utf8)
            .alias("gene_id")
        )
        # Rename transcript column and move it to front together with gene_id.
        .rename({"sample": "transcript_id"})
        .select([pl.col("transcript_id"), pl.col("gene_id"), pl.exclude(id_columns)])
        .sort(by="gene_id")
    )

    if not write_file:
        return lf

    out_path = "/zhome/94/f/147417/tma_thesis/data/xena/TCGA_PANCAN_transcript_expression_RNAseq_kallisto_tpm_{}.tsv".format(cancer_type_abb)

    # Execute once, write, and hand back the materialised data as a LazyFrame.
    # write_csv replaces any existing file, so no prior removal is needed.
    df = lf.collect()
    df.write_csv(out_path, separator="\t")
    return df.lazy()


In [81]:
# Preview the result for a single cancer type.
# `c_type` is assigned here because it is otherwise only defined by the loop in a
# later cell, which makes this cell fail with a NameError on a fresh kernel.
# NOTE(review): "CHOL" is assumed from the sample barcodes in the stored output
# below - confirm which cancer type this preview was meant to show.
# Passing True also (re)writes the TSV for this cancer type.
c_type = "CHOL"
add_gene_id(make_cancer_specific_tsv(c_type), True, c_type).sort(by="gene_id").collect()

transcript_id,gene_id,TCGA-3X-AAVB-01,TCGA-W5-AA2W-01,TCGA-WD-A7RX-01,TCGA-W5-AA2H-01,TCGA-W5-AA2U-01,TCGA-W5-AA2T-01,TCGA-W5-AA2O-01,TCGA-W5-AA34-11,TCGA-W5-AA36-01,TCGA-W5-AA2I-01,TCGA-ZU-A8S4-11,TCGA-3X-AAVE-01,TCGA-ZH-A8Y6-01,TCGA-W5-AA33-01,TCGA-W5-AA2X-01,TCGA-W5-AA38-01,TCGA-W5-AA2U-11,TCGA-4G-AAZT-01,TCGA-YR-A95A-01,TCGA-W5-AA31-01,TCGA-ZH-A8Y1-01,TCGA-ZH-A8Y2-01,TCGA-W5-AA2X-11,TCGA-W5-AA39-01,TCGA-W5-AA2Z-01,TCGA-3X-AAVA-01,TCGA-W5-AA2R-11,TCGA-ZH-A8Y8-01,TCGA-ZD-A8I3-01,TCGA-W5-AA2R-01,TCGA-W5-AA34-01,TCGA-3X-AAV9-01,TCGA-W5-AA2I-11,TCGA-ZU-A8S4-01,TCGA-W5-AA31-11,TCGA-W5-AA30-01,TCGA-ZH-A8Y4-01,TCGA-W5-AA2G-01,TCGA-W5-AA2Q-11,TCGA-ZH-A8Y5-01,TCGA-W5-AA30-11,TCGA-W6-AA0S-01,TCGA-W5-AA2Q-01,TCGA-3X-AAVC-01,TCGA-4G-AAZO-01
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ENST0000027519…","""ARG1-201""",-9.9658,-3.0469,-1.685,-9.9658,-3.816,-9.9658,-2.3884,4.0765,-9.9658,-9.9658,3.5923,-3.0469,-5.5735,1.5216,-9.9658,-9.9658,4.3924,-9.9658,-9.9658,1.9638,-3.3076,-9.9658,2.8542,2.5163,-9.9658,-2.9324,4.3883,-1.4305,-9.9658,-9.9658,-9.9658,-9.9658,2.6624,-9.9658,4.7555,-9.9658,-3.3076,-1.2481,2.3077,-3.6259,2.5756,-9.9658,-9.9658,-5.0116,-9.9658
"""ENST0000035696…","""ARG1-202""",-5.5735,-9.9658,0.547,-9.9658,-9.9658,-2.8262,2.2452,1.9111,-9.9658,0.4233,2.8562,-9.9658,-9.9658,-9.9658,-0.7346,-9.9658,2.8402,-4.035,1.6649,-0.394,-2.9324,-9.9658,3.5754,2.0673,-4.035,-9.9658,2.6464,0.2154,-9.9658,-9.9658,-9.9658,-9.9658,3.0252,-9.9658,2.8542,-5.5735,-9.9658,0.9191,1.8683,-9.9658,1.5514,-3.816,-4.6082,-4.6082,-9.9658
"""ENST0000036808…","""ARG1-203""",-9.9658,-2.2447,5.6453,-1.7322,-3.6259,-0.7834,4.5417,8.3469,-9.9658,-1.9379,8.2829,-0.9406,1.5013,4.7939,-0.6873,-3.816,8.6945,-9.9658,4.9677,6.0826,-9.9658,-9.9658,8.441,7.5867,-9.9658,-0.0725,8.2676,5.1879,-9.9658,-2.6349,-1.3183,-5.5735,8.4831,-0.1665,8.7651,-9.9658,3.3745,5.9839,7.8486,-9.9658,8.2165,-9.9658,-9.9658,-9.9658,-9.9658
"""ENST0000046929…","""ARG1-204""",-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-4.2934,-9.9658,-9.9658,0.3907,-9.9658,-9.9658,-3.0469,-9.9658,-9.9658,2.2693,-9.9658,-2.5479,-1.1811,-9.9658,-9.9658,2.3677,-0.9971,-9.9658,-9.9658,0.3685,-3.3076,-9.9658,-9.9658,-9.9658,-9.9658,-0.7834,-9.9658,-1.8314,-9.9658,-9.9658,-1.3183,1.2636,-9.9658,1.7489,-9.9658,-2.4659,-9.9658,-9.9658
"""ENST0000048482…","""ARG1-205""",-9.9658,-9.9658,-9.9658,-9.9658,-5.0116,-9.9658,-9.9658,1.3511,-3.1714,-9.9658,-9.9658,-2.3147,-9.9658,-2.114,-1.9942,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-0.5332,-5.5735,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,1.334,-3.1714,-9.9658,-9.9658,-9.9658,-3.458,-9.9658,-9.9658,-9.9658,-2.8262,-5.0116
"""ENST0000049826…","""ARG1-206""",-9.9658,-9.9658,-0.4921,-9.9658,-9.9658,-9.9658,-1.8314,2.6138,-9.9658,-3.458,2.0465,-9.9658,-3.6259,-2.7274,-9.9658,-9.9658,2.6067,-9.9658,-3.816,-0.6416,-9.9658,-9.9658,3.1129,1.8444,-9.9658,-9.9658,0.4447,-1.1488,-4.2934,-4.2934,-9.9658,-9.9658,2.4908,-9.9658,3.1908,-9.9658,-9.9658,-0.5973,-0.5543,-9.9658,1.3735,-9.9658,-9.9658,-9.9658,-2.6349
"""ENST0000026178…","""ARG2-201""",1.614,-0.8339,-1.5951,-2.4659,0.7999,0.537,-1.8836,-3.6259,-1.9942,-1.5951,0.8726,-2.0529,1.3283,-0.4719,2.9581,-0.5973,-0.8863,-2.0529,0.8082,0.9343,-0.8084,-2.6349,-1.9379,1.1184,-0.6873,0.0158,0.9038,-0.013,1.5854,-3.3076,-0.9686,2.1925,-3.6259,1.6466,-2.114,-0.3566,-1.0559,0.605,-0.2498,-0.3022,-2.5479,-0.7108,-0.4131,3.5022,2.6161
"""ENST0000055649…","""ARG2-202""",-9.9658,-9.9658,-9.9658,-3.458,-1.685,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-2.3147,-9.9658,-3.458,-9.9658,-9.9658,-9.9658,-3.0469,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-3.1714,-2.9324,-9.9658,-9.9658
"""ENST0000055712…","""ARG2-203""",-2.9324,-2.5479,-9.9658,-9.9658,-2.9324,-9.9658,-2.4659,-9.9658,-2.2447,-9.9658,-9.9658,-1.5105,-1.3921,-2.7274,-1.3921,-9.9658,-9.9658,-5.0116,-9.9658,-9.9658,-9.9658,-2.114,-9.9658,-9.9658,-0.8599,-2.1779,-9.9658,-9.9658,-1.685,-4.035,-1.9942,-0.6643,-9.9658,-3.3076,-2.5479,-1.9942,-6.5064,-0.4325,-3.458,-1.8314,-6.5064,-9.9658,-9.9658,-1.1488,-1.8314
"""ENST0000055731…","""ARG2-204""",-0.6873,-9.9658,-2.0529,-3.816,-1.0262,-1.8836,-9.9658,-3.1714,-9.9658,-9.9658,-3.0469,-2.4659,-1.1811,-2.3147,-9.9658,-9.9658,-1.4699,-2.8262,-1.9379,-1.5951,-1.0559,-1.7322,-3.3076,-0.1993,-1.8836,-0.8339,-0.9971,-1.7809,-9.9658,-1.9942,-0.3022,-0.8599,-3.458,-1.9942,-2.3884,-5.5735,-9.9658,-0.6416,-9.9658,-3.0469,-3.1714,-3.0469,-2.3884,-0.2671,-0.5756


In [88]:
# Cancer types (by abbreviation) for which a transcript-level TSV is written.
cancer_type_list = [
    "PAAD",
    "SKCM",
    "LAML",
    "CHOL",
    "CESC",
    "DLBC",
    "HNSC",
    "BRCA",
    "LUAD",
    "LUSC",
]

# For each cancer type: print its abbreviation as a progress marker, build the
# cancer-specific subset, then add gene ids and write the file (write_file=True).
for c_type in cancer_type_list:
    print(c_type)
    lf_cancer_type = make_cancer_specific_tsv(c_type)
    add_gene_id(lf_cancer_type, True, c_type)

PAAD
SKCM
LAML
CHOL
CESC
DLBC
HNSC
BRCA
LUAD
LUSC
