In [104]:
import warnings
from zipfile import ZipFile

import numpy as np
import polars as pl
import psutil
import requests

In [105]:


# Report total and available system memory, converted from bytes with 1024**3.
BYTES_PER_GB = 1024 ** 3

# Read the memory statistics once so both figures come from the same snapshot.
memory_snapshot = psutil.virtual_memory()
total_memory = memory_snapshot.total / BYTES_PER_GB
available_memory = memory_snapshot.available / BYTES_PER_GB

print(f"Total memory: {total_memory:.2f} GB")
print(f"Available memory: {available_memory:.2f} GB")

Total memory: 1007.09 GB
Available memory: 972.97 GB


In [106]:
# Remote archive and the local path it is saved to.
# A full download took about 13 min 12 s when it was last run.
chunk_size = 4096
# NOTE(review): this is a Windows/OneDrive path while the other inputs in this
# notebook live under /zhome — confirm the target before enabling the download.
filename = "C:/Users/Mette/OneDrive - Danmarks Tekniske Universitet/11. Semester Speciale/Data/Xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_tpm.gz"
document_url = "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform_tpm.gz"

# Explicit opt-in switch instead of commented-out code: with the default False
# this cell only defines the names above and downloads nothing.
redownload_source = False

if redownload_source:
    with requests.get(document_url, stream=True) as r:
        # Stop on an HTTP error instead of saving the error body as a .gz file.
        r.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in r.iter_content(chunk_size):
                if chunk:  # skip empty chunks
                    f.write(chunk)

In [114]:
# Input locations. Every path hangs off data_dir, so relocating the data only
# requires changing this one line.
data_dir = "/zhome/94/f/147417/tma_thesis/data"

# Tab-separated expression matrix; its "sample" column holds the transcript ids.
in_file = f"{data_dir}/xena/tcga_Kallisto_tpm.gz"
# Tab-separated clinical table with "sample" and "cancer type abbreviation" columns.
in_pheno2 = f"{data_dir}/xena/TCGA_PANCAN_curated_clinical_data.tsv"
# Currently unused: separate transcript-id list.
#in_transcript_id = f"{data_dir}/xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_transcript_list.txt"
# Genes of interest; the "converted_alias" column is what gets sent to Ensembl.
in_genenames = f"{data_dir}/targets_ensg.csv"

In [None]:
# Lazily scan the tab-separated expression matrix. The transcript ids are taken
# from its "sample" column (the separate transcript-list file is not used), and
# everything from the first "." onwards is cut off each id.
lf_data = (
    pl.scan_csv(in_file, separator="\t")
    .with_columns(pl.col("sample").str.split(".").list.get(0))
)

# The gene table is read eagerly into a regular DataFrame.
df_genenames = pl.read_csv(in_genenames)

In [109]:
# Define a function to fetch transcript IDs from Ensembl
def fetch_transcript_ids_from_ensembl(gene_id, timeout=30):
    """Return the Ensembl transcript IDs listed for one gene.

    Queries the Ensembl REST ``lookup/id`` endpoint with ``expand=1`` and reads
    the ``id`` of every entry under the ``Transcript`` key of the JSON reply.

    Parameters
    ----------
    gene_id : str
        Identifier inserted into the lookup URL.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection raises instead of blocking the notebook forever.

    Returns
    -------
    list of str
        The transcript IDs. Empty when the reply has no ``Transcript`` key or
        when the server answered with an error status; in the latter case a
        warning is emitted so the missing gene does not go unnoticed.
    """
    url = f"https://rest.ensembl.org/lookup/id/{gene_id}?expand=1"
    response = requests.get(
        url,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )

    if not response.ok:
        # Stay best-effort (empty result) but make the failure visible.
        warnings.warn(
            f"Ensembl lookup for {gene_id!r} failed with HTTP "
            f"{response.status_code}; returning no transcript IDs.",
            stacklevel=2,
        )
        return []

    return [transcript["id"] for transcript in response.json().get("Transcript", [])]

In [110]:
# Look up the transcript IDs of every gene of interest through the Ensembl API.
# The explicit return dtype keeps "transcript_ids" a list-of-strings column
# instead of leaving the dtype to be inferred from whatever the lookups return.
df_genenames = df_genenames.with_columns(
    pl.col("converted_alias")
    .map_elements(fetch_transcript_ids_from_ensembl, return_dtype=pl.List(pl.Utf8))
    .alias("transcript_ids")
)

# Flatten to one list holding the transcript IDs of all genes of interest.
# Exploding an empty list yields a null, so those placeholders are dropped.
trans_id_of_interest = (
    df_genenames
    .select(pl.col("transcript_ids").explode())
    .to_series()
    .drop_nulls()
    .to_list()
)

def make_cancer_specific_tsv(cancer_type_abb, write_file):
    """Extract the transcripts of interest for the samples of one cancer type.

    Reads the clinical table at ``in_pheno2``, keeps the samples whose
    "cancer type abbreviation" equals ``cancer_type_abb`` and that are also
    columns of ``lf_data``, and keeps the rows whose "sample" value is in
    ``trans_id_of_interest``. The "sample" column is renamed to
    "transcript_id" and placed first.

    Parameters
    ----------
    cancer_type_abb : str
        Value matched against the "cancer type abbreviation" column.
    write_file : bool
        When true, the result is also written as a tab-separated file.

    Returns
    -------
    polars.DataFrame
        "transcript_id" followed by one column per matching sample, in sorted
        sample-id order.
    """
    # Sample ids annotated with the requested cancer type.
    df_pheno = pl.read_csv(in_pheno2, separator="\t")
    pheno_samples = (
        df_pheno
        .filter(pl.col("cancer type abbreviation") == cancer_type_abb)
        .get_column("sample")
        .to_numpy()
    )

    # Column names of the expression matrix (one column per sample).
    data_columns = np.array(lf_data.limit(1).collect().columns)

    # np.intersect1d returns a sorted, de-duplicated array, so the column order
    # is the same on every run (selecting through a set made it arbitrary).
    sample_columns = np.intersect1d(pheno_samples, data_columns).tolist()

    # Listing "sample" first means no separate re-ordering step is needed after
    # the rename. The query is collected once and reused for the optional file
    # output, rather than being executed a second time just to write it.
    df_cancertype = (
        lf_data
        .select(["sample"] + sample_columns)
        .filter(pl.col("sample").is_in(trans_id_of_interest))
        .rename({"sample": "transcript_id"})
        .collect()
    )

    if write_file:
        # NOTE(review): the file name says RSEM, but in_file is defined in this
        # notebook as tcga_Kallisto_tpm.gz — confirm the label is intended.
        out_path = "/zhome/94/f/147417/tma_thesis/data/xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_tpm_{}.tsv".format(cancer_type_abb)
        # Write tabs so the content matches the .tsv extension.
        df_cancertype.write_csv(out_path, separator="\t")

    return df_cancertype


In [112]:
# Cancer type abbreviations to export, one file per type.
cancer_type_list = ["LAML", "CHOL", "CESC", "DLBC", "HNSC", "BRCA", "LUAD", "LUSC"]

# Keep every returned frame, keyed by abbreviation; previously each iteration
# overwrote df_c_type, so only the last cancer type was left in memory.
df_by_cancer_type = {}
for c_type in cancer_type_list:
    df_c_type = make_cancer_specific_tsv(c_type, write_file=True)
    df_by_cancer_type[c_type] = df_c_type