In [1]:
#!/usr/bin/python3
import polars as pl
import numpy as np
import requests
from zipfile import ZipFile
import os 

import psutil


In [2]:

# --- Input files ---------------------------------------------------------------
# Expression table: one row per transcript (ID in column "sample"), one column per sample.
# Other tables previously used here, kept for reference:
#   /zhome/94/f/147417/tma_thesis/data/xena/TCGA_pancan_expected_count.tsv
#   tma_thesis/data/xena/TCGA_PANCAN_transcript_expression_RNAseq_RSEM_tpm.tsv
in_file = "/zhome/94/f/147417/tma_thesis/data/xena/TCGA_pancan_expected_count_transcripts.tsv"
# Clinical table; read below for its "sample" and "cancer type abbreviation" columns.
in_pheno2 = "/zhome/94/f/147417/tma_thesis/data/xena/TCGA_PANCAN_curated_clinical_data.tsv"
# Table describing the transcripts of interest ("Transcript ID" and "Name" columns are used).
in_transcripts = "/zhome/94/f/147417/tma_thesis/data/transcripts_all.tsv"

# Cancer type abbreviations for which a per-cancer-type file is produced.
cancer_type_list = [
    "LAML", "CHOL", "CESC", "DLBC", "HNSC",
    "BRCA", "PAAD", "SKCM", "LUAD", "LUSC",
]

# Scan the expression table lazily. The regex keeps everything before the first
# ".", which strips the version suffix from the transcript IDs.
lf_data = (
    pl.scan_csv(in_file, separator="\t")
    .with_columns(pl.col("sample").str.extract(r"([^.]+)", 1))
)

# Transcript annotation table, read eagerly.
df_transcripts = pl.read_csv(in_transcripts, separator="\t")

# Keep only ID and name, rename the ID column to "sample" so it can be joined
# against the expression table, and strip the version suffix in the same way.
trans_id_of_interest = (
    df_transcripts.select(["Transcript ID", "Name"])
    .rename({"Transcript ID": "sample"})
    .with_columns(pl.col("sample").str.extract(r"([^.]+)", 1))
    .lazy()
)

# Clinical annotation, read eagerly.
df_pheno = pl.read_csv(in_pheno2, separator="\t")


In [3]:
# Look up the display name that Ensembl reports for a given stable ID
def fetch_gene_ids_from_ensembl(trans_id, timeout=30):
    """Return the ``display_name`` of an Ensembl record, or ``[]`` on failure.

    Queries ``https://rest.ensembl.org/lookup/id/<trans_id>?expand=1`` and
    reads the ``display_name`` field from the JSON reply.

    Parameters
    ----------
    trans_id : str
        Identifier inserted into the lookup URL (e.g. a transcript ID).
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled connection cannot block
        the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    The value of ``display_name`` when the request succeeds and the field is
    present; otherwise an empty list. The empty-list fallback is kept for
    backward compatibility, so the return type differs between success and
    failure.
    """
    url = f"https://rest.ensembl.org/lookup/id/{trans_id}?expand=1"
    try:
        response = requests.get(
            url,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
    except requests.RequestException:
        # Network problems are treated the same way as an HTTP error status.
        return []

    if not response.ok:
        return []

    return response.json().get('display_name', [])

# Materialise the lazy transcript table and print it to check the IDs and names.
print(trans_id_of_interest.collect())

shape: (73, 2)
┌─────────────────┬──────────────┐
│ sample          ┆ Name         │
│ ---             ┆ ---          │
│ str             ┆ str          │
╞═════════════════╪══════════════╡
│ ENST00000381577 ┆ CD274-202    │
│ ENST00000381573 ┆ CD274-201    │
│ ENST00000498261 ┆ CD274-205    │
│ ENST00000492923 ┆ CD274-204    │
│ …               ┆ …            │
│ ENST00000593178 ┆ SIGLEC15-204 │
│ ENST00000217169 ┆ BIRC7-201    │
│ ENST00000342412 ┆ BIRC7-202    │
│ ENST00000395306 ┆ BIRC7-203    │
└─────────────────┴──────────────┘


In [4]:
# Cancer type whose samples are extracted in this cell.
cancer_type_abb = "PAAD"

# Sample IDs that the clinical table assigns to this cancer type.
sample_id_cancertype = df_pheno.filter(pl.col("cancer type abbreviation") == cancer_type_abb).select(pl.col("sample")).to_numpy()

# Column names of the expression table; a single row is collected just to read the header.
sample_id_data = np.array(lf_data.limit(1).collect().columns)
# Keep only the samples that actually have a column in the expression table.
sample_id_cancertype = set(np.intersect1d(sample_id_cancertype, sample_id_data))

# Subset lazyframe to include only samples from cancer type of interest.
# A sorted list is passed instead of the set itself: set iteration order depends
# on string hashing and would give a different column order in every kernel session.
lf_data_cancertype = lf_data.select(["sample", *sorted(sample_id_cancertype - {"sample"})])

# Inner join against the transcripts of interest: keeps only those rows and adds "Name".
lf_data_cancertype = lf_data_cancertype.join(trans_id_of_interest, on="sample")


In [7]:

# From ensembl API get the transcript IDs for all genes of interest: 
# Apply the API function to the DataFrame
#df_genenames = df_genenames.with_columns(pl.col("converted_alias").map_elements(fetch_transcript_ids_from_ensembl).alias("transcript_ids"))

# Get a list of all the transcript IDs related to any of the 14 genes of interest: 
#trans_id_of_interest = df_genenames.select(pl.col("transcript_ids").explode()).to_series().to_list()

def make_cancer_specific_tsv(cancer_type_abb, write_file, out_dir="/zhome/94/f/147417/tma_thesis/data/xena"):
    """Build the expression table of the transcripts of interest for one cancer type.

    Uses the notebook-level objects ``df_pheno`` (clinical table), ``lf_data``
    (lazy expression table) and ``trans_id_of_interest`` (lazy table with the
    columns "sample" and "Name").

    Parameters
    ----------
    cancer_type_abb : str
        Value matched against the "cancer type abbreviation" column of ``df_pheno``.
    write_file : bool
        When true, the result is collected and written as a tab-separated file
        named ``TCGA_pancan_expected_count_transcripts_<cancer_type_abb>.tsv``
        inside ``out_dir``. An existing file is overwritten.
    out_dir : str, optional
        Output directory; defaults to the location used previously.

    Returns
    -------
    polars.LazyFrame
        Columns "Name", "sample", then one column per matching sample (sorted
        case-insensitively); rows sorted by "Name". The frame is returned lazy,
        so it is re-evaluated when the caller collects it.
    """
    # Sample IDs annotated with the requested cancer type.
    pheno_samples = (
        df_pheno.filter(pl.col("cancer type abbreviation") == cancer_type_abb)
        .get_column("sample")
        .to_list()
    )

    # Header of the expression table; a single row is collected just to read it.
    data_columns = lf_data.limit(1).collect().columns

    # Samples present in both tables, in a deterministic order. A set must not be
    # handed to select() directly because its iteration order varies between runs.
    sample_columns = sorted(
        (set(pheno_samples) & set(data_columns)) - {"sample"},
        key=str.casefold,
    )

    lf_data_cancertype = (
        lf_data.select(["sample", *sample_columns])
        # Inner join: keeps only the transcripts of interest and adds "Name".
        .join(trans_id_of_interest, on="sample")
        # Transcript name and ID first, then the samples. Spelled out explicitly
        # so the order does not depend on how the sample IDs happen to sort
        # relative to "Name" and "sample".
        .select(["Name", "sample", *sample_columns])
        .sort(by="Name")
    )

    if write_file:
        out_path = os.path.join(
            out_dir,
            "TCGA_pancan_expected_count_transcripts_{}.tsv".format(cancer_type_abb),
        )
        # write_csv replaces an existing file, and the frame is collected before
        # the file is opened, so a failing query leaves an older file untouched.
        lf_data_cancertype.collect().write_csv(out_path, separator="\t")

    return lf_data_cancertype




In [8]:
# Produce one TSV per cancer type, echoing each abbreviation as a progress marker.
for c_type in cancer_type_list:
    print(c_type)
    make_cancer_specific_tsv(c_type, write_file=True)


LAML
CHOL
CESC
DLBC
HNSC
BRCA
PAAD
SKCM
LUAD
LUSC
