In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

import os
import random
import pickle
from tqdm.notebook import tqdm

In [3]:
# Module-level placeholders for the train / validation / test UUID partitions.
# They start out unset (None).
uuid_train = uuid_val = uuid_test = None

In [4]:
def read_uuid(rootdir, cancer_type, uuid):
    """Read one case's TSV file into a DataFrame tagged with its origin.

    The case directory is ``<rootdir>/data/<cancer_type>/<uuid>``; the first
    entry in it whose name ends with "tsv" is parsed as tab-separated text.

    Parameters
    ----------
    rootdir : str
        Directory that contains the ``data`` folder.
    cancer_type : str
        Sub-folder of ``data`` holding the case; also written to the
        ``cancer_type`` column of the result.
    uuid : str
        Name of the case folder; also written to the ``uuid`` column.

    Returns
    -------
    pandas.DataFrame or None
        The selected gene columns plus ``uuid`` and ``cancer_type``.
        ``None`` when anything fails (missing folder, no TSV file, missing
        columns, ...); the error is printed instead of being raised so that
        one bad case does not abort a bulk load.
    """
    wanted_columns = [
        'gene_name', "gene_type", 'unstranded', 'stranded_first',
        'stranded_second', 'tpm_unstranded', 'fpkm_unstranded',
        'fpkm_uq_unstranded',
    ]
    try:
        # Build the case directory once so that listing and reading use the
        # same root (the read path used to ignore ``rootdir``).
        case_dir = os.path.join(rootdir, "data", cancer_type, uuid)
        tsv_name = next(
            (name for name in os.listdir(case_dir) if name.endswith("tsv")),
            None,
        )
        if tsv_name is None:
            raise FileNotFoundError(f"no file ending in 'tsv' in {case_dir}")

        # Line 1 (0-based) is kept and becomes the header; lines 0 and 2-5
        # are dropped before parsing.
        raw = pd.read_csv(
            os.path.join(case_dir, tsv_name),
            sep="\t",
            skiprows=[0, 2, 3, 4, 5],
        )
        return raw[wanted_columns].assign(uuid=uuid, cancer_type=cancer_type)
    except Exception as e:
        print(f"Exception occured for {uuid}\n{e}")
        return None

In [5]:
def load_all_cases(cancer_type, gene_type):
    """Load every case of one cancer type and pivot it to one row per case.

    Every entry of ``../data/<cancer_type>`` is treated as a case folder and
    read with ``read_uuid``. The stacked rows are restricted to ``gene_type``
    and reshaped so that each (cancer_type, uuid) pair becomes a single row
    with one ``tpm_unstranded`` column per gene name.

    Parameters
    ----------
    cancer_type : str
        Name of the folder below ``../data`` to scan.
    gene_type : str
        Value of the ``gene_type`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Wide frame with ``cancer_type`` and ``uuid`` as regular columns
        (the pivot index is reset).
    """
    case_ids = os.listdir(os.path.join("..", "data", cancer_type))
    per_case_frames = [
        read_uuid("..", cancer_type, case_id) for case_id in tqdm(case_ids)
    ]
    stacked = pd.concat(per_case_frames)
    of_requested_type = stacked.query(f"gene_type=='{gene_type}'")
    wide = of_requested_type.pivot_table(
        index=['cancer_type', 'uuid'],
        columns='gene_name',
        values=['tpm_unstranded'],
    )
    return wide.reset_index()

In [6]:
def add_dataset_col(df, uuid_train=None, uuid_val=None, uuid_test=None):
    """Return a copy of ``df`` with a ``dataset`` column naming each row's split.

    Rows whose ``uuid`` is in ``uuid_train`` / ``uuid_val`` / ``uuid_test``
    are labelled 'Train' / 'Validation' / 'Test' respectively. Labels are
    applied in that order, so a UUID listed in several partitions ends up
    with the last matching label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``uuid`` column. It is not modified.
    uuid_train, uuid_val, uuid_test : list-like or None
        UUIDs belonging to each partition. When all three are ``None`` the
        UUIDs of ``df`` are split at random: 80% train, then the remaining
        20% is halved into validation and test. A single ``None`` among
        otherwise given partitions is treated as "no rows".

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added ``dataset`` column.
    """
    df = df.copy()

    if uuid_train is None and uuid_val is None and uuid_test is None:
        # No partition supplied (e.g. called through a bare ``df.pipe``):
        # derive one from the frame itself instead of failing.
        uuid_train, uuid_holdout = train_test_split(df.uuid, test_size=0.20)
        uuid_val, uuid_test = train_test_split(uuid_holdout, test_size=0.50)

    partitions = (
        ('Train', uuid_train),
        ('Validation', uuid_val),
        ('Test', uuid_test),
    )
    for label, members in partitions:
        if members is None:
            # ``isin(None)`` raises; an unspecified partition labels nothing.
            continue
        df.loc[df.uuid.isin(members), 'dataset'] = label
    return df

In [7]:
def cr_gene_type_df(gene_type,
                    cancer_types=('bladder', 'skin', 'pancreas', 'brain', 'thyroid_gland')):
    """Build the combined per-case frame for one gene type, with split labels.

    Loads every case of each cancer type via ``load_all_cases``, stacks the
    results, assigns each UUID to a train / validation / test partition and
    prints the number of cases per (cancer type, partition).

    Parameters
    ----------
    gene_type : str
        Gene type to keep, forwarded to ``load_all_cases``.
    cancer_types : iterable of str, optional
        Cancer-type folder names to load. Defaults to the five types this
        notebook works with.

    Returns
    -------
    pandas.DataFrame
        Stacked frame with an added ``dataset`` column.
    """
    combined = pd.concat([
        load_all_cases(cancer_type, gene_type) for cancer_type in cancer_types
    ])

    # add_dataset_col takes the three UUID partitions as arguments; piping it
    # without them raised a TypeError. Split here: 80% train, and the
    # remaining 20% halved into validation and test.
    uuid_train, uuid_holdout = train_test_split(combined.uuid, test_size=0.20)
    uuid_val, uuid_test = train_test_split(uuid_holdout, test_size=0.50)
    df = add_dataset_col(combined, uuid_train, uuid_val, uuid_test)

    print(
        df.groupby([('cancer_type', ''), ('dataset','')]).uuid.count()
    )
    return df

In [8]:
def build_pca(df, n_components=50):
    """Fit a PCA on the training rows of ``df``.

    The ``cancer_type``, ``uuid`` and ``dataset`` columns are dropped; every
    remaining column is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dataset`` column; rows labelled "Train" are used.
    n_components : int, optional
        Upper bound on the number of principal components (default 50).
        The value actually used is capped at ``min(n_samples, n_features)``
        of the training matrix, because PCA cannot extract more components
        than that and would raise for small gene types.

    Returns
    -------
    PCA
        The fitted estimator.

    Raises
    ------
    ValueError
        If there are no training rows or no feature columns.
    """
    df_train = df[df['dataset'] == "Train"]
    xtrain = df_train.drop(columns=[('cancer_type', ''), ('uuid', ''), ('dataset', '')])

    n_samples, n_features = xtrain.shape
    max_components = min(n_samples, n_features)
    if max_components < 1:
        raise ValueError(
            f"cannot fit PCA on a training matrix of shape {xtrain.shape}"
        )

    pca = PCA(n_components=min(n_components, max_components))
    # Only the fitted estimator is needed here, not the projected data.
    pca.fit(xtrain)
    return pca

In [9]:
def build_logreg(df, pca):
    """Train a logistic-regression classifier on PCA-projected training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``cancer_type``, ``uuid`` and ``dataset`` columns; rows
        whose ``dataset`` is "Train" are used, and ``cancer_type`` is the
        target.
    pca : object with ``transform``
        Fitted projection applied to the feature columns before training.

    Returns
    -------
    LogisticRegression
        The fitted classifier (``max_iter=1000``).
    """
    is_train = df['dataset'] == "Train"
    train_rows = df[is_train]

    # Everything except the three bookkeeping columns is a feature.
    features = train_rows.drop(
        columns=[('cancer_type', ''), ('uuid', ''), ('dataset', '')]
    )
    labels = train_rows[('cancer_type', '')]
    projected = pca.transform(features)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(projected, labels)
    return classifier

In [10]:
def add_predictions(df, pca, logreg):
    """Return ``df`` with a ``logreg_prediction`` column for every row.

    The ``cancer_type``, ``uuid`` and ``dataset`` columns are dropped, the
    remaining columns are projected with ``pca.transform`` and the result is
    passed to ``logreg.predict``. All rows are predicted, whatever their
    ``dataset`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three bookkeeping columns plus the features.
    pca : object with ``transform``
        Fitted projection.
    logreg : object with ``predict``
        Fitted classifier.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is left unchanged.
    """
    bookkeeping = [('cancer_type', ''), ('uuid', ''), ('dataset', '')]
    features = df.drop(columns=bookkeeping)
    predicted = logreg.predict(pca.transform(features))
    return df.assign(logreg_prediction=predicted)

In [11]:
# Every gene_type value the driver loop iterates over, in processing order.
all_gene_types = [
    "protein_coding",
    "lncRNA",
    "processed_pseudogene",
    "unprocessed_pseudogene",
    "miRNA",
    "snRNA",
    "misc_RNA",
    "TEC",
    "transcribed_unprocessed_pseudogene",
    "snoRNA",
    "transcribed_processed_pseudogene",
    "rRNA_pseudogene",
    "IG_V_pseudogene",
    "IG_V_gene",
    "transcribed_unitary_pseudogene",
    "TR_V_gene",
    "unitary_pseudogene",
    "TR_J_gene",
    "scaRNA",
    "polymorphic_pseudogene",
    "rRNA",
    "IG_D_gene",
    "TR_V_pseudogene",
    "Mt_tRNA",
    "IG_J_gene",
    "pseudogene",
    "IG_C_gene",
    "IG_C_pseudogene",
    "ribozyme",
    "TR_C_gene",
    "sRNA",
    "TR_J_pseudogene",
    "TR_D_gene",
    "IG_J_pseudogene",
    "Mt_rRNA",
    "translated_processed_pseudogene",
    "scRNA",
    "translated_unprocessed_pseudogene",
    "IG_pseudogene",
    "vault_RNA",
]

In [12]:
# Seed both NumPy's and Python's global RNGs once, before any gene type is
# processed.
random_seed = 111
np.random.seed(random_seed)
random.seed(random_seed)

# For every gene type: build the labelled frame, fit PCA + logistic regression
# on its training rows, predict all rows, and persist the frame and both models
# under ../output/<gene_type>/.
for gene_type in tqdm(all_gene_types):

    df = cr_gene_type_df(gene_type)
    pca = build_pca(df)
    logreg = build_logreg(df, pca)
    df_final = add_predictions(df, pca, logreg)

    output_path = os.path.join("..", "output", gene_type)
    # makedirs + exist_ok: creates ../output when it is missing and lets the
    # cell be re-run without failing on folders left by a previous run.
    os.makedirs(output_path, exist_ok=True)
    df_final.to_csv(os.path.join(output_path, "df_final.csv"))
    with open(os.path.join(output_path, "pca.pickle"), "wb") as f:
        pickle.dump(pca, f)
    with open(os.path.join(output_path, "logreg.pickle"), "wb") as f:
        pickle.dump(logreg, f)

  0%|          | 0/40 [00:00<?, ?it/s]

FileNotFoundError: [WinError 3] The system cannot find the path specified: '..\\data\\bladder'