In [1]:
# Necessary imports
%load_ext autoreload
%autoreload 2\
    
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from definitions import ROOT_DIR

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
import numpy as np

In [37]:
from src.features.multi_omics import MultiOmicsData

# Load the LUSC cohort with the gene-expression ("GE") and miRNA ("MIR") modalities.
# NOTE(review): the folder path is an absolute, machine-specific location;
# consider deriving it from a configurable project root.
lusc_data = MultiOmicsData(
    cancer_type="LUSC",
    modalities=["GE", "MIR"],
    folder_path="/home/jonny2/PycharmProjects/assn-miRNA-LUAD/data/tcga-assembler/LUSC/",
)

('DRUGS', (357, 4))
('PATIENTS', (504, 5))
('MIR', (380, 1870))
('GE', (552, 20472))


In [4]:
# Read the subtype predictor centroids and give the columns explicit names:
# one gene column followed by one column per subtype.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
centroids = pd.read_csv(
    "/home/jonny2/PycharmProjects/assn-miRNA-LUAD/data/external/wilkerson.scc/predictor.centroids.csv"
)
centroids.columns = ["genes", "primitive", "classical", "secretory", "basal"]

# Keep only the centroid rows whose gene is present in the LUSC GE gene list,
# and remember those genes for subsetting the expression data later.
centroids = centroids.loc[
    lambda frame: frame["genes"].isin(lusc_data.GE.get_genes_list())
]
centroid_genes = centroids.loc[:, "genes"]

In [5]:
# Index the centroids by gene and transpose, so that each row is a subtype
# and each column is a gene.
centroids = centroids.set_index("genes").T
centroids

genes,MYL6B,PODXL2,HSF2,TTLL4,MARCKSL1,MDK,CHKA,TRIM28,STOM,CASP1,...,ALDH1A3,DSE,MMP10,VDR,CAPZB,FNBP1,ENPP4,SH2B3,DOCK10,SDC1
primitive,0.539568,0.852272,0.293831,0.679557,1.015985,1.001421,0.513166,0.569403,-0.737997,-0.775559,...,-0.231312,-0.521502,-0.862938,-0.421701,-0.015213,-0.016889,0.232617,-0.178687,-0.121821,-0.764157
classical,-0.139755,-0.019997,0.060404,-0.089302,-0.306555,-0.019797,-0.150269,0.021242,-0.065057,-0.187048,...,-0.485903,-0.462036,-0.649346,-0.259443,-0.216407,-0.103185,-0.276022,-0.169656,-0.137791,0.33713
secretory,-0.084411,-0.104802,-0.131566,-0.044189,0.012595,-0.202898,0.235328,-0.264212,0.297794,0.305713,...,0.325191,0.48799,-0.974158,0.384095,0.180478,0.421356,0.482619,0.533466,0.569353,-0.99866
basal,-0.033551,-0.053003,-0.07913,-0.084657,0.097278,-0.22218,-0.136313,0.011287,0.043434,0.063549,...,0.905489,0.509466,2.926716,0.309637,0.216626,-0.185309,-0.147701,-0.030816,-0.093761,0.419894


# Subset the GE data to the centroid genes

In [6]:
# Restrict the expression matrix to the centroid genes, then
# subset the LUSC samples to only tumor samples (index contains "-01A").
lusc_ge = lusc_data.GE.data[centroid_genes].loc[
    lambda ge: ge.index.str.contains("-01A")
]

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Use a symmetric target range [-m, +m], where m is the largest absolute
# value found anywhere in the centroid matrix. The scaler is only
# constructed here; it is not fitted in this cell.
centroid_abs_max = np.abs(centroids.values).max()
scaler = MinMaxScaler(feature_range=(-centroid_abs_max, centroid_abs_max))

# Classify LUSC patients based on cluster centroids obtained from Wilkerson et al.

**TODO:** Read the paper "The molecular portraits of breast tumors are conserved across microarray platforms" to learn how subtype prediction should be done.

In [8]:
# Build a KMeans estimator configured for four clusters
# (presumably one per subtype centroid: primitive, classical, secretory, basal).
# No fit() call is made in this cell, so the estimator is still unfitted here.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)

In [9]:
# Inject the reference centroids directly as the cluster centres instead of
# learning them with fit().
# NOTE(review): this sets a fitted-state attribute by hand; whether predict()
# accepts an estimator prepared this way depends on the scikit-learn version - confirm.
kmeans.cluster_centers_ = centroids.values

In [10]:
# Row order of `centroids` is primitive, classical, secretory, basal, so the
# integer position of the nearest centroid maps to these subtype labels.
lusc_subtypes_map = {0: 'Primitive', 1: 'Classical', 2: 'Secretory', 3: 'Basal'}

# Rescale the expression values into the symmetric centroid range.
lusc_ge_scaled = scaler.fit_transform(lusc_ge)
if np.isnan(lusc_ge_scaled).any():
    # argmin would silently pick the first centroid for NaN distances.
    raise ValueError("Scaled expression data contains NaN; cannot assign subtypes.")

# Explicit nearest-centroid assignment (Euclidean distance). This avoids calling
# predict() on a KMeans estimator that was never fitted, which fails on recent
# scikit-learn releases.
centroid_distances = np.linalg.norm(
    lusc_ge_scaled[:, np.newaxis, :] - centroids.values[np.newaxis, :, :],
    axis=2,
)
nearest_centroid = centroid_distances.argmin(axis=1)

lusc_subtypes_pred = pd.DataFrame({"subtype": nearest_centroid}, index=lusc_ge.index)
lusc_subtypes_pred["subtype"] = lusc_subtypes_pred["subtype"].map(lusc_subtypes_map)
lusc_subtypes_pred["subtype"].value_counts(sort=False, normalize=False)

Secretory     40
Primitive     23
Classical    221
Basal        212
Name: subtype, dtype: int64

# Assign predicted subtypes to LUSC patient samples

In [38]:
# Derive the patient barcode by dropping the last four characters of each
# sample identifier in the index.
lusc_subtypes_pred["patient_barcode"] = lusc_subtypes_pred.index.str.slice(stop=-4)

# Left-join the predicted subtypes onto the clinical patient table, matching
# the patient table's index against the derived patient barcode column.
# NOTE(review): confirm the index of the merged frame - joining an index
# against a column may not preserve the left frame's index.
lusc_data.multi_omics_data["PATIENTS"] = pd.merge(
    left=lusc_data.clinical.patient,
    right=lusc_subtypes_pred,
    how="left",
    left_index=True,
    right_on="patient_barcode",
)


In [39]:
# Overwrite the clinical patient table with the merged frame stored under "PATIENTS".
# NOTE(review): the merge that produced "PATIENTS" reads lusc_data.clinical.patient,
# so re-running that merge after this assignment operates on the already-merged table.
lusc_data.clinical.patient = lusc_data.multi_omics_data["PATIENTS"]


In [42]:
# Load GE features with the predicted subtype as target, restricted to the
# "Primitive" predicted subtype.
# Alternatives previously tried (disabled):
#   target=["ajcc_pathologic_tumor_stage"]
#   pathologic_stages=['Stage I']
X, y = lusc_data.load_data(
    multi_omics=["GE"],
    target=["subtype"],
    predicted_subtypes=["Primitive"],
)

                 patient_barcode bcr_patient_barcode gender race  \
TCGA-18-3406-01A    TCGA-18-3406                 NaN    NaN  NaN   
TCGA-18-3407-01A    TCGA-18-3407                 NaN    NaN  NaN   
TCGA-18-3408-01A    TCGA-18-3408                 NaN    NaN  NaN   
TCGA-18-3409-01A    TCGA-18-3409                 NaN    NaN  NaN   
TCGA-18-3410-01A    TCGA-18-3410                 NaN    NaN  NaN   
TCGA-18-3411-01A    TCGA-18-3411                 NaN    NaN  NaN   
TCGA-18-3412-01A    TCGA-18-3412                 NaN    NaN  NaN   
TCGA-18-3414-01A    TCGA-18-3414                 NaN    NaN  NaN   
TCGA-18-3415-01A    TCGA-18-3415                 NaN    NaN  NaN   
TCGA-18-3416-01A    TCGA-18-3416                 NaN    NaN  NaN   
TCGA-18-3417-01A    TCGA-18-3417                 NaN    NaN  NaN   
TCGA-18-3419-01A    TCGA-18-3419                 NaN    NaN  NaN   
TCGA-18-3421-01A    TCGA-18-3421                 NaN    NaN  NaN   
TCGA-18-4083-01A    TCGA-18-4083                

In [41]:
y

Unnamed: 0,subtype
