# Play NLP Games with Genetic Data

In [None]:
import json
import os
import pandas as pd

In [None]:
from hack4nf import synapse 
from hack4nf import genie
from hack4nf import embedders

In [None]:
from IPython.display import display, HTML
# Stretch the notebook's main container to the full browser width so wide
# DataFrames are not squeezed into the default narrow column.
notebook_css = "<style>.container { width:100% !important; }</style>"
display(HTML(notebook_css))

In [None]:
# Render up to 200 columns before pandas starts eliding wide DataFrames.
pd.options.display.max_columns = 200

In [None]:
# GENIE release whose files are looked up below; swap which line is
# commented out to work with the other release.
#GENIE_VERSION = "genie-12.0-public"
GENIE_VERSION = "genie-13.3-consortium"

In [None]:
# Resolve the local data locations: the sync root comes from the synapse
# helper module and the embeddings directory is its "../embeddings" sibling.
SYNC_PATH = synapse.SYNC_PATH
EMBEDDINGS_PATH = os.path.join(SYNC_PATH, "../embeddings")
for data_path in (SYNC_PATH, EMBEDDINGS_PATH):
    print(data_path)

In [None]:
# Look up the local file paths for the selected GENIE release. The result is
# indexed by file name below (e.g. "data_clinical_sample"); the bare
# expression on the last line displays it in the notebook.
syn_file_paths = synapse.get_file_name_to_path(genie_version=GENIE_VERSION)
syn_file_paths

# GENIE Joined Mutation Data 

In [None]:
# Read the patient, sample and mutation files into a single table.
# The three paths are passed positionally in this exact order.
mutation_input_keys = (
    "data_clinical_patient",
    "data_clinical_sample",
    "data_mutations_extended",
)
df_mut_all = genie.read_pat_sam_mut(
    *(syn_file_paths[file_key] for file_key in mutation_input_keys)
)

In [None]:
# Display the full mutation table loaded above.
df_mut_all

# GENIE - Clinical Sample

In [None]:
# Load the clinical sample table and key it by sample identifier so it can
# later be looked up and aligned by SAMPLE_ID.
df_dcs_all = genie.read_clinical_sample(syn_file_paths["data_clinical_sample"])
df_dcs_all = df_dcs_all.set_index("SAMPLE_ID")

In [None]:
# Display the clinical sample table (indexed by SAMPLE_ID).
df_dcs_all

# Subset

In [None]:
# Which samples to analyse: "MSK-IMPACT468" keeps only samples whose
# SEQ_ASSAY_ID is that assay, "ALL" keeps every sample. Any other value is
# rejected with a ValueError in the next cell.
SUBSET = "MSK-IMPACT468"

In [None]:
# Restrict the clinical-sample and mutation tables to the requested SUBSET.
if SUBSET == "MSK-IMPACT468":
    # Keep only samples sequenced with this assay, then only the mutations
    # that belong to those samples (df_dcs_all is indexed by SAMPLE_ID).
    df_dcs = df_dcs_all[df_dcs_all['SEQ_ASSAY_ID'] == SUBSET]
    df_mut = df_mut_all[df_mut_all['SAMPLE_ID'].isin(df_dcs.index)]
elif SUBSET == "ALL":
    df_dcs = df_dcs_all
    df_mut = df_mut_all
else:
    # Name the offending value and the accepted ones instead of raising a
    # bare, message-less ValueError.
    raise ValueError(
        f"Unknown SUBSET {SUBSET!r}; expected 'MSK-IMPACT468' or 'ALL'."
    )

# One "sentence" per sample: the list of gene symbols of its mutations.
ser_mut_tokens = df_mut.groupby('SAMPLE_ID')['Hugo_Symbol'].apply(list)
print('number of samples isolated: ', df_dcs.shape[0])
print('number of variant isolated: ', df_mut.shape[0])
print('number of sample sentences isolated: ', ser_mut_tokens.shape[0])

# align df_dcs and tokens: keep only samples that have a sentence, in the
# same order as ser_mut_tokens. The explicit copy guarantees that columns
# added from here on never write into (or warn about) df_dcs_all.
df_dcs = df_dcs.loc[ser_mut_tokens.index].copy()

# add sentences to dcs
df_dcs['sent'] = ser_mut_tokens

# NLP

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

In [None]:
# Binary target: 1 when the sample's ONCOTREE_CODE is one of the codes in
# genie.NF_ONCOTREE_CODES, 0 otherwise.
is_nf_code = df_dcs['ONCOTREE_CODE'].isin(genie.NF_ONCOTREE_CODES)
df_dcs['y_binary'] = is_nf_code.astype(int)

In [None]:
# y_binary is a 0/1 column, so its sum is the number of positive samples.
df_dcs['y_binary'].sum()

In [None]:
# How the positive (y_binary == 1) samples break down by ONCOTREE_CODE.
df_dcs.loc[df_dcs['y_binary'] == 1, 'ONCOTREE_CODE'].value_counts()

In [None]:
# How the positive (y_binary == 1) samples break down by CANCER_TYPE_DETAILED.
df_dcs.loc[df_dcs['y_binary'] == 1, 'CANCER_TYPE_DETAILED'].value_counts()

In [None]:
# How the positive (y_binary == 1) samples break down by CANCER_TYPE.
df_dcs.loc[df_dcs['y_binary'] == 1, 'CANCER_TYPE'].value_counts()

In [None]:
# Column of df_dcs that the classifiers below are trained to predict.
# The commented-out lines are the alternative targets.
#Y_PREDICT = 'y_binary'
#Y_PREDICT = 'ONCOTREE_CODE'
Y_PREDICT = "CANCER_TYPE"

In [None]:
# Load the precomputed sample embeddings from EMBEDDINGS_PATH rather than a
# user-specific absolute path, so the notebook is not tied to one machine.
# df_v holds the vectors (file has no header row); df_m holds the matching
# per-sample metadata (file has a header row that includes SAMPLE_ID).
# NOTE(review): row i of df_v is assumed to describe the sample in row i of
# df_m - confirm against the code that exported these files.
df_v = pd.read_csv(
    os.path.join(EMBEDDINGS_PATH, 'dme_MSK-IMPACT468_sample_100_vecs.tsv'),
    sep='\t',
    header=None,
)
df_m = pd.read_csv(
    os.path.join(EMBEDDINGS_PATH, 'dme_MSK-IMPACT468_sample_meta.tsv'),
    sep='\t',
)

In [None]:
def _per_class_report(y_true, y_pred):
    """Return the per-class rows of a classification report as a DataFrame.

    The summary entries ('accuracy', 'macro avg', 'weighted avg') are
    dropped and the table is transposed, leaving one row per target label
    and one column per metric.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    return (
        pd.DataFrame(report)
        .drop(columns=['accuracy', 'macro avg', 'weighted avg'])
        .T
    )


# Per-fold classification reports, keyed by model variant.
clf_report_stacks = {
    "v0": [],
    "v1": [],
}

# NOTE(review): folds are stratified on 'y_binary' even though the models
# are trained on Y_PREDICT - confirm this is intentional.
skf = StratifiedKFold(n_splits=5, random_state=4297, shuffle=True)
for indices_train, indices_test in skf.split(df_dcs, df_dcs['y_binary']):

    df_train = df_dcs.iloc[indices_train]
    df_test = df_dcs.iloc[indices_test]

    # basic logistic regression
    #======================================================
    # Each 'sent' entry is already a list of tokens, so the vectorizer's
    # tokenizer and preprocessor are identity functions.
    pipe_v0 = Pipeline([
        ('count', CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, token_pattern=None)),
        ('tfidf', TfidfTransformer()),
        ('svd', TruncatedSVD(n_components=50)),
        ('clf', LogisticRegression()),
    ])

    pipe_v0.fit(df_train['sent'], df_train[Y_PREDICT])
    # The former `get_feature_names()` call was dropped: its result was never
    # used and the method no longer exists in recent scikit-learn releases.
    y_pred = pipe_v0.predict(df_test['sent'])
    clf_report_stacks["v0"].append(_per_class_report(df_test[Y_PREDICT], y_pred))

    # use PMI sample embeddings
    #======================================================
    # Inner-join the fold's sample ids (the index of df_train / df_test)
    # against df_m's SAMPLE_ID column; the merged frame's index values are
    # then used as row positions into df_v.
    # NOTE(review): this assumes df_m has a default 0..n-1 index, that its
    # rows line up with df_v, and that every fold sample appears exactly once
    # in df_m - otherwise features and labels are misaligned. Confirm.
    map_train = pd.merge(df_train[[]], df_m[['SAMPLE_ID']], left_index=True, right_on='SAMPLE_ID')
    map_test = pd.merge(df_test[[]], df_m[['SAMPLE_ID']], left_index=True, right_on='SAMPLE_ID')

    x_train_vecs = df_v.iloc[map_train.index]
    x_test_vecs = df_v.iloc[map_test.index]

    pipe_v1 = Pipeline([
        ('clf', LogisticRegression())
    ])

    pipe_v1.fit(x_train_vecs, df_train[Y_PREDICT])
    y_pred = pipe_v1.predict(x_test_vecs)
    clf_report_stacks["v1"].append(_per_class_report(df_test[Y_PREDICT], y_pred))

In [None]:
# Collapse the per-fold reports into one summary table per model variant:
# the mean and standard deviation of every metric for each target label,
# ordered by ascending mean f1-score.
df_clf_reports = {}
for variant, fold_reports in clf_report_stacks.items():
    stacked = pd.concat(fold_reports).reset_index().rename(columns={"index": "target"})
    per_target = stacked.groupby('target')
    fold_mean = per_target.mean().add_suffix("-mean")
    fold_std = per_target.std().add_suffix("-std")
    summary = pd.concat([fold_mean, fold_std], axis=1)
    df_clf_reports[variant] = summary.sort_values('f1-score-mean')

In [None]:
# The report is sorted by ascending 'f1-score-mean', so the last 40 rows are
# the targets with the highest mean f1-score for the "v0" model.
df_clf_reports["v0"].tail(40)

In [None]:
# The report is sorted by ascending 'f1-score-mean', so the last 40 rows are
# the targets with the highest mean f1-score for the "v1" model.
df_clf_reports["v1"].tail(40)