## Importing libraries

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as tkr
import numpy as np
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so this is not limited to one library or warning type.
# NOTE(review): this also hides pandas/statsmodels diagnostics -- consider
# narrowing it to the specific warnings that are known to be noise.
warnings.simplefilter('ignore')

## Reading csv files

In [4]:
# Summary tables, one CSV per feature family. They are joined on their
# shared 'sample' column in the next cell.
reads = pd.read_csv('../summary_data/summary_tcga_reads.csv')
vj = pd.read_csv('../summary_data/summary_tcga_VJ.csv')
cdr3 = pd.read_csv('../summary_data/summary_tcga_cdr3.csv')
microbiome = pd.read_csv('../summary_data/summary_tcga_microbiome.csv')
mtDNA = pd.read_csv('../summary_data/summary_tcga_mtDNA.csv')

# Purity table: rows without a CPE value are discarded right at load time.
purity = pd.read_csv('../summary_data/purity.csv').dropna(subset=['CPE'])

## Joining data into single dataframe

In [5]:
# --- Combine the summary tables into one frame keyed by 'sample' ------------
# Columns that vj shares with cdr3 get a 'vj' suffix. Every other merge passes
# suffixes=(False, False): with no usable suffix pandas raises on a non-key
# column-name collision instead of silently renaming the columns.
sbt = cdr3.merge(vj, on='sample', suffixes=('', 'vj'))
for summary_table in (reads, microbiome, mtDNA):
    sbt = sbt.merge(summary_table, on='sample', suffixes=(False, False))
# The first 36 characters of the sample name are the join key against the
# sample sheet's file names (presumably a file UUID -- TODO confirm).
sbt['FileID'] = sbt['sample'].str[:36]

# --- Case-level metadata joined with the sample sheet and purity ------------
case_table = pd.read_csv('../summary_data/explore_case_table.tsv', delimiter="\t")
sample_sheet = pd.read_csv('../summary_data/gdc_sample_sheet.tsv', delimiter="\t")
metadata = case_table.merge(sample_sheet, on='Case ID', suffixes=(False, False))
metadata['FileID'] = metadata['File Name'].str[:36]
# Rows whose age is the placeholder "--" cannot be parsed below; drop them.
metadata = metadata.loc[metadata['Age at diagnosis'] != "--"]
metadata = metadata.merge(purity, on='Sample ID', suffixes=(False, False))
data = metadata.merge(sbt, on='FileID', suffixes=(False, False))

# --- Age in (fractional) years ----------------------------------------------
# 'Age at diagnosis' is split on spaces into "<number> <unit> <number> <unit>".
# reindex() guarantees that token columns 0..3 exist even when no row carries
# a second "<number> <unit>" pair (the split then yields fewer columns).
age_tokens = (
    data['Age at diagnosis']
    .str.split(" ", n=4, expand=True)
    .reindex(columns=range(4))
)
DAY_UNITS = ["days", "day"]
# First unit is already days, e.g. "200 days": the leading number must not be
# read as years (previously such a value was turned into 200 years).
days_only = age_tokens[1].isin(DAY_UNITS)
# Second pair is a day count, e.g. "65 years 123 days".
has_trailing_days = age_tokens[3].isin(DAY_UNITS)

leading_number = age_tokens[0].astype(float)
years = leading_number.mask(days_only, 0.0)
days = (
    age_tokens[2]
    .where(has_trailing_days)   # ignore the third token unless it counts days
    .astype(float)
    .fillna(0.0)
    .mask(days_only, leading_number)
)
data["Age"] = years + days / 365

# Keep only the key column and the feature columns in `sbt`; the model cell
# iterates over sbt.columns[1:] to pick the response variables.
sbt = sbt.drop(columns=['n_unique_pe', 'name', 'mtDNA_ID', 'FileID'])

# Boolean indicator: True where Gender is "Female".
data['Sex'] = data['Gender'] == "Female"

## Fraction of female samples

In [6]:
# 'Sex' is a boolean column (True where Gender is "Female"), so its mean is
# the fraction of rows flagged as female.
data['Sex'].mean()

0.5483091787439613

## Linear model for SBT features 

In [7]:
# Covariates shared by every per-feature regression.
COVARIATES = ['Sex', 'Age', 'n_unique_pe', 'CPE']

# Build the design matrix without assigning into a slice of `data`
# (that pattern triggers SettingWithCopyWarning); 'Sex' is cast bool -> 0/1.
covariates = data[COVARIATES].assign(Sex=lambda frame: frame['Sex'].astype(int))
# add_constant prepends the intercept column, so column 0 of X is the constant
# and columns 1..4 follow the order of COVARIATES.
X = sm.add_constant(np.array(covariates))

# One OLS fit per feature. The first column of `sbt` is skipped (presumably
# the 'sample' identifier -- TODO confirm); every remaining column is used
# as a response variable.
pvalue_rows = []
for feature in sbt.columns[1:]:
    fit = sm.OLS(np.array(data[feature]), X).fit()
    # fit.pvalues is ordered like the columns of X; drop the intercept entry.
    pvalue_rows.append([feature, *fit.pvalues[1:]])

# Column labels map to COVARIATES in order:
# Sex -> 'gender', Age -> 'age', n_unique_pe -> '#reads', CPE -> 'purity'.
pvalues = pd.DataFrame(
    pvalue_rows,
    columns=['feature', 'gender', 'age', '#reads', 'purity'],
)
pvalues.round(4)

Unnamed: 0,feature,gender,age,#reads,purity
0,nIGH,0.4251,0.8434,0.0124,0.0
1,nIGK,0.0863,0.4749,0.0002,0.0
2,nIGL,0.3223,0.5808,0.0086,0.0
3,nTCRA,0.1197,0.3375,0.0,0.0
4,nTCRB,0.2385,0.0549,0.0,0.0
5,nTCRD,0.2336,0.8952,0.0002,0.0
6,nTCRG,0.1367,0.4284,0.0,0.0
7,loadIGH,0.7464,0.5289,0.0792,0.0
8,loadIGK,0.7543,0.6053,0.0006,0.0
9,loadIGL,0.4992,0.5253,0.0788,0.0
