In [1]:
# _ ____ ___ 
# | |___ |==]

# Tumor Classification

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [3]:
# Load the mouse GSVA scores and transpose them so that each row is one sample
# (the rows are later passed to SVC.fit as observations).
gsva_mouse = pd.read_csv("./results/gsva_results.csv", index_col=0).transpose()

# The tumor type of a sample is the part of its name before the first underscore.
pattern = re.compile("^[^_]+")
type_tumor = pd.DataFrame(
    [pattern.findall(sample_name)[0] for sample_name in gsva_mouse.index]
)

In [4]:
# Human GSVA scores, transposed so that rows are the samples handed to SVC.predict.
gsva_human = pd.read_csv("./results/gsva_human.csv", index_col=0).transpose()

In [5]:
# Read the name of the gene used as the scoring statistic from the first line
# of the file. The line is stripped so that a trailing newline (or stray
# whitespace) does not end up inside the column label used to index
# human_RNAseq further down.
with open("./results/IL10_most_affected_human", "r") as f:
    gene_statistic = f.readline().strip()

# Fail here, with a clear message, rather than with a KeyError on the lookup.
if not gene_statistic:
    raise ValueError("./results/IL10_most_affected_human does not contain a gene name")

In [6]:
# Human RNA-seq expression table, transposed so that the index holds the
# patient identifiers and each column is addressed by name (see gene_statistic).
human_RNAseq = pd.read_csv("./Data/RNAseq_expression_clean.csv", index_col=0).transpose()

In [7]:
# Expression of the selected gene for every row of human_RNAseq, as a plain list.
gene_statistic_values = human_RNAseq[gene_statistic].tolist()

In [8]:
# One-vs-rest 0/1 training labels for the two tumor types of interest.
tumor_labels = type_tumor[0]
is_pBIC = [int(label == "pBIC") for label in tumor_labels]
is_pBIC10 = [int(label == "pBIC10") for label in tumor_labels]

In [9]:
# Show the expression values of the selected gene, one per row of human_RNAseq.
print(gene_statistic_values)

[11.671, 13.835, 12.011, 13.353, 12.423, 13.237, 10.324, 10.473, 11.397, 5.471, 13.071, 9.028, 10.277, 10.954, 10.92, 12.038, 13.429, 8.697, 11.393, 12.137, 10.306, 13.834, 9.882, 6.84, 12.507, 13.795, 12.583, 11.333, 13.724, 12.299, 9.865, 12.469, 12.805, 13.768, 13.947, 10.881, 15.596, 12.114, 12.285, 13.103, 10.862, 13.625, 13.416, 10.831, 13.618, 12.557, 12.7, 11.91, 13.78, 11.355, 13.823, 9.716, 13.9, 13.811, 14.476, 9.884, 9.593, 12.402, 11.498, 13.923, 8.93, 13.94, 10.673, 11.789, 11.675, 9.766, 13.645, 10.888, 10.842, 6.528, 13.069, 12.581, 11.281, 12.846, 10.136, 13.493, 13.067, 13.774, 12.692, 11.318, 13.102, 12.517, 10.187, 13.127, 13.139, 10.452, 10.89, 13.857, 11.873, 13.551, 11.314, 9.966, 12.422, 10.855, 11.762, 9.523, 9.518, 11.967, 9.023, 11.814, 11.163, 12.889, 13.254, 13.676, 11.934, 6.743, 11.23, 10.887, 13.054, 12.453, 11.42, 13.712, 9.826, 9.278, 12.335, 11.577, 12.611, 10.179, 12.771, 13.917, 11.603, 12.948, 11.62, 13.832, 10.94, 13.168, 10.391, 12.442, 13.489, 1

## Find optimal gamma

In [10]:
# A gamma is kept for a classifier only when that classifier labels between
# 5% and 35% of the human samples as positive.
thresh_low = len(gsva_human) * 0.05
thresh_high = len(gsva_human) * 0.35

range_ = np.arange(0.1, 10.1, 0.1)

pBIC_range = []
pBIC10_range = []
for raw_gamma in range_:
    # Round once so the recorded gammas are clean one-decimal values.
    gamma = round(raw_gamma, 1)

    # Number of human samples each one-vs-rest classifier flags as positive.
    sum_pBIC = sum(SVC(gamma=gamma).fit(gsva_mouse, is_pBIC).predict(gsva_human))
    sum_pBIC10 = sum(SVC(gamma=gamma).fit(gsva_mouse, is_pBIC10).predict(gsva_human))

    if thresh_low <= sum_pBIC10 <= thresh_high:
        pBIC10_range.append(gamma)

    if thresh_low <= sum_pBIC <= thresh_high:
        pBIC_range.append(gamma)

print(pBIC10_range)
print(pBIC_range)

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
[0.2, 0.3, 0.4, 0.5, 0.6, 0.7]


In [11]:
# Grid search over every (gamma for pBIC, gamma for pBIC10) pair that passed the
# range filter. For each pair both one-vs-rest classifiers are refit on the mouse
# data, the human samples are labelled, and the pair is scored by the difference
# in mean expression of the selected gene between the two predicted groups.
combinations = [(round(a,1), round(b,1)) for a in pBIC_range for b in pBIC10_range]

results = []

for gamma_pBIC, gamma_pBIC10 in combinations:
    svm_pBIC = SVC(gamma = gamma_pBIC)
    svm_pBIC10 = SVC(gamma = gamma_pBIC10)

    svm_pBIC.fit(gsva_mouse, is_pBIC)
    svm_pBIC10.fit(gsva_mouse, is_pBIC10)

    pred_pBIC = svm_pBIC.predict(gsva_human)
    pred_pBIC10 = svm_pBIC10.predict(gsva_human)

    # Samples claimed by both classifiers are ambiguous. Use `and` here:
    # `x == 1 & y == 1` parses as the chained comparison `x == (1 & y) == 1`
    # because `&` binds tighter than `==`, which is only right by accident
    # for 0/1 labels.
    coincide = [x == 1 and y == 1 for x,y in zip(pred_pBIC, pred_pBIC10)]

    # Combined label per sample: 1 = pBIC only, 2 = pBIC10 only,
    # 0 = claimed by neither classifier or by both.
    prediction = [0 if both else x + 2 * y for x,y,both in zip(pred_pBIC, pred_pBIC10, coincide)]

    pred_pBIC = [label == 1 for label in prediction]
    pred_pBIC10 = [label == 2 for label in prediction]

    # Mean expression of the selected gene within each predicted group.
    # NOTE(review): this pairs gene_statistic_values with the predictions by
    # position, i.e. it assumes human_RNAseq and gsva_human list the samples in
    # the same order - confirm.
    ccl19_pBIC = np.mean([x for idx, x in enumerate(gene_statistic_values) if pred_pBIC[idx]])
    ccl19_pBIC10 = np.mean([x for idx, x in enumerate(gene_statistic_values) if pred_pBIC10[idx]])

    results.append(ccl19_pBIC - ccl19_pBIC10)

print(results)


[1.381916060606061, 1.44682828282828, 1.6403091350397183, 1.8100021793275207, 1.8993488142292474, 1.7035521390374324, 1.7538427272727262, 1.4092028571428585, 1.4741150793650775, 1.6675959315765159, 1.8372889758643183, 1.926635610766045, 1.73083893557423, 1.7811295238095237, 1.4156683333333344, 1.4805805555555533, 1.6740614077669917, 1.8437544520547942, 1.9331010869565208, 1.7373044117647058, 1.7875949999999996, 1.286364761904764, 1.351276984126983, 1.5447578363384213, 1.7144508806262237, 1.8037975155279504, 1.6080008403361354, 1.6582914285714292, 1.5219933333333344, 1.5869055555555533, 1.7803864077669918, 1.9500794520547942, 2.039426086956521, 1.8436294117647058, 1.8939199999999996, 1.4982394871794877, 1.5631517094017067, 1.756632561613145, 1.9263256059009475, 2.015672240802674, 1.8198755656108592, 1.870166153846153]


In [18]:
# Spot check: how many human samples the pBIC classifier flags at gamma = 0.6.
prov = SVC(gamma=0.6).fit(gsva_mouse, is_pBIC)
prov.predict(gsva_human).sum()

30

In [16]:
# Tabulate every gamma pair with its score and show the ten highest-scoring pairs.
data = dict(combinations=combinations, results=results)
prov = pd.DataFrame(data)
prov.sort_values(by="results", ascending=False).head(10)

Unnamed: 0,combinations,results
32,"(0.6, 0.5)",2.039426
39,"(0.7, 0.5)",2.015672
31,"(0.6, 0.4)",1.950079
18,"(0.4, 0.5)",1.933101
11,"(0.3, 0.5)",1.926636
38,"(0.7, 0.4)",1.926326
4,"(0.2, 0.5)",1.899349
34,"(0.6, 0.7)",1.89392
41,"(0.7, 0.7)",1.870166
17,"(0.4, 0.4)",1.843754


In [17]:
# Pick the gamma pair with the largest score; `values` is (gamma_pBIC, gamma_pBIC10).
best_score = max(results)
print(best_score)
values = combinations[results.index(best_score)]
print(values)

2.039426086956521
(0.6, 0.5)


The value of gamma for pBIC is 0.6  
The value of gamma for pBIC10 is 0.5

In [18]:
# Refit both one-vs-rest classifiers with the selected gammas
# (values[0] for pBIC, values[1] for pBIC10) and label the human samples.
svm_pBIC = SVC(gamma = values[0])
svm_pBIC10 = SVC(gamma = values[1])

svm_pBIC.fit(gsva_mouse, is_pBIC)
svm_pBIC10.fit(gsva_mouse, is_pBIC10)

pred_pBIC = svm_pBIC.predict(gsva_human)
pred_pBIC10 = svm_pBIC10.predict(gsva_human)

# Samples claimed by both classifiers are ambiguous. Use `and` here:
# `x == 1 & y == 1` parses as the chained comparison `x == (1 & y) == 1`
# because `&` binds tighter than `==`, which is only right by accident
# for 0/1 labels.
coincide = [x == 1 and y == 1 for x,y in zip(pred_pBIC, pred_pBIC10)]

# Relabel pBIC10 positives as 2 so the two predictions can be added into one code.
pred_pBIC10 = [x + 1 if x == 1 else x for x in pred_pBIC10]

# Final category per sample: 1 = pBIC only, 2 = pBIC10 only,
# 0 = claimed by neither classifier or by both.
prediction = [x + y if not bool_ else 0 for x,y,bool_ in zip(pred_pBIC, pred_pBIC10, coincide)]

In [22]:
# pred_pBIC10 was relabelled so that every sample flagged by the pBIC10
# classifier holds 2; halving the sum therefore counts those samples. This
# includes samples also flagged as pBIC, which are set to 0 in `prediction`.
sum(pred_pBIC10)//2

46

## Save data with classification

In [14]:
# Patient identifiers in the row order of human_RNAseq; used below to filter
# and order the clinical table.
names_patients = human_RNAseq.index.tolist()

In [15]:
# Load the clinical table; the file is semicolon-separated.
# NOTE(review): in the preview further down, the follow-up / PFS time columns
# (labelled "_yrs") show values such as 170568104. Check whether the file uses
# decimal commas or lost its decimal separator before using these columns.
patient_data = pd.read_csv("./Data/Schmitz.csv", sep=";")

In [16]:
# Keep only the patients that also appear in the expression data.
has_expression = patient_data["dbGaP subject ID"].isin(names_patients)
patient_data = patient_data[has_expression]

In [17]:
# Reorder the rows to follow names_patients: an ordered categorical built from
# the subject IDs sorts by position in names_patients rather than alphabetically.
# The helper column "prov" stays on the frame and is removed in a later cell.
subject_order = pd.Categorical(
    patient_data["dbGaP subject ID"],
    categories=names_patients,
    ordered=True,
)
patient_data = patient_data.assign(prov=subject_order).sort_values(by="prov")

In [18]:
# The helper ordering column is no longer needed once the rows are sorted.
patient_data = patient_data.drop(columns=["prov"])

In [19]:
# Attach the predicted category to each patient row, matched by position.
# NOTE(review): this assumes the rows of patient_data are in the same order as
# the samples in gsva_human that produced `prediction` - confirm.
patient_data = patient_data.assign(category=prediction)

In [20]:
# Preview the first rows to confirm the new "category" column is present.
patient_data.head()

Unnamed: 0,dbGaP subject ID,dbGaP accession,Diagnosis,Gene Expression Subgroup,Genetic Subtype,Biopsy Type,Treatment__,Gender,Age,Ann Arbor Stage,...,ECOG Performance Status,Number of Extranodal Sites,IPI Group,IPI Range,Status at Follow_up_ 0 Alive_ 1 Dead,Follow_up Time _yrs,Progression_Free Survival _PFS_ Status_ 0 No Progressoin_ 1 Progression,Progression_Free Survival _PFS_ Time _yrs,Included in Survival Analysis,category
300,DLBCL10502,phs001444,Diffuse large B cell lymphoma,GCB,Other,Pre-treatment,Immunochemotherapy,M,46.0,2.0,...,0.0,0.0,Low,0,0,170568104,0,170568104,Yes,0
318,DLBCL10521,phs001444,Diffuse large B cell lymphoma,ABC,Other,Pre-treatment,Immunochemotherapy,F,62.0,4.0,...,2.0,1.0,Intermediate,3,0,3359342916,0,3359342916,Yes,0
144,DLBCL11255,phs001444,Diffuse large B cell lymphoma,ABC,Other,Relapse,Ibrutinib monotherapy,M,64.0,,...,,,,15,0,0,0,0,No,0
145,DLBCL11256,phs001444,Diffuse large B cell lymphoma,ABC,Other,Relapse,Ibrutinib monotherapy,F,58.0,,...,,,,4,0,0,0,0,No,0
146,DLBCL11257,phs001444,Diffuse large B cell lymphoma,Unclass,Other,Relapse,Ibrutinib monotherapy,M,55.0,,...,,,,4,0,0,0,0,No,0


In [23]:
# Write the annotated clinical table to disk; index=False leaves the row index out of the file.
patient_data.to_csv("./results/Schmitz_w_results.csv", index = False)