In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

# 0. Data preparation

## 0.1 PDTC

In [2]:
response_pdtc = pd.read_csv('../data/DrugResponsesAUCModels.txt', sep='\t')
threshold_pdtc = response_pdtc['AUC'].median()

In [3]:
fingerprint_pdtc = pd.read_csv('../data/fingerprint.txt', sep = "\t", header=None)
fingerprint_pdtc.index = fingerprint_pdtc[0].tolist()
fingerprint_pdtc.drop(0, axis=1, inplace=True)

In [4]:
expression_pdtc = pd.read_csv('../data/with_human/pdtc_L1000.csv')
expression_pdtc.index = expression_pdtc['Gene'].tolist()
expression_pdtc.drop('Gene', axis=1, inplace=True)

In [5]:
X_pdtc = []
Y_pdtc = []
Y_pdtc_binary = []
for idx, line in response_pdtc.iterrows():
    if line[1] in fingerprint_pdtc.index:
        X_pdtc.append(expression_pdtc[line[0]].tolist() + fingerprint_pdtc.loc[line[1]].tolist())
        Y_pdtc.append(line[2])
        if line[2] < threshold_pdtc:
            Y_pdtc_binary.append('response')
        else:
            Y_pdtc_binary.append('non-response')

## 0.2 CCLE

In [6]:
response_ccle = pd.read_csv('../data/secondary-screen-dose-response-curve-parameters_abstract_breast_modified.csv')
threshold_ccle = response_ccle['auc'].median()

In [7]:
fingerprint_ccle = pd.read_csv('../data/ccle_fingerprint_only.txt', sep = "\t", header=None)
fingerprint_ccle.index = fingerprint_ccle[0].tolist()
fingerprint_ccle.drop(0, axis=1, inplace=True)

In [8]:
def return_fingerprint(smiles):
    out = np.fromstring(fingerprint_ccle.loc[smiles].values[0][1:-1], dtype=int, sep='\t')
    return out

In [9]:
expression_ccle = pd.read_csv('../data/with_human/cell_line_L1000.csv')
expression_ccle.index = expression_ccle['Unnamed: 0'].tolist()
expression_ccle.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
X_ccle = []
Y_ccle = []
Y_ccle_binary = []
for idx, line in response_ccle.iterrows():
    if line[3] in fingerprint_ccle.index:
        X_ccle.append(expression_ccle[line[2]].tolist() + return_fingerprint(line[3]).tolist())
        Y_ccle.append(line[4])
        if line[4] < threshold_ccle:
            Y_ccle_binary.append('response')
        else:
            Y_ccle_binary.append('non-response')

## 0.3 TCGA

In [11]:
response_tcga = pd.read_csv('../data/tcga_response_exist_conversion.csv', sep='\t')
response_tcga.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
fingerprint_tcga = pd.read_csv('../data/tcga_fingerprint.txt', sep='\t', header=None)
fingerprint_tcga.index = fingerprint_tcga[0].tolist()
fingerprint_tcga.drop(0, axis=1, inplace=True)

In [13]:
expression_tcga = pd.read_csv('../data/with_human/tcga_L1000.csv')
expression_tcga.index = expression_tcga['genes'].tolist()
expression_tcga.drop('genes', axis=1, inplace=True)

In [14]:
X_tcga = []
Y_tcga_binary = []
for idx, line in response_tcga.iterrows():
    if line[1] in fingerprint_tcga.index:
        for patient_code in expression_tcga.columns:
            if patient_code[:12] == line[0]:
                X_tcga.append(expression_tcga[patient_code].tolist() + fingerprint_tcga.loc[line[1]].tolist())
                if line[2] == 'Complete Response' or line[2] == 'Partial Response':
                    Y_tcga_binary.append('response')
                else:
                    Y_tcga_binary.append('non-response')
                break

# 1. Logistic Regression

## 1.1 PDTC -> TCGA

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc_binary, test_size=0.2)

In [16]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test) # mean accuracy
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7914110429447853

In [17]:
scores = reg.score(X_tcga, Y_tcga_binary) # mean accuracy
scores

0.6266666666666667

## 1.2 CCLE -> TCGA

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle_binary, test_size=0.2)

In [19]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test) # mean accuracy
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6946853808547547

In [20]:
scores = reg.score(X_tcga, Y_tcga_binary) # mean accuracy
scores

0.5

# 2. Linear Regression

## 2.1 PDTC -> TCGA

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc, test_size=0.2)

In [24]:
reg = LinearRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.543121591107796

In [25]:
tp = 0
fp = 0
tn = 0
fn = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold_pdtc:
        if real == 'response':
            tp += 1
        else:
            fp += 1
    else:
        if real == 'response':
            fn += 1
        else:
            tn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.873786
recall : 0.522244
accuracy : 0.523333
balacned accuracy : 0.526182
F1 score : 0.653753


## 2.2 CCLE -> TCGA

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle, test_size=0.2)

In [27]:
reg = LinearRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

-1.2473322443580108e+16

In [28]:
tp = 0
fp = 0
tn = 0
fn = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold_ccle:
        if real == 'response':
            tp += 1
        else:
            fp += 1
    else:
        if real == 'response':
            fn += 1
        else:
            tn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.860972
recall : 0.994197
accuracy : 0.856667
balacned accuracy : 0.497099
F1 score : 0.922801


# 3. Ridge Regression

## 3.1 PDTC -> TCGA

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc, test_size=0.2)

In [30]:
reg = Ridge()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.5828681701676285

In [31]:
tp = 0
fp = 0
tn = 0
fn = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold_pdtc:
        if real == 'response':
            tp += 1
        else:
            fp += 1
    else:
        if real == 'response':
            fn += 1
        else:
            tn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.896458
recall : 0.636364
accuracy : 0.623333
balacned accuracy : 0.589266
F1 score : 0.744344


## 3.2 CCLE -> TCGA

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle, test_size=0.2)

In [33]:
reg = Ridge()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.5887121692579464

In [34]:
tp = 0
fp = 0
tn = 0
fn = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold_ccle:
        if real == 'response':
            tp += 1
        else:
            fp += 1
    else:
        if real == 'response':
            fn += 1
        else:
            tn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.877193
recall : 0.483559
accuracy : 0.496667
balacned accuracy : 0.530936
F1 score : 0.623441


# 4. LASSO Regression

## 4.1 PDTC -> TCGA

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc, test_size=0.2)

In [38]:
reg = Lasso(alpha=0.0001)
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

  model = cd_fast.enet_coordinate_descent(


0.5219267940277483

In [39]:
tp = 0
fp = 0
tn = 0
fn = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold_pdtc:
        if real == 'response':
            tp += 1
        else:
            fp += 1
    else:
        if real == 'response':
            fn += 1
        else:
            tn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.882064
recall : 0.694391
accuracy : 0.656667
balacned accuracy : 0.558039
F1 score : 0.777056


## 4.2 CCLE -> TCGA

In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle, test_size=0.2)

In [41]:
reg = Lasso(alpha=0.0001)
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

  model = cd_fast.enet_coordinate_descent(


0.5215656788564855

In [42]:
tp = 0
fp = 0
tn = 0
fn = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold_ccle:
        if real == 'response':
            tp += 1
        else:
            fp += 1
    else:
        if real == 'response':
            fn += 1
        else:
            tn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.877193
recall : 0.483559
accuracy : 0.496667
balacned accuracy : 0.530936
F1 score : 0.623441


In [43]:
tp = 0
fp = 0
tn = 0
fn = 0
for real in Y_tcga_binary:
    if real == 'response':
        tp += 1
    else:
        fp += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)
balanced_accuracy = (tp / (tp + fn) + tn / (tn + fp)) / 2
F1 = 2 * (precision * recall) / (precision + recall)
print('precision : %f\nrecall : %f\naccuracy : %f\nbalacned accuracy : %f\nF1 score : %f' % (precision, recall, accuracy, balanced_accuracy, F1))

precision : 0.861667
recall : 1.000000
accuracy : 0.861667
balacned accuracy : 0.500000
F1 score : 0.925694
