In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

# 0. Data preparation

In [2]:
threshold = 0.5

## 0.1 PDTC

In [3]:
response_pdtc = pd.read_csv('../data/DrugResponsesAUCModels.txt', sep='\t')

In [4]:
fingerprint_pdtc = pd.read_csv('../data/fingerprint.txt', sep = "\t", header=None)
fingerprint_pdtc.index = fingerprint_pdtc[0].tolist()
fingerprint_pdtc.drop(0, axis=1, inplace=True)

In [5]:
expression_pdtc = pd.read_csv('../data/with_human/pdtc_L1000.csv')
expression_pdtc.index = expression_pdtc['Gene'].tolist()
expression_pdtc.drop('Gene', axis=1, inplace=True)

In [6]:
X_pdtc = []
Y_pdtc = []
Y_pdtc_binary = []
for idx, line in response_pdtc.iterrows():
    if line[1] in fingerprint_pdtc.index:
        X_pdtc.append(expression_pdtc[line[0]].tolist() + fingerprint_pdtc.loc[line[1]].tolist())
        Y_pdtc.append(line[2])
        if line[2] < threshold:
            Y_pdtc_binary.append('response')
        else:
            Y_pdtc_binary.append('non-response')

## 0.2 CCLE

In [7]:
response_ccle = pd.read_csv('../data/secondary-screen-dose-response-curve-parameters_abstract_breast_modified.csv')

In [8]:
fingerprint_ccle = pd.read_csv('../data/ccle_fingerprint_only.txt', sep = "\t", header=None)
fingerprint_ccle.index = fingerprint_ccle[0].tolist()
fingerprint_ccle.drop(0, axis=1, inplace=True)

In [9]:
def return_fingerprint(smiles):
    out = np.fromstring(fingerprint_ccle.loc[smiles].values[0][1:-1], dtype=int, sep='\t')
    return out

In [10]:
expression_ccle = pd.read_csv('../data/with_human/cell_line_L1000.csv')
expression_ccle.index = expression_ccle['Unnamed: 0'].tolist()
expression_ccle.drop('Unnamed: 0', axis=1, inplace=True)

In [11]:
X_ccle = []
Y_ccle = []
Y_ccle_binary = []
for idx, line in response_ccle.iterrows():
    if line[3] in fingerprint_ccle.index:
        X_ccle.append(expression_ccle[line[2]].tolist() + return_fingerprint(line[3]).tolist())
        Y_ccle.append(line[4])
        if line[4] < threshold:
            Y_ccle_binary.append('response')
        else:
            Y_ccle_binary.append('non-response')

## 0.3 TCGA

In [17]:
response_tcga = pd.read_csv('../data/tcga_response_exist_conversion.csv', sep='\t')
response_tcga.drop('Unnamed: 0', axis=1, inplace=True)

In [24]:
fingerprint_tcga = pd.read_csv('../data/tcga_fingerprint.txt', sep='\t', header=None)
fingerprint_tcga.index = fingerprint_tcga[0].tolist()
fingerprint_tcga.drop(0, axis=1, inplace=True)

In [32]:
expression_tcga = pd.read_csv('../data/with_human/tcga_L1000.csv')
expression_tcga.index = expression_tcga['genes'].tolist()
expression_tcga.drop('genes', axis=1, inplace=True)

In [34]:
X_tcga = []
Y_tcga_binary = []
for idx, line in response_tcga.iterrows():
    if line[1] in fingerprint_tcga.index:
        for patient_code in expression_tcga.columns:
            if patient_code[:12] == line[0]:
                X_tcga.append(expression_tcga[patient_code].tolist() + fingerprint_tcga.loc[line[1]].tolist())
                if line[2] == 'Complete Response' or line[2] == 'Partial Response':
                    Y_tcga_binary.append('response')
                else:
                    Y_tcga_binary.append('non-response')
                break

# 1. Logistic Regression

## 1.1 PDTC -> TCGA

In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc_binary, test_size=0.2)

In [41]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test) # mean accuracy
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9233128834355828

In [42]:
scores = reg.score(X_tcga, Y_tcga_binary) # mean accuracy
scores

0.695

## 1.2 CCLE -> TCGA

In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle_binary, test_size=0.2)

In [44]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test) # mean accuracy
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.966566787572938

In [45]:
scores = reg.score(X_tcga, Y_tcga_binary) # mean accuracy
scores

0.14333333333333334

# 2. Linear Regression

## 2.1 PDTC -> TCGA

In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc, test_size=0.2)

In [49]:
reg = LinearRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.5721273765113899

In [53]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.8616666666666667


## 2.2 CCLE -> TCGA

In [54]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle, test_size=0.2)

In [55]:
reg = LinearRegression()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.5886986513767958

In [56]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.6266666666666667


# 3. Ridge Regression

## 3.1 PDTC -> TCGA

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc, test_size=0.2)

In [59]:
reg = Ridge()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.47553152854875047

In [60]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.6916666666666667


## 3.2 CCLE -> TCGA

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle, test_size=0.2)

In [62]:
reg = Ridge()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

0.597637114119458

In [63]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.135


# 4. LASSO Regression

## 4.1 PDTC -> TCGA

In [64]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pdtc, Y_pdtc, test_size=0.2)

In [65]:
reg = Lasso()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

-0.0004151107210463856

In [66]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.8616666666666667


In [70]:
reg = Lasso(alpha=0.0001)
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

  model = cd_fast.enet_coordinate_descent(


0.5321509076659257

In [71]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.14


## 4.2 CCLE -> TCGA

In [67]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ccle, Y_ccle, test_size=0.2)

In [68]:
reg = Lasso()
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

-0.00038596610680352583

In [69]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))

0.13833333333333334


In [None]:
reg = Lasso(alpha=0.0001)
reg.fit(X_train, Y_train)
scores = reg.score(X_test, Y_test)
scores

In [None]:
right = 0
wrong = 0
predict = reg.predict(X_tcga)
for expect, real in zip(predict, Y_tcga_binary):
    if expect < threshold:
        if real == 'response':
            right += 1
        else:
            wrong += 1
    else:
        if real == 'response':
            wrong += 1
        else:
            right += 1
print(right/(right + wrong))