In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
import openpyxl

In [2]:
# --- PDTC inputs -----------------------------------------------------------
# Response table, tab-separated with a header row.
response_pdtc = pd.read_csv('../data/DrugResponsesAUCModels.txt', sep='\t')

# Headerless, tab-separated table: column 0 supplies the (unnamed) row labels
# and is removed from the data columns.
fingerprint_pdtc = (
    pd.read_csv('../data/fingerprint.txt', sep="\t", header=None)
    .set_index(0)
    .rename_axis(None)
)

# Expression table: the 'Gene' column supplies the (unnamed) row labels.
expression_pdtc = (
    pd.read_csv('../data/with_human/pdtc_L1000.csv')
    .set_index('Gene')
    .rename_axis(None)
)

# --- CCLE inputs -----------------------------------------------------------
response_ccle = pd.read_csv(
    '../data/secondary-screen-dose-response-curve-parameters_abstract_breast_modified.csv'
)

# Same layout handling as the PDTC fingerprint table: column 0 -> row labels.
fingerprint_ccle = (
    pd.read_csv('../data/ccle_fingerprint_only.txt', sep="\t", header=None)
    .set_index(0)
    .rename_axis(None)
)

def return_fingerprint(smiles, fingerprints=None):
    """Decode the stored fingerprint text of one compound into an integer vector.

    The first data column of the matching row is read as a string, its first
    and last characters are dropped, and the remainder is parsed as
    tab-separated integers.

    Parameters
    ----------
    smiles : hashable
        Row label to look up in the fingerprint table.
    fingerprints : pandas.DataFrame, optional
        Table to read from. When omitted, the module-level ``fingerprint_ccle``
        is used, which is what the single-argument call has always done.

    Returns
    -------
    numpy.ndarray
        One-dimensional integer array with the parsed values.

    Raises
    ------
    KeyError
        If ``smiles`` is not a row label of the table.
    """
    table = fingerprint_ccle if fingerprints is None else fingerprints

    row = table.loc[smiles]
    if isinstance(row, pd.DataFrame):
        # A duplicated label makes .loc return several rows; use the first one
        # instead of failing later with an unrelated TypeError.
        row = row.iloc[0]

    encoded = row.values[0]
    # Strip one leading and one trailing character before parsing
    # (presumably enclosing brackets -- TODO confirm against the data file).
    return np.fromstring(encoded[1:-1], dtype=int, sep='\t')

# CCLE expression table: the unnamed leading CSV column ('Unnamed: 0') supplies
# the (unnamed) row labels and is removed from the data columns.
expression_ccle = (
    pd.read_csv('../data/with_human/cell_line_L1000.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# --- TCGA inputs -----------------------------------------------------------
# Tab-separated response table; the 'Unnamed: 0' column is discarded.
response_tcga = (
    pd.read_csv('../data/tcga_response_exist_conversion.csv', sep='\t')
    .drop(columns='Unnamed: 0')
)

# Headerless, tab-separated table: column 0 -> row labels.
fingerprint_tcga = (
    pd.read_csv('../data/tcga_fingerprint.txt', sep='\t', header=None)
    .set_index(0)
    .rename_axis(None)
)

# Expression table: the 'genes' column supplies the (unnamed) row labels.
expression_tcga = (
    pd.read_csv('../data/with_human/tcga_L1000.csv')
    .set_index('genes')
    .rename_axis(None)
)

In [25]:
# Every train/test split below shares this seed, so re-running the cell
# reproduces the same workbook. Because the seed is shared, the three
# regressors of one cohort are also scored on the same held-out rows.
RANDOM_STATE = 0
TEST_SIZE = 0.2

# (section heading, heading row). Within a section the 'self' score sits one
# row below the heading and the 'human' (TCGA) score two rows below.
SHEET_SECTIONS = (
    ('Logistic regression', 2),
    ('Linear regression', 6),
    ('Ridge regression', 10),
    ('LASSO regression', 14),
)

# (heading row, zero-argument factory) for the continuous-target models.
# NOTE(review): the recorded cell output shows convergence warnings from the
# coordinate-descent (Lasso) and logistic solvers. Scaling the features or
# raising max_iter would address them but changes the fitted models, so the
# estimator settings are deliberately left as they were.
REGRESSORS = (
    (6, LinearRegression),
    (10, Ridge),
    (14, lambda: Lasso(alpha=0.0001)),
)


def _binarize(values, threshold):
    """Label each value 'response' when strictly below ``threshold``, else 'non-response'."""
    return ['response' if value < threshold else 'non-response' for value in values]


def _thresholded_accuracy(predictions, labels, threshold):
    """Return the fraction of predictions that fall on the same side of ``threshold`` as their label.

    A prediction strictly below ``threshold`` counts as 'response'; anything
    else counts as 'non-response'.
    """
    pairs = list(zip(predictions, labels))
    if not pairs:
        raise ValueError('cannot score an empty prediction set')
    hits = sum(
        1 for predicted, label in pairs
        if (predicted < threshold) == (label == 'response')
    )
    return hits / len(pairs)


# ---------------------------------------------------------------------------
# Feature matrices. None of them depends on the threshold, so they are built
# once instead of once per sheet. Rows are read positionally via itertuples
# because integer lookups on a label-indexed Series are deprecated in pandas.
# ---------------------------------------------------------------------------

# PDTC: position 0 selects the expression column, position 1 the fingerprint
# row, position 2 is the continuous target.
X_pdtc, Y_pdtc = [], []
for row in response_pdtc.itertuples(index=False, name=None):
    sample_id, compound, target = row[0], row[1], row[2]
    if compound in fingerprint_pdtc.index:
        X_pdtc.append(expression_pdtc[sample_id].tolist() + fingerprint_pdtc.loc[compound].tolist())
        Y_pdtc.append(target)

# CCLE: same idea, using positions 2 / 3 / 4 of the response table.
X_ccle, Y_ccle = [], []
for row in response_ccle.itertuples(index=False, name=None):
    sample_id, compound, target = row[2], row[3], row[4]
    if compound in fingerprint_ccle.index:
        X_ccle.append(expression_ccle[sample_id].tolist() + return_fingerprint(compound).tolist())
        Y_ccle.append(target)

# TCGA: a response row is paired with the first expression column whose first
# 12 characters equal the value at position 0. The prefix map is built once so
# each row is a dictionary lookup instead of a scan over all columns.
first_column_for_prefix = {}
for column_name in expression_tcga.columns:
    first_column_for_prefix.setdefault(column_name[:12], column_name)

X_tcga, Y_tcga_binary = [], []
for row in response_tcga.itertuples(index=False, name=None):
    patient, compound, outcome = row[0], row[1], row[2]
    if compound not in fingerprint_tcga.index:
        continue
    column_name = first_column_for_prefix.get(patient)
    if column_name is None:
        continue
    X_tcga.append(expression_tcga[column_name].tolist() + fingerprint_tcga.loc[compound].tolist())
    if outcome in ('Complete Response', 'Partial Response'):
        Y_tcga_binary.append('response')
    else:
        Y_tcga_binary.append('non-response')

# ---------------------------------------------------------------------------
# One sheet per threshold 0.0, 0.1, ..., 1.0.
# ---------------------------------------------------------------------------
wb = openpyxl.Workbook()

for i in range(11):
    threshold = i / 10
    ws = wb.create_sheet(str(threshold))

    for heading, top in SHEET_SECTIONS:
        ws['A' + str(top)] = heading
        ws['B' + str(top)] = 'pdtc'
        ws['C' + str(top)] = 'cell line'
        ws['A' + str(top + 1)] = 'self'
        ws['A' + str(top + 2)] = 'human'

    Y_pdtc_binary = _binarize(Y_pdtc, threshold)
    Y_ccle_binary = _binarize(Y_ccle, threshold)

    # (sheet column, features, continuous targets, binary labels)
    cohorts = (
        ('B', X_pdtc, Y_pdtc, Y_pdtc_binary),
        ('C', X_ccle, Y_ccle, Y_ccle_binary),
    )

    for column, features, targets, labels in cohorts:
        # Classifier on the thresholded labels.
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE)
        try:
            reg = LogisticRegression()
            reg.fit(X_train, Y_train)
            self_score = reg.score(X_test, Y_test)
            human_score = reg.score(X_tcga, Y_tcga_binary)
        except ValueError:
            # scikit-learn raises ValueError when the data cannot be fitted or
            # scored (e.g. only one class is present at an extreme threshold).
            # Anything else, including KeyboardInterrupt, now propagates.
            self_score = human_score = 'NA'
        ws[column + '3'] = self_score
        ws[column + '4'] = human_score

        # Regressors on the continuous target; the TCGA score is obtained by
        # thresholding the predicted value.
        for top, make_model in REGRESSORS:
            X_train, X_test, Y_train, Y_test = train_test_split(
                features, targets, test_size=TEST_SIZE, random_state=RANDOM_STATE)
            reg = make_model()
            reg.fit(X_train, Y_train)
            ws[column + str(top + 1)] = reg.score(X_test, Y_test)
            ws[column + str(top + 2)] = _thresholded_accuracy(
                reg.predict(X_tcga), Y_tcga_binary, threshold)

# Drop the default sheet openpyxl creates so only the threshold sheets remain.
wb.remove(wb['Sheet'])
wb.save('../result/regression_interval.xlsx')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iteration

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iteration

  model = cd_fast.enet_coordinate_descent(
