## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn import linear_model, ensemble, model_selection, metrics, tree, neighbors
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn import decomposition, feature_selection, svm, neighbors, datasets, preprocessing, neural_network , dummy
from matplotlib import pyplot

In [None]:
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from collections import Counter

## Load Data

#### Single X matrix and Single Y matrix

In [None]:
# Read the Avalon matrix file and index it by its first column, with the
# identifiers in that column coerced to strings.
X = pd.read_table('Output/Avalon_BMatrix_map_2018_08.tsv')
id_column = X.columns[0]
X[id_column] = X[id_column].astype(str)
X = X.set_index(id_column)

# Keep a second handle on the table as loaded here.
raw_X = X

# Disabled alternatives, kept for reference:
# X = X.T
#
# Y = pd.read_table('Output/L1000_signatures_pertid_2018_07.tsv')
# Y.set_index(Y.columns[0], inplace = True)
# Y = Y.T
#
# print(X.shape, Y.shape)

In [None]:
# Read the consensus perturbation table, key it by 'pert_id', and report
# its dimensions.
Y = pd.read_table('Input/consensus-perts.tsv').set_index('pert_id')
print(Y.shape)

In [None]:
# Preview the first rows of Y.
Y.head()

#### Multiple X matrices and single Y matrix

In [None]:
def _read_indexed_table(path):
    """Read a tab-separated file and index it by its first column.

    The first column is converted to ``str`` before it becomes the index,
    so identifiers from different files compare as strings.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        The table with its first column used as a string index.
    """
    table = pd.read_table(path)
    id_column = table.columns[0]
    table[id_column] = table[id_column].astype(str)
    return table.set_index(id_column)


# Disabled alternative source for Y:
# Y = pd.read_table('Output/L1000_signatures_pertid_2018_07.tsv')
# Y.set_index(Y.columns[0], inplace = True)
# Y = Y.T

Y = pd.read_table('Input/consensus-perts.tsv')
Y.set_index('pert_id', inplace=True)

# One table per file under RDKit_fps/; each is loaded the same way.
X1 = _read_indexed_table('RDKit_fps/Morg1_BMatrix_map_2018_08.tsv')
X2 = _read_indexed_table('RDKit_fps/TopologicalTorsion_BMatrix_map_2018_08.tsv')
X3 = _read_indexed_table('RDKit_fps/AtomPair_BMatrix_map_2018_08.tsv')
X4 = _read_indexed_table('RDKit_fps/MACCs_BMatrix_map_2018_08.tsv')
X5 = _read_indexed_table('RDKit_fps/RDKfps2_BMatrix_map_2018_08.tsv')
X6 = _read_indexed_table('RDKit_fps/Avalon_BMatrix_map_2018_08.tsv')

# Disabled additional input (note the extra transpose):
# X7 = pd.read_table('Output/L1000_Scaffolds_2018_07.tsv')
# X7[X7.columns[0]] = X7[X7.columns[0]].astype(str)
# X7 = X7.set_index(X7.columns[0])
# X7 = X7.T

# Join the tables side by side on their indices. Rows missing from any
# table come out of the concat with NaNs and are removed by dropna().
X = pd.concat([X1, X2, X3, X4, X5, X6], axis = 1, sort=True)
# X = pd.concat([X2], axis = 1, sort=True)
X = X.dropna()
X.head()

In [None]:
# Show the dimensions of X.
X.shape

## Only get drugs shared between X and Y

In [None]:
# Keep only the identifiers present in both indices, in sorted order, and
# select those rows from each table so X and Y line up row for row.
# After this cell X and Y are plain NumPy arrays, no longer DataFrames.
shared_drugs = sorted(set(X.index).intersection(Y.index))
X = X.loc[shared_drugs].values
Y = Y.loc[shared_drugs].values

## Dimensionality Reduction

In [None]:
# Factorise X with NMF and keep the transformed matrix as X_dr.
# Disabled alternatives:
# dr_model = decomposition.LatentDirichletAllocation(n_components=100, learning_method= 'online')
# dr_model = decomposition.NMF(n_components=100)
n_latent_components = 100
dr_model = decomposition.NMF(init='nndsvda', n_components=n_latent_components)

# fit_transform both learns the factorisation and returns the transformed X.
X_dr = dr_model.fit_transform(X)

In [None]:
# Show the dimensions of the reduced matrix.
X_dr.shape

In [None]:
# Wrap the reduced matrix in a DataFrame to preview its first rows.
X_dr_df = pd.DataFrame(X_dr)
X_dr_df.head()

In [None]:
# Rebind X to the reduced matrix; later cells that read X now see X_dr.
X = X_dr

In [None]:
# Show the dimensions of X and Y side by side.
X.shape,Y.shape

## Pick the regressor

In [None]:
# Choose the estimator bound to `regressor`; exactly one assignment should
# be active at a time, the rest stay commented out as alternatives.
#
# `normalize` is deliberately not passed to Ridge: it was set to False (its
# default) here, and the keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2, where passing it raises TypeError. Omitting it behaves the
# same on old releases and keeps the cell runnable on new ones.
#
# NOTE(review): some disabled alternatives below use keyword names from
# older scikit-learn releases (e.g. loss='ls', base_estimator); check them
# against the installed version before re-enabling.
# regressor = linear_model.MultiTaskLasso()
# regressor = linear_model.MultiTaskElasticNet()
regressor = linear_model.Ridge(fit_intercept=True, copy_X=True, solver='auto')
# regressor = ensemble.RandomForestRegressor(n_estimators = 60, n_jobs = 7)
# regressor = linear_model.BayesianRidge()
# regressor = linear_model.LassoLars() 
# regressor = ensemble.GradientBoostingRegressor(n_estimators = 5, max_depth= 3, min_samples_split= 3, learning_rate= 0.01, loss= 'ls')
# regressor = svm.SVR(degree = 1, epsilon=.01, kernel = 'poly')
# regressor = ensemble.AdaBoostRegressor(base_estimator=None, n_estimators=50, learning_rate=1.0, loss='linear', random_state=None)
# regressor = neural_network.MLPRegressor(hidden_layer_sizes=(50, ))
# regressor = neighbors.KNeighborsRegressor(n_neighbors=100)
# regressor = dummy.DummyRegressor(strategy = 'median')


## Run the model

In [None]:
# Leave-one-out evaluation: refit the regressor on every row but one and
# score its prediction for the single held-out row.
cv = model_selection.LeaveOneOut()
r2ss = []
start = time.time()
for fit_rows, held_out in cv.split(X):
    regressor.fit(X[fit_rows], Y[fit_rows])
    predicted = regressor.predict(X[held_out])

    # Each split holds out exactly one row, so [0] picks that row; R^2 is
    # then computed across the values within that row.
    r2ss.append(metrics.r2_score(Y[held_out][0], predicted[0]))

    # Running wall-clock time since the loop began, printed every split.
    print(time.time() - start)

In [None]:
# Show the first five scores.
r2ss[:5]

In [None]:
# Load previously saved scores from disk. This rebinds r2ss, replacing
# whatever it held before (e.g. the list built by the evaluation loop).
r2ss = np.load('Output/Predicability/All_r2ss_pred.npy')

In [None]:
# Pair each shared identifier with its score and write the table to CSV.
# NOTE(review): shared_drugs and r2ss must have the same length here;
# confirm when r2ss was loaded from disk rather than computed in-session.
pred_vect_df = pd.DataFrame({'Pert_id':shared_drugs, 'R2_Score':r2ss})
pred_vect_df.to_csv('Output/Pred_vect.csv')

In [None]:
# Persist the scores as a NumPy array (np.save appends '.npy' to the name).
r2ss_pred = np.array(r2ss)
np.save('Output/Predicability/All_r2ss_pred_dummy', r2ss_pred)

In [None]:
# Plot the distribution of the scores.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; switch to
# histplot/displot when the environment is upgraded.
sns.distplot(r2ss, color='#d2afff')

In [None]:
# Colour codes noted for this plot:
#008261
#598ff9
#d2afff
# Same distribution plot with 75 bins. Despite its name, `fig` holds the
# object returned by distplot (it exposes set_xlim, i.e. an axes-like
# object rather than a Figure; presumably a matplotlib Axes).
fig = sns.distplot(r2ss, bins = 75, color='#d2afff')

# Restrict the x-axis to the interval [0, 1].
fig.set_xlim(0,1)


In [None]:
# Inspect what kind of object distplot returned.
type(fig)

In [None]:
# Show the largest and smallest score.
max(r2ss), min(r2ss)

In [None]:
# Min-max rescale the scores: the smallest maps to 0 and the largest to 1.
r2min, r2max = min(r2ss), max(r2ss)
r2_span = r2max - r2min
r2ss_scale_pred = [(score - r2min) / r2_span for score in r2ss]

In [None]:
# Sanity check: first five rescaled scores and how many there are.
print(r2ss_scale_pred[:5])
print(len(r2ss_scale_pred))

In [None]:
# Persist the rescaled scores as a NumPy array (np.save appends '.npy').
r2ss_scaled_pred = np.array(r2ss_scale_pred)
np.save('Output/Predicability/All_r2ss_scaled_pred_dummy', r2ss_scaled_pred)

In [None]:
# Rebind Y to the list of rescaled scores, so cells that read Y after this
# point use them as the target instead of the original Y.
Y = r2ss_scale_pred

## Predict the predictability

In [None]:
# Second-stage model: estimate each row's score (held in Y) from X, using
# 10-fold cross-validation with shuffling.
# Disabled alternative:
# pred_regressor = linear_model.Ridge(fit_intercept=True, normalize=False, copy_X=True, solver='auto')
pred_regressor = ensemble.RandomForestRegressor(n_estimators=300, n_jobs=7)

cv = model_selection.KFold(n_splits=10, shuffle=True)
pred_r2s = []
start = time.time()
for fit_rows, eval_rows in cv.split(X):
    # Y is indexed element by element so it may be a plain Python list.
    fit_targets = [Y[row] for row in fit_rows]
    eval_targets = [Y[row] for row in eval_rows]

    pred_regressor.fit(X[fit_rows], fit_targets)
    fold_predictions = pred_regressor.predict(X[eval_rows])

    # One R^2 value per fold.
    pred_r2s.append(metrics.r2_score(eval_targets, fold_predictions))

    # Running wall-clock time since the loop began, printed every fold.
    print(time.time() - start)

In [None]:
# Show the per-fold R^2 values.
pred_r2s

## Top/Bottom 10ish drugs based on r2 values

In [None]:
# Rank positions by score (ascending) once, then slice both ends.
x = np.array(r2ss)
ascending_order = np.argsort(x)

# NOTE(review): the "top" slice takes 92 entries although the names say 20
# and the bottom slice takes 20; confirm which count is intended.
x_top20 = ascending_order[-92:]
x_bottom20 = ascending_order[:20]

# Translate ranked positions back into identifiers.
top20 = [shared_drugs[position] for position in x_top20]
bottom20 = [shared_drugs[position] for position in x_bottom20]

In [None]:
# Show the top-ranked identifiers alongside their scores.
top20, [x[i] for i in x_top20]

In [None]:
# Show the identifiers at both ends of the ranking.
top20, bottom20

In [None]:
# Show the scores at both ends of the ranking.
[x[i] for i in x_top20],[x[i] for i in x_bottom20]