In [58]:
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn import tree, linear_model, svm, ensemble, metrics
from matplotlib import pyplot as plt
import xgboost as xgb
import scipy
import numpy as np

import compare_y
import cross_validation_result as cvr

%matplotlib inline

In [68]:
# PSL cross-validation results, linear simulation.
# `cvr.cross_val_result` is a project helper that is handed the two
# directories below; what it returns is not visible in this notebook
# (the `_df` suffix suggests a DataFrame -- TODO confirm).
TRUTH_linear, INFER_linear = (
    "../psl/data/simulation/linear/seed0/cross_val_6fold/",
    "../psl/result/simulation/linear/",
)
linear_df = cvr.cross_val_result(TRUTH_linear, INFER_linear)

# PSL cross-validation results, random simulation (same helper, same layout
# of arguments).
TRUTH_random, INFER_random = (
    "../psl/data/simulation/random/seed0/cross_val_6fold/",
    "../psl/result/simulation/random/",
)
random_df = cvr.cross_val_result(TRUTH_random, INFER_random)


In [62]:
# Linear simulation: load the matrix, split off the regression target and
# expand the remaining features with degree-2 polynomial terms.
# Leaves `df`, `label` and `data` in the namespace for the cross-validation
# cell that follows.
linear_data = "../data/similuated_matrix_linear.tsv"
df = pd.read_csv(linear_data, sep="\t")

# Target column, copied so later edits to `df` cannot alias it.
label = df["sensitivity"].copy()

# Feature frame: drop the target, index rows by the pair id and remove the
# remaining identifier columns. Built as one expression so re-running the
# cell never operates on a half-mutated frame.
data = (
    df.drop("sensitivity", axis=1)
      .set_index("cell-drug-pair")
      .drop(["cell", "drug"], axis=1)
)

# Shape before and after the polynomial expansion.
print(data.shape)
poly = preprocessing.PolynomialFeatures(2)
# NOTE: from here on `data` is the ndarray returned by fit_transform, not a
# DataFrame (the row index set above is discarded).
data = poly.fit_transform(data)
print(data.shape)

(100, 20)
(100, 231)


In [69]:
# Regressors compared in the cross-validation cells, keyed by the column
# name used in the result tables. The same estimator objects are re-fitted
# on every fold by the loops that iterate over this dict.
Classifiers = {
    # Fixed seed: a random forest draws bootstrap samples at random, so
    # without it the reported scores differ from run to run.
    "Random Forest": ensemble.RandomForestRegressor(random_state=1),
    "SVM": svm.SVR(kernel="rbf"),
    "XGBoost": xgb.XGBRegressor(n_estimators=200),
    "Linear": linear_model.LinearRegression(),
}

In [66]:
# 5-fold cross-validation of every regressor in `Classifiers` on the
# linear-simulation features. Each model is fitted on the training fold and
# scored on the held-out fold with MSE and Spearman rank correlation.
# Depends on `df`, `data` and `label` from the linear data-preparation cell
# and on the `Classifiers` dict.
kf = model_selection.KFold(n_splits=5)
mse_result_df = pd.DataFrame()
spearman_result_df = pd.DataFrame()

# Row selection below is positional. Going through an ndarray makes that
# work whether `data` is still a DataFrame or already the array produced by
# PolynomialFeatures (no copy is made if it is already an ndarray).
features = np.asarray(data)

fold_num = 0
for train, test in kf.split(df):
    fold_num += 1

    x_train = features[train]
    # .iloc: KFold yields positions, so do not rely on the Series index
    # happening to be 0..n-1.
    y_train = label.iloc[train]
    x_test = features[test]
    y_test = label.iloc[test]

    for classifier_name, clf in Classifiers.items():
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)
        # scikit-learn metrics take (y_true, y_pred).
        mse = metrics.mean_squared_error(y_test, prediction)
        rho = scipy.stats.spearmanr(y_test, prediction)
        # One row per fold, one column per model; rho[0] is the correlation
        # coefficient (rho[1] would be the p-value).
        mse_result_df.loc[fold_num, classifier_name] = mse
        spearman_result_df.loc[fold_num, classifier_name] = rho[0]

# Single-argument print with %-formatting produces the same text under the
# Python 2 print statement and the Python 3 print function.
print("mse\n%s" % (mse_result_df,))
print("spearman rank correlation\n%s" % (spearman_result_df,))

mse
    XGBoost       SVM  Random Forest    Linear
1  0.079634  0.095857       0.090917  0.042076
2  0.021094  0.063589       0.062064  0.013896
3  0.040548  0.085639       0.074531  0.022128
4  0.024905  0.095518       0.081062  0.025433
5  0.033939  0.061333       0.069652  0.014121
spearman rank correlation
    XGBoost       SVM  Random Forest    Linear
1  0.421053 -0.007519       0.258647  0.640602
2  0.854135  0.592481       0.700752  0.933835
3  0.717293  0.333835       0.563910  0.851128
4  0.795489  0.245113       0.410526  0.810526
5  0.657143  0.380451       0.314286  0.806015


In [51]:
# Random simulation: load the random matrix and run the same regressor
# comparison as for the linear simulation, this time on the raw features
# (no polynomial expansion is applied in this cell).
random_data = "../data/similuated_matrix_random.tsv"
df = pd.read_csv(random_data, sep="\t")

# Regression target.
label = df["sensitivity"].copy()

# Feature frame: drop the target, index rows by the pair id and remove the
# remaining identifier columns.
data = (
    df.drop("sensitivity", axis=1)
      .set_index("cell-drug-pair")
      .drop(["cell", "drug"], axis=1)
)

kf = model_selection.KFold(n_splits=5)
mse_result_df = pd.DataFrame()
spearman_result_df = pd.DataFrame()
fold_num = 0
for train, test in kf.split(df):
    fold_num += 1
    # Split the training positions of this fold once more into a fitting
    # part (80%) and a validation part (20%); seeded so it is repeatable.
    tr, val = model_selection.train_test_split(train, test_size=0.2, random_state=1)

    # `.values` replaces the deprecated `.as_matrix()` (removed in
    # pandas 1.0) and returns the same ndarray.
    x_train = data.iloc[tr]
    y_train = label.iloc[tr].values
    x_val = data.iloc[val]
    y_val = label.iloc[val].values
    # Held-out fold. NOTE(review): it is built but never scored below --
    # the metrics are computed on the validation split. Confirm that this
    # is intended before comparing with results scored on the test fold.
    x_test = data.iloc[test]
    y_test = label.iloc[test].values

    for classifier_name, clf in Classifiers.items():
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_val)
        # scikit-learn metrics take (y_true, y_pred).
        mse = metrics.mean_squared_error(y_val, prediction)
        rho = scipy.stats.spearmanr(y_val, prediction)
        # One row per fold, one column per model; rho[0] is the correlation
        # coefficient.
        mse_result_df.loc[fold_num, classifier_name] = mse
        spearman_result_df.loc[fold_num, classifier_name] = rho[0]

# Single-argument print with %-formatting produces the same text under the
# Python 2 print statement and the Python 3 print function.
print("mse\n%s" % (mse_result_df,))
print("spearman rank correlation\n%s" % (spearman_result_df,))

mse
    XGBoost       SVM  Random Forest    Linear
1  0.208863  0.111856       0.119047  0.135031
2  0.154558  0.118065       0.153852  0.145151
3  0.162442  0.106500       0.102447  0.129539
4  0.138676  0.092803       0.075893  0.105072
5  0.119793  0.082659       0.096901  0.097032
spearman rank correlation
    XGBoost       SVM  Random Forest    Linear
1 -0.341176 -0.226471      -0.147059 -0.308824
2 -0.170588 -0.258824      -0.329412 -0.411765
3 -0.476471 -0.161765       0.105882  0.020588
4  0.150000 -0.026471       0.485294  0.164706
5 -0.061765  0.061765      -0.129412  0.029412


In [61]:
# Sanity check on synthetic data: two uniform features and a target that is
# an exact quadratic in them. After the degree-2 expansion the target is a
# linear combination of the expanded columns, so ordinary least squares can
# fit it exactly, which gives a reference point for the other regressors.
# Seeded local generator: the data is identical on every run and the global
# numpy random state is left untouched.
rng = np.random.RandomState(0)
X = rng.rand(100, 2)
Y = X[:, 0] ** 2 + X[:, 1] ** 2 + 1

# Shape before and after the polynomial expansion.
print(X.shape)
poly = preprocessing.PolynomialFeatures(2)
X = poly.fit_transform(X)
print(X.shape)

kf = model_selection.KFold(n_splits=5)

mse_result_df = pd.DataFrame()
spearman_result_df = pd.DataFrame()
fold_num = 0
for train, test in kf.split(Y):
    fold_num += 1
    # Split the training positions of this fold once more into a fitting
    # part (80%) and a validation part (20%).
    tr, val = model_selection.train_test_split(train, test_size=0.2, random_state=1)

    x_train = X[tr]
    y_train = Y[tr]
    x_val = X[val]
    y_val = Y[val]
    # Held-out fold. NOTE(review): built but not scored below -- the
    # metrics are computed on the validation split.
    x_test = X[test]
    y_test = Y[test]

    for classifier_name, clf in Classifiers.items():
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_val)
        # scikit-learn metrics take (y_true, y_pred).
        mse = metrics.mean_squared_error(y_val, prediction)
        rho = scipy.stats.spearmanr(y_val, prediction)
        # One row per fold, one column per model; rho[0] is the correlation
        # coefficient.
        mse_result_df.loc[fold_num, classifier_name] = mse
        spearman_result_df.loc[fold_num, classifier_name] = rho[0]

# Single-argument print with %-formatting produces the same text under the
# Python 2 print statement and the Python 3 print function.
print("mse\n%s" % (mse_result_df,))
print("spearman rank correlation\n%s" % (spearman_result_df,))

(100, 2)
(100, 6)
mse
    XGBoost       SVM  Random Forest        Linear
1  0.006420  0.003567       0.013353  1.571559e-31
2  0.003815  0.003380       0.008046  1.448299e-31
3  0.002539  0.003503       0.009953  6.162976e-32
4  0.002134  0.003650       0.012415  1.910523e-31
5  0.002409  0.004288       0.018506  8.628166e-32
spearman rank correlation
    XGBoost       SVM  Random Forest  Linear
1  0.976471  0.997059       0.970588     1.0
2  0.967647  0.994118       0.982353     1.0
3  0.976471  0.988235       0.947059     1.0
4  0.991176  0.994118       0.985294     1.0
5  0.994118  0.994118       0.967647     1.0
