In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor as sgd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
train_data = pd.read_csv('../CNN_Keras/SimulationCNN/Kmers6_counts_600bp.csv')
train_reads = pd.read_csv('../data/Mouse_DMRs_counts_total.csv',header = None)
train_methys = pd.read_csv('../data/Mouse_DMRs_counts_methylated.csv',header = None)
train_methy_level = pd.read_csv('../data/Mouse_DMRs_methylation_level.csv',header = None)

In [3]:
cell_type = 5
data = train_data.as_matrix()
level = train_methy_level.as_matrix()[:,cell_type]
reads = train_reads.as_matrix()[:,cell_type]
methy = train_methys.as_matrix()[:,cell_type]
print(data.shape,level.shape,reads.shape,methy.shape)

(58959, 2080) (58959,) (58959,) (58959,)


In [4]:
data_train = data[:48000]
level_train = level[:48000]
reads_train = reads[:48000]
methys_train = methy[:48000]
data_test = data[48000:]
level_test = level[48000:]
reads_test = reads[48000:]
methy_test = methy[48000:]

In [5]:
param = {'loss':['squared_loss','huber'],'penalty':['l2','l1']}

In [7]:
print('Info with no sample weight applied:')
for l in param['loss']:
    for p in param['penalty']:
        print('Start Training: Loss: ',l,", penalty: ",p)
        clf = sgd(loss = l, n_iter=100,penalty = p)
        scores = cross_val_score(clf, data, level, cv=5)
        print("R^2 score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Info with no sample weight applied:
Start Training: Loss:  squared_loss , penalty:  l2
R^2 score: 0.22 (+/- 0.03)
Start Training: Loss:  squared_loss , penalty:  l1
R^2 score: 0.23 (+/- 0.02)
Start Training: Loss:  huber , penalty:  l2
R^2 score: 0.21 (+/- 0.02)
Start Training: Loss:  huber , penalty:  l1
R^2 score: 0.20 (+/- 0.01)


In [6]:
print('Info with sample weight applied:')
for l in param['loss']:
    for p in param['penalty']:
        print('Start Training: Loss: ',l,", penalty: ",p)
        clf = sgd(loss = l, n_iter=200,penalty = p,warm_start = False)
        kf = KFold(n_splits=5)
        train_score = []
        test_score = []
        Uni_train_score = []
        Uni_test_score = []
        #Cross Validation
        for train_index, test_index in kf.split(data):
            X_train, X_test = data[train_index],data[test_index]
            Y_train, Y_test = level[train_index],level[test_index]
            R_train, R_test = reads[train_index],reads[test_index]
            
            clf.fit(X_train,Y_train,sample_weight = R_train)
            train_score.append(clf.score(X_train,Y_train,R_train))
            test_score.append(clf.score(X_test,Y_test,R_test))
            
            clf.fit(X_train,Y_train)
            Uni_train_score.append(clf.score(X_train,Y_train))
            Uni_test_score.append(clf.score(X_test,Y_test))
        
        train_score = np.array(train_score)
        test_score = np.array(test_score)
        
        Uni_train_score = np.array(Uni_train_score)
        Uni_test_score = np.array(Uni_test_score)
        
        print("\t Train with different sample weigt")
        print("\t\tTrain R^2 score: %0.2f (+/- %0.2f)" % (train_score.mean(), train_score.std() * 2))
        print("\t\tTest R^2 score: %0.2f (+/- %0.2f)" % (test_score.mean(), test_score.std() * 2))
        
        print("\t Train with equal sample weigt")
        print("\t\tTrain R^2 score: %0.2f (+/- %0.2f)" % (Uni_train_score.mean(), Uni_train_score.std() * 2))
        print("\t\tTest R^2 score: %0.2f (+/- %0.2f)" % (Uni_test_score.mean(), Uni_test_score.std() * 2))

Info with sample weight applied:
Start Training: Loss:  squared_loss , penalty:  l2
	 Train with different sample weigt
		Train R^2 score: -203633442007800961325596672.00 (+/- 53612617249645648465100800.00)
		Test R^2 score: -195603056422496097663975424.00 (+/- 29784310162538468644749312.00)
	 Train with equal sample weigt
		Train R^2 score: 0.29 (+/- 0.01)
		Test R^2 score: 0.22 (+/- 0.03)
Start Training: Loss:  squared_loss , penalty:  l1
	 Train with different sample weigt
		Train R^2 score: -212755296407341937763287040.00 (+/- 53556334791207475403030528.00)
		Test R^2 score: -209667190754935327766347776.00 (+/- 49419782694077458821414912.00)
	 Train with equal sample weigt
		Train R^2 score: 0.28 (+/- 0.01)
		Test R^2 score: 0.23 (+/- 0.02)
Start Training: Loss:  huber , penalty:  l2
	 Train with different sample weigt
		Train R^2 score: -2.25 (+/- 0.45)
		Test R^2 score: -2.28 (+/- 0.22)
	 Train with equal sample weigt
		Train R^2 score: 0.27 (+/- 0.02)
		Test R^2 score: 0.20 (+/-

In [22]:
for e in [0.5,0.1,0.01]:
    for alpha in [0.0001, 0.001,0.01]:
        print('epsilon: ', e, " alpha; ", alpha)
        clf = sgd(loss = 'huber', n_iter=200,penalty = 'l1',alpha = alpha, epsilon = e, warm_start = False)
        kf = KFold(n_splits=5)
        train_score = []
        test_score = []
        Uni_train_score = []
        Uni_test_score = []
        #Cross Validation
        for train_index, test_index in kf.split(data):
            X_train, X_test = data[train_index],data[test_index]
            Y_train, Y_test = level[train_index],level[test_index]
            R_train, R_test = reads[train_index],reads[test_index]
            
            clf.fit(X_train,Y_train,sample_weight = R_train)
            train_score.append(clf.score(X_train,Y_train,sample_weight = R_train))
            test_score.append(clf.score(X_test,Y_test,sample_weight = R_test))
            
            clf.fit(X_train,Y_train,sample_weight = np.ones(len(X_train)))
            Uni_train_score.append(clf.score(X_train,Y_train))
            Uni_test_score.append(clf.score(X_test,Y_test))
        
        train_score = np.array(train_score)
        test_score = np.array(test_score)
        
        Uni_train_score = np.array(Uni_train_score)
        Uni_test_score = np.array(Uni_test_score)
        
        print("\t Train with different sample weigt")
        print("\t\tTrain R^2 score: %0.2f (+/- %0.2f)" % (train_score.mean(), train_score.std() * 2))
        print("\t\tTest R^2 score: %0.2f (+/- %0.2f)" % (test_score.mean(), test_score.std() * 2))
        
        print("\t Train with equal sample weigt")
        print("\t\tTrain R^2 score: %0.2f (+/- %0.2f)" % (Uni_train_score.mean(), Uni_train_score.std() * 2))
        print("\t\tTest R^2 score: %0.2f (+/- %0.2f)" % (Uni_test_score.mean(), Uni_test_score.std() * 2))

epsilon:  0.5  alpha;  0.0001
	 Train with different sample weigt
		Train R^2 score: -1.68 (+/- 0.34)
		Test R^2 score: -1.76 (+/- 0.40)
	 Train with equal sample weigt
		Train R^2 score: 0.28 (+/- 0.01)
		Test R^2 score: 0.23 (+/- 0.02)
epsilon:  0.5  alpha;  0.001
	 Train with different sample weigt
		Train R^2 score: -0.93 (+/- 0.65)
		Test R^2 score: -1.01 (+/- 0.71)
	 Train with equal sample weigt
		Train R^2 score: 0.23 (+/- 0.01)
		Test R^2 score: 0.21 (+/- 0.01)
epsilon:  0.5  alpha;  0.01
	 Train with different sample weigt
		Train R^2 score: -0.31 (+/- 0.14)
		Test R^2 score: -0.36 (+/- 0.10)
	 Train with equal sample weigt
		Train R^2 score: 0.03 (+/- 0.00)
		Test R^2 score: 0.03 (+/- 0.00)
epsilon:  0.1  alpha;  0.0001
	 Train with different sample weigt
		Train R^2 score: -0.24 (+/- 0.07)
		Test R^2 score: -0.31 (+/- 0.07)
	 Train with equal sample weigt
		Train R^2 score: 0.25 (+/- 0.01)
		Test R^2 score: 0.21 (+/- 0.02)
epsilon:  0.1  alpha;  0.001
	 Train with different

In [17]:
clf = sgd()

0.1