In [75]:
import os
import pandas

mirna_src_dir = os.getcwd() + "/assn-mirna-luad/data/processed/miRNA/"
clinical_src_dir = os.getcwd() + "/assn-mirna-luad/data/processed/clinical/"

mirna_tumor_df = pandas.read_csv(mirna_src_dir+'tumor_miRNA.csv')
mirna_normal_df = pandas.read_csv(mirna_src_dir+'normal_miRNA.csv')
clinical_df = pandas.read_csv(clinical_src_dir+'clinical.csv')

print "mirna_tumor_df.shape", mirna_tumor_df.shape
print "mirna_normal_df.shape", mirna_normal_df.shape

"""
Here we select samples to use for our regression analysis
"""
matched_samples = pandas.merge(clinical_df, mirna_normal_df, on='patient_barcode')['patient_barcode']
print "matched_samples", matched_samples.shape
# merged = pandas.merge(clinical_df, mirna_normal_df, on='patient_barcode')
# print merged.shape
# print
# print merged['histological_type'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_stage'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_T'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_N'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_M'].value_counts().sort_index(axis=0)
# print

mirna_tumor_df.shape (513, 1882)
mirna_normal_df.shape (46, 1882)
matched_samples (46,)


## We now create data matrixes from tumor vs normal samples

In [77]:
from sklearn import preprocessing
import numpy as np
X_normal = mirna_normal_df[mirna_normal_df['patient_barcode'].isin(matched_samples)]
X_tumor = mirna_tumor_df[mirna_tumor_df['patient_barcode'].isin(matched_samples)]
X_normal.__delitem__('patient_barcode')
X_tumor.__delitem__('patient_barcode')

X = pandas.concat([X_normal, X_tumor])
# Create labels for 92 samples, first 46 labeled 0 indicating normal tissues, and last 46 labeled 1 for tumor tissues
Y = np.concatenate((np.zeros(46), np.ones(46)), axis=0)

# print 'miRNA null values', X.isnull().sum().sum()
# print 'Y null values', Y.isnull().sum()

print "X_normal.shape", X_normal.shape
print "X_tumor.shape", X_tumor.shape
print "Y.shape", Y.shape

mirna_list = X.columns.values
# 
X_scaler = preprocessing.StandardScaler(with_mean=False).fit(X)
X = X_scaler.transform(X)
# # Y_scaler = preprocessing.StandardScaler().fit(Y)
# # Y = Y_scaler.transform(Y)

X_normal.shape (46, 1881)
X_tumor.shape (46, 1881)
Y.shape (92,)


## Perform classification with LASSO

In [92]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=np.random.randint(0, 10000))
print 'X_train', X_train.shape, ', y_train', y_train.shape
print 'X_test', X_test.shape, ', y_test', y_test.shape

model = linear_model.LogisticRegression(penalty="l1")
print model.fit(X_train, y_train)

print "\nNon-zero coef:", np.nonzero(model.coef_.ravel())[0].size
print "miRNA's selected:\n", mirna_list[np.nonzero(model.coef_.ravel())[0]]
print "miRNA's corresponding coefficients:\n", model.coef_.ravel()[np.nonzero(model.coef_.ravel())[0]]

print "classification_report on training"
print metrics.classification_report(y_train, np.array(model.predict(X_train), dtype=int), labels=[0, 1], target_names=["normal", "tumor"])
print "classification_report on testing"
print metrics.classification_report(y_test, np.array(model.predict(X_test), dtype=int), labels=[0, 1], target_names=["normal", "tumor"])

X_train (55, 1881) , y_train (55,)
X_test (37, 1881) , y_test (37,)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Non-zero coef: 27
miRNA's selected:
['hsa-let-7a-1' 'hsa-let-7a-3' 'hsa-mir-1229' 'hsa-mir-1287'
 'hsa-mir-133a-2' 'hsa-mir-138-2' 'hsa-mir-139' 'hsa-mir-146b'
 'hsa-mir-210' 'hsa-mir-218-1' 'hsa-mir-23c' 'hsa-mir-30a' 'hsa-mir-3681'
 'hsa-mir-3923' 'hsa-mir-429' 'hsa-mir-4454' 'hsa-mir-4465' 'hsa-mir-4645'
 'hsa-mir-4664' 'hsa-mir-4800' 'hsa-mir-625' 'hsa-mir-6499' 'hsa-mir-6717'
 'hsa-mir-6744' 'hsa-mir-6787' 'hsa-mir-708' 'hsa-mir-7-1']
miRNA's corresponding coefficients:
[-0.1675048  -0.0985799   0.05607123  0.07630713 -0.4478076  -0.07925564
 -0.64975806 -0.49416178  0.12596735 -0.43043432  0.36928382 -0.20999181
  0.06224203 -0.0527186   0.14881397  0.

In [85]:
print "classification_report on rest of tumor patients"
X_tumor_unmatched = mirna_tumor_df.dropna(subset=['hsa-mir-96']).copy()
X_tumor_unmatched.__delitem__('patient_barcode')
print X_tumor_unmatched.shape

Y_tumor_unmatched = np.array([1,]*X_tumor_unmatched.shape[0], dtype=int)

print metrics.classification_report(Y_tumor_unmatched, np.array(model.predict(X_tumor_unmatched), dtype=int), labels=[0, 1])
print metrics.confusion_matrix(Y_tumor_unmatched, np.array(model.predict(X_tumor_unmatched), dtype=int), labels=[0, 1])

classification_report on rest of tumor patients
(512, 1881)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.92      0.96       512

avg / total       1.00      0.92      0.96       512

[[  0   0]
 [ 39 473]]


In [80]:
from matplotlib import pyplot as plt
# Display results
m_log_alphas = -np.log10(model.alphas_)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')

plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent ')
plt.axis('tight')
plt.show()

AttributeError: 'LogisticRegressionCV' object has no attribute 'alphas_'