In [75]:
import os
import pandas

mirna_src_dir = os.getcwd() + "/assn-mirna-luad/data/processed/miRNA/"
clinical_src_dir = os.getcwd() + "/assn-mirna-luad/data/processed/clinical/"

mirna_tumor_df = pandas.read_csv(mirna_src_dir+'tumor_miRNA.csv')
mirna_normal_df = pandas.read_csv(mirna_src_dir+'normal_miRNA.csv')
clinical_df = pandas.read_csv(clinical_src_dir+'clinical.csv')

print "mirna_tumor_df.shape", mirna_tumor_df.shape
print "mirna_normal_df.shape", mirna_normal_df.shape

"""
Here we select samples to use for our regression analysis
"""
matched_samples = pandas.merge(clinical_df, mirna_normal_df, on='patient_barcode')['patient_barcode']
print "matched_samples", matched_samples.shape
# merged = pandas.merge(clinical_df, mirna_normal_df, on='patient_barcode')
# print merged.shape
# print
# print merged['histological_type'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_stage'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_T'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_N'].value_counts().sort_index(axis=0)
# print
# print merged['pathologic_M'].value_counts().sort_index(axis=0)
# print

mirna_tumor_df.shape (513, 1882)
mirna_normal_df.shape (46, 1882)
matched_samples (46,)


## We now create data matrixes from tumor vs normal samples

In [77]:
from sklearn import preprocessing
import numpy as np
X_normal = mirna_normal_df[mirna_normal_df['patient_barcode'].isin(matched_samples)]
X_tumor = mirna_tumor_df[mirna_tumor_df['patient_barcode'].isin(matched_samples)]
X_normal.__delitem__('patient_barcode')
X_tumor.__delitem__('patient_barcode')

X = pandas.concat([X_normal, X_tumor])
# Create labels for 92 samples, first 46 labeled 0 indicating normal tissues, and last 46 labeled 1 for tumor tissues
Y = np.concatenate((np.zeros(46), np.ones(46)), axis=0)

# print 'miRNA null values', X.isnull().sum().sum()
# print 'Y null values', Y.isnull().sum()

print "X_normal.shape", X_normal.shape
print "X_tumor.shape", X_tumor.shape
print "Y.shape", Y.shape

mirna_list = X.columns.values
# 
X_scaler = preprocessing.StandardScaler(with_mean=False).fit(X)
X = X_scaler.transform(X)

X_normal.shape (46, 1881)
X_tumor.shape (46, 1881)
Y.shape (92,)


## Perform classification with an L1-regularized Logistic Regression

In [114]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=np.random.randint(0, 10000))
print 'X_train', X_train.shape, ', y_train', y_train.shape
print 'X_test', X_test.shape, ', y_test', y_test.shape

model = linear_model.LogisticRegression(penalty="l1", fit_intercept=False)
print model.fit(X_train, y_train)

print "\nNon-zero coef:", np.nonzero(model.coef_.ravel())[0].size
print "miRNA's selected:\n", mirna_list[np.nonzero(model.coef_.ravel())[0]]
print "miRNA's corresponding coefficients:\n", model.coef_.ravel()[np.nonzero(model.coef_.ravel())[0]]

print "\nClassification_report on training"
print metrics.classification_report(y_train, model.predict(X_train), labels=[0, 1], target_names=["normal", "tumor"])
print "classification_report on testing"
print metrics.classification_report(y_test, model.predict(X_test), labels=[0, 1], target_names=["normal", "tumor"])
print metrics.confusion_matrix(y_test, model.predict(X_test), labels=[0, 1])

X_train (73, 1881) , y_train (73,)
X_test (19, 1881) , y_test (19,)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Non-zero coef: 30
miRNA's selected:
['hsa-mir-129-2' 'hsa-mir-133a-1' 'hsa-mir-133b' 'hsa-mir-138-2'
 'hsa-mir-139' 'hsa-mir-140' 'hsa-mir-147b' 'hsa-mir-148a' 'hsa-mir-21'
 'hsa-mir-218-1' 'hsa-mir-30a' 'hsa-mir-3122' 'hsa-mir-3161' 'hsa-mir-3652'
 'hsa-mir-4465' 'hsa-mir-4510' 'hsa-mir-4791' 'hsa-mir-548ag-2'
 'hsa-mir-548ak' 'hsa-mir-548an' 'hsa-mir-5579' 'hsa-mir-599' 'hsa-mir-615'
 'hsa-mir-6777' 'hsa-mir-6832' 'hsa-mir-6892' 'hsa-mir-708' 'hsa-mir-7-1'
 'hsa-mir-7-2' 'hsa-mir-96']
miRNA's corresponding coefficients:
[ 0.0760485  -0.32607011 -0.0202594  -0.08760938 -0.50101293 -0.10367723
  0.22178164  0.25418546  1.14050731 -0.22016574 -1.12782902 -0.

In [115]:
print "classification_report on rest of tumor patients"
X_tumor_unmatched = mirna_tumor_df.dropna(subset=['hsa-mir-96']).copy()
X_tumor_unmatched.__delitem__('patient_barcode')
X_tumor_unmatched = X_scaler.transform(X_tumor_unmatched)
print X_tumor_unmatched.shape

Y_tumor_unmatched = np.array([1,]*X_tumor_unmatched.shape[0], dtype=int)

print metrics.classification_report(Y_tumor_unmatched, model.predict(X_tumor_unmatched), labels=[0, 1], target_names=["normal", "target"])
print metrics.confusion_matrix(Y_tumor_unmatched, model.predict(X_tumor_unmatched), labels=[0, 1])

classification_report on rest of tumor patients
(512, 1881)
             precision    recall  f1-score   support

     normal       0.00      0.00      0.00         0
     target       1.00      0.96      0.98       512

avg / total       1.00      0.96      0.98       512

[[  0   0]
 [ 22 490]]


In [80]:
from matplotlib import pyplot as plt
# Display results
m_log_alphas = -np.log10(model.alphas_)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')

plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent ')
plt.axis('tight')
plt.show()

AttributeError: 'LogisticRegressionCV' object has no attribute 'alphas_'