In [11]:
import os
import pandas
from definitions import ROOT_DIR

mirna_tumor_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/tumor_miRNA.csv"))
mirna_normal_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/normal_miRNA.csv"))
clinical_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/clinical/clinical.csv"))
validated_miRNAs = pandas.read_csv(os.path.join(ROOT_DIR, 'data/external/validated_miRNAs_nsclc.csv'))

print "mirna_tumor_df.shape", mirna_tumor_df.shape
print "mirna_normal_df.shape", mirna_normal_df.shape
print 'validated_miRNAs.shape', validated_miRNAs.shape

X_normal = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_normal_df, on='patient_barcode')
X_normal['pathologic_stage'] = 'normal'
X_tumor = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_tumor_df, on='patient_barcode')

pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I', 
                        'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II', 
                        'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}

X_tumor.replace({'pathologic_stage': pathologic_stage_map}, inplace=True)

print X_normal['pathologic_stage'].value_counts().sort_index(axis=0)
print X_tumor['pathologic_stage'].value_counts().sort_index(axis=0)

mirna_tumor_df.shape (513, 1882)
mirna_normal_df.shape (46, 1882)
validated_miRNAs.shape (110, 1)
normal    46
Name: pathologic_stage, dtype: int64
Stage I      277
Stage II     121
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


## We now create data matrices from tumor vs. normal samples

In [23]:
from sklearn import preprocessing

X = pandas.concat([X_normal, X_tumor]).dropna(subset=['pathologic_stage'])
Y = X['pathologic_stage']

X_normal_vs_I = X[X['pathologic_stage'].isin(['normal', 'Stage I'])]
X_normal_vs_II = X[X['pathologic_stage'].isin(['normal', 'Stage II'])]
X_normal_vs_III = X[X['pathologic_stage'].isin(['normal', 'Stage III'])]
X_normal_vs_IV = X[X['pathologic_stage'].isin(['normal', 'Stage IV'])]

Y_normal_vs_I = X_normal_vs_I['pathologic_stage']
Y_normal_vs_II = X_normal_vs_II['pathologic_stage']
Y_normal_vs_III = X_normal_vs_III['pathologic_stage']
Y_normal_vs_IV = X_normal_vs_IV['pathologic_stage']

X_normal_vs_I.__delitem__('patient_barcode')
X_normal_vs_I.__delitem__('pathologic_stage')
X_normal_vs_II.__delitem__('patient_barcode')
X_normal_vs_II.__delitem__('pathologic_stage')
X_normal_vs_III.__delitem__('patient_barcode')
X_normal_vs_III.__delitem__('pathologic_stage')
X_normal_vs_IV.__delitem__('patient_barcode')
X_normal_vs_IV.__delitem__('pathologic_stage')

X.__delitem__('patient_barcode')
X.__delitem__('pathologic_stage')

print 'miRNA null values', X.isnull().sum().sum()
print 'Y null values', Y.isnull().sum()

print "X.shape", X.shape
print "Y.shape", Y.shape

mirna_list = X.columns.values
validated_miRNAs = set(validated_miRNAs['miRNA'].tolist())

X_scaler = preprocessing.StandardScaler(with_mean=False).fit(X)
X_normal_vs_I = X_scaler.transform(X_normal_vs_I)
X_normal_vs_II = X_scaler.transform(X_normal_vs_II)
X_normal_vs_III = X_scaler.transform(X_normal_vs_III)
X_normal_vs_IV = X_scaler.transform(X_normal_vs_IV)

miRNA null values 0
Y null values 0
X.shape (552, 1881)
Y.shape (552,)


In [29]:
print "mirna_list", len(mirna_list)
print 'validated_miRNAs', len(validated_miRNAs)

print len(validated_miRNAs & set(mirna_list))

mirna_list 1881
validated_miRNAs 110
109


## Perform classification

In [45]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import numpy as np

# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=np.random.randint(0, 10000))
# print 'X_train', X_train.shape, ', y_train', y_train.shape
# print 'X_test', X_test.shape, ', y_test', y_test.shape

normal_vs_I = linear_model.LogisticRegression(C=9.9, penalty="l1", fit_intercept=False, verbose=1)
normal_vs_II = linear_model.LogisticRegression(C=9.9, penalty="l1", fit_intercept=False, verbose=1)
normal_vs_III = linear_model.LogisticRegression(C=9.9, penalty="l1", fit_intercept=False, verbose=1)
normal_vs_IV = linear_model.LogisticRegression(C=9.9, penalty="l1", fit_intercept=False, verbose=1)
print normal_vs_I.fit(X_normal_vs_I, Y_normal_vs_I)
print normal_vs_II.fit(X_normal_vs_II, Y_normal_vs_II)
print normal_vs_III.fit(X_normal_vs_III, Y_normal_vs_III)
print normal_vs_IV.fit(X_normal_vs_IV, Y_normal_vs_IV)

# print "\nClassification_report on training"
# print metrics.classification_report(y_train, model.predict(X_train), 
#                                     labels=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'], 
#                                     target_names=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'])
# print "classification_report on testing"
# print metrics.classification_report(y_test, model.predict(X_test), 
#                                     labels=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'], 
#                                     target_names=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'])
# print metrics.confusion_matrix(y_test, model.predict(X_test), 
#                                labels=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'])

[LibLinear]LogisticRegression(C=9.9, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)
[LibLinear]LogisticRegression(C=9.9, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)
[LibLinear]LogisticRegression(C=9.9, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)
[LibLinear]LogisticRegression(C=9.9, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
      

In [60]:
from src.utils.validate_candidate_miRNAs import percent_candidate_in_validated as pciv
results = []

for model, name in zip([normal_vs_I, normal_vs_II, normal_vs_III, normal_vs_IV], ['normal_vs_I', 'normal_vs_II', 'normal_vs_III', 'normal_vs_IV']):
    print '\n', name 
    non_zero_idx = np.nonzero(model.coef_.ravel())[0]
    print "Non-zero coef:", non_zero_idx.size
    df = pandas.DataFrame({"mirna": mirna_list[non_zero_idx],
                           "coefficient": model.coef_.ravel()[non_zero_idx]})
    print df.sort_values(by='coefficient', ascending=False).head(6)
    print df.sort_values(by='coefficient', ascending=True).head(6)
    print "Percentage of known NSCLC miRNAs:", pciv(df['mirna'].tolist(), validated_miRNAs)
    print set(df['mirna'].tolist()) & validated_miRNAs
    results.append(df)


normal_vs_I
Non-zero coef: 101
    coefficient         mirna
97     0.911670  hsa-mir-7108
13     0.653433   hsa-mir-143
42     0.613127  hsa-mir-378a
58     0.517361  hsa-mir-4529
14     0.507204   hsa-mir-144
65     0.461374  hsa-mir-4732
    coefficient         mirna
90    -0.999950  hsa-mir-6787
80    -0.973630   hsa-mir-628
96    -0.937108   hsa-mir-7-1
25    -0.740619    hsa-mir-21
53    -0.732466  hsa-mir-4443
82    -0.664495   hsa-mir-639
Percentage of known NSCLC miRNAs: 0.190909090909
set(['hsa-mir-378a', 'hsa-mir-199a-1', 'hsa-mir-21', 'hsa-mir-106a', 'hsa-mir-34a', 'hsa-mir-218-1', 'hsa-mir-146a', 'hsa-mir-29c', 'hsa-mir-182', 'hsa-mir-148a', 'hsa-mir-199b', 'hsa-mir-186', 'hsa-mir-143', 'hsa-mir-625', 'hsa-mir-1-1', 'hsa-mir-30a', 'hsa-mir-7-1', 'hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-mir-361'])

normal_vs_II
Non-zero coef: 74
    coefficient         mirna
11     0.953946   hsa-mir-143
68     0.627343  hsa-mir-7108
26     0.470207  hsa-mir-378a
40     0.45104

86