In [16]:
import os
import pandas

mirna_src_dir = os.getcwd() + "/assn-mirna-luad/data/processed/miRNA/"
clinical_src_dir = os.getcwd() + "/assn-mirna-luad/data/processed/clinical/"

mirna_tumor_df = pandas.read_csv(mirna_src_dir+'tumor_miRNA.csv')
mirna_normal_df = pandas.read_csv(mirna_src_dir+'normal_miRNA.csv')
clinical_df = pandas.read_csv(clinical_src_dir+'clinical.csv')

print "mirna_tumor_df.shape", mirna_tumor_df.shape
print "mirna_normal_df.shape", mirna_normal_df.shape

X_normal = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_normal_df, on='patient_barcode')
X_normal['pathologic_stage'] = 'normal'
X_tumor = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_tumor_df, on='patient_barcode')

pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I', 
                        'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II', 
                        'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}

X_tumor.replace({'pathologic_stage': pathologic_stage_map}, inplace=True)

print X_normal['pathologic_stage'].value_counts().sort_index(axis=0)
print X_tumor['pathologic_stage'].value_counts().sort_index(axis=0)

mirna_tumor_df.shape (513, 1882)
mirna_normal_df.shape (46, 1882)
normal    46
Name: pathologic_stage, dtype: int64
Stage I      277
Stage II     121
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


## We now create data matrixes from tumor vs normal samples

In [56]:
from sklearn import preprocessing

X = pandas.concat([X_normal, X_tumor]).dropna(subset=['pathologic_stage'])
Y = X['pathologic_stage']

X_normal_vs_I = X[X['pathologic_stage'].isin(['normal', 'Stage I'])]
X_normal_vs_II = X[X['pathologic_stage'].isin(['normal', 'Stage II'])]
X_normal_vs_III = X[X['pathologic_stage'].isin(['normal', 'Stage III'])]
X_normal_vs_IV = X[X['pathologic_stage'].isin(['normal', 'Stage IV'])]

Y_normal_vs_I = X_normal_vs_I['pathologic_stage']
Y_normal_vs_II = X_normal_vs_II['pathologic_stage']
Y_normal_vs_III = X_normal_vs_III['pathologic_stage']
Y_normal_vs_IV = X_normal_vs_IV['pathologic_stage']

X_normal_vs_I.__delitem__('patient_barcode')
X_normal_vs_I.__delitem__('pathologic_stage')
X_normal_vs_II.__delitem__('patient_barcode')
X_normal_vs_II.__delitem__('pathologic_stage')
X_normal_vs_III.__delitem__('patient_barcode')
X_normal_vs_III.__delitem__('pathologic_stage')
X_normal_vs_IV.__delitem__('patient_barcode')
X_normal_vs_IV.__delitem__('pathologic_stage')

X.__delitem__('patient_barcode')
X.__delitem__('pathologic_stage')

print 'miRNA null values', X.isnull().sum().sum()
print 'Y null values', Y.isnull().sum()

print "X.shape", X.shape
print "Y.shape", Y.shape

mirna_list = X.columns.values
print mirna_list

X_scaler = preprocessing.StandardScaler(with_mean=False).fit(X)
X_normal_vs_I = X_scaler.transform(X_normal_vs_I)
X_normal_vs_II = X_scaler.transform(X_normal_vs_II)
X_normal_vs_III = X_scaler.transform(X_normal_vs_III)
X_normal_vs_IV = X_scaler.transform(X_normal_vs_IV)

miRNA null values 0
Y null values 0
X.shape (552, 1881)
Y.shape (552,)
['hsa-let-7a-1' 'hsa-let-7a-2' 'hsa-let-7a-3' ..., 'hsa-mir-98'
 'hsa-mir-99a' 'hsa-mir-99b']


In [62]:
X_normal_vs_IV.shape
Y_normal_vs_IV.shape

(70,)

## Perform classification

In [63]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import numpy as np

# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=np.random.randint(0, 10000))
# print 'X_train', X_train.shape, ', y_train', y_train.shape
# print 'X_test', X_test.shape, ', y_test', y_test.shape

normal_vs_I = linear_model.LogisticRegression(C=0.5, penalty="l1", fit_intercept=False, verbose=1)
normal_vs_II = linear_model.LogisticRegression(C=0.5, penalty="l1", fit_intercept=False, verbose=1)
normal_vs_III = linear_model.LogisticRegression(C=0.5, penalty="l1", fit_intercept=False, verbose=1)
normal_vs_IV = linear_model.LogisticRegression(C=0.5, penalty="l1", fit_intercept=False, verbose=1)
print normal_vs_I.fit(X_normal_vs_I, Y_normal_vs_I)
print normal_vs_II.fit(X_normal_vs_II, Y_normal_vs_II)
print normal_vs_III.fit(X_normal_vs_III, Y_normal_vs_III)
print normal_vs_IV.fit(X_normal_vs_IV, Y_normal_vs_IV)

# print "\nClassification_report on training"
# print metrics.classification_report(y_train, model.predict(X_train), 
#                                     labels=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'], 
#                                     target_names=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'])
# print "classification_report on testing"
# print metrics.classification_report(y_test, model.predict(X_test), 
#                                     labels=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'], 
#                                     target_names=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'])
# print metrics.confusion_matrix(y_test, model.predict(X_test), 
#                                labels=['normal', 'Stage I', 'Stage II', 'Stage III', 'Stage IV'])

[LibLinear]LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)
[LibLinear]LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)
[LibLinear]LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)
[LibLinear]LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
      

In [69]:
dfs=[]

for model in [normal_vs_I, normal_vs_II, normal_vs_III, normal_vs_IV]:
    non_zero_idx = np.nonzero(model.coef_.ravel())[0]
    print "\nNon-zero coef:", non_zero_idx.size
    df = pandas.DataFrame({"mirna": mirna_list[non_zero_idx],
                           "coefficient": model.coef_.ravel()[non_zero_idx]})
    print df.sort(columns='coefficient', ascending=False).head(6)
    print df.sort(columns='coefficient', ascending=True).head(6)
    dfs.append(df)


Non-zero coef: 37
    coefficient         mirna
3      0.836722   hsa-mir-143
35     0.450923  hsa-mir-7108
12     0.348345  hsa-mir-378a
23     0.292981  hsa-mir-4732
19     0.282872  hsa-mir-4529
4      0.251154   hsa-mir-144
    coefficient         mirna
8     -1.503801    hsa-mir-21
34    -0.900268   hsa-mir-7-1
29    -0.588898   hsa-mir-628
6     -0.486738   hsa-mir-186
17    -0.323114  hsa-mir-4443
10    -0.298164   hsa-mir-34a

Non-zero coef: 34
    coefficient         mirna
7      1.144156   hsa-mir-143
18     0.489228  hsa-mir-4732
13     0.239837  hsa-mir-378a
30     0.227972  hsa-mir-7108
0      0.110596  hsa-let-7f-1
1      0.076368  hsa-let-7f-2
    coefficient           mirna
9     -1.124908      hsa-mir-21
6     -0.622543     hsa-mir-141
15    -0.538955    hsa-mir-4443
8     -0.525668     hsa-mir-186
24    -0.415845     hsa-mir-628
12    -0.188077  hsa-mir-3680-2

Non-zero coef: 38
    coefficient         mirna
3      1.128488   hsa-mir-143
29     0.485609  hsa-mir-4732



In [83]:
selected_miRs = sorted(set(dfs[0]['mirna'].tolist() + dfs[1]['mirna'].tolist() + dfs[2]['mirna'].tolist() + dfs[3]['mirna'].tolist()))
len(selected_miRs)

89