## Finding differentially expressed miRNAs between LUAD stages I, II, III, IV vs. normal using logit regression
### Utilize Group Lasso with MISIM miRNA similarity data

In [1]:
import os
import pandas
from definitions import ROOT_DIR

# Load the processed miRNA tables for tumor and normal samples, the clinical
# table, and the external miRCancer list of validated LUAD miRNAs.
mirna_tumor_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/tumor_miRNA.csv"))
mirna_normal_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/normal_miRNA.csv"))
clinical_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/clinical/clinical.csv"))
validated_miRNA_csv = pandas.read_csv(os.path.join(ROOT_DIR, 'data/external/validated_luad_miRNAs_miRCancer.csv'))

# Report table sizes and the total number of missing values.
# Each print() receives one pre-formatted string so the cell runs, with
# identical output, under both Python 2 and Python 3.
print("mirna_tumor_df.shape %s , nulls: %s"
      % (mirna_tumor_df.shape, mirna_tumor_df.isnull().sum().sum()))
print("mirna_normal_df.shape %s , nulls: %s"
      % (mirna_normal_df.shape, mirna_normal_df.isnull().sum().sum()))
# The shape is itself a tuple, so wrap it in a 1-tuple for %-formatting.
print("validated_miRNAs.shape %s" % (validated_miRNA_csv.shape,))

# Attach the clinical pathologic stage to every sample by patient_barcode.
# This is an inner join (the pandas.merge default), so samples without a
# matching clinical record are dropped.
X_normal = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']],
                        mirna_normal_df, on='patient_barcode', how='inner')
# Normal samples get the label 'normal' in place of the clinical stage.
X_normal['pathologic_stage'] = 'normal'
X_tumor = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']],
                       mirna_tumor_df, on='patient_barcode', how='inner')

# Collapse sub-stages into their parent stage (IA/IB -> I, IIA/IIB -> II,
# IIIA/IIIB -> III). Labels that are not keys of this map (e.g. 'Stage IV')
# are left unchanged.
pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I',
                        'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II',
                        'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}

X_tumor['pathologic_stage'] = X_tumor['pathologic_stage'].replace(pathologic_stage_map)

# Per-class sample counts, ordered by stage label.
print(X_normal['pathologic_stage'].value_counts().sort_index(axis=0))
print(X_tumor['pathologic_stage'].value_counts().sort_index(axis=0))

mirna_tumor_df.shape (513, 1882) , nulls: 0
mirna_normal_df.shape (46, 1882) , nulls: 0
validated_miRNAs.shape (34, 2)
normal    46
Name: pathologic_stage, dtype: int64
Stage I      277
Stage II     121
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


In [15]:
# Load the MISIM data from data/external: the values file has no header row,
# while the names file uses its first row as the header. Both are parsed as
# tab-separated text.
misim_matrix_path = os.path.join(ROOT_DIR, 'data/external/MISIM.tsv')
misim_names_path = os.path.join(ROOT_DIR, 'data/external/MISIM_miRNA_names.csv')

misim_df = pandas.read_csv(misim_matrix_path, sep='\t', header=None)
misim_names = pandas.read_csv(misim_names_path, sep='\t', header=0)

In [17]:
# Display the loaded MISIM table. The rendered output shows numeric columns
# 0..270 with 1.0 on the diagonal -- presumably a symmetric pairwise miRNA
# similarity matrix (TODO confirm against the MISIM source).
misim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,261,262,263,264,265,266,267,268,269,270
0,1.0000,0.0553,0.0000,0.2082,0.1875,0.1500,0.0000,0.1256,0.0000,0.0000,...,0.0000,0.1648,0.0717,0.1097,0.1347,0.0276,0.1698,0.0555,0.1115,0.4170
1,0.0553,1.0000,0.0000,0.0259,0.1051,0.2634,0.0303,0.2870,0.0000,0.4000,...,0.0000,0.2888,0.4936,0.2437,0.2554,0.0112,0.3525,0.3838,0.2843,0.2837
2,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.3333,0.0000,0.0000,...,0.0769,0.0218,0.0000,0.1053,0.0000,0.0000,0.0000,0.2500,0.1429,0.0000
3,0.2082,0.0259,0.0000,1.0000,0.1496,0.1392,0.0000,0.0283,0.0000,0.0000,...,0.0000,0.0323,0.0462,0.1567,0.0937,0.0094,0.0361,0.0308,0.1776,0.4092
4,0.1875,0.1051,0.0000,0.1496,1.0000,0.3652,0.0588,0.0855,0.0882,0.0000,...,0.0000,0.1028,0.1243,0.2017,0.3155,0.0573,0.1621,0.0858,0.1139,0.2203
5,0.1500,0.2634,0.0000,0.1392,0.3652,1.0000,0.0196,0.1519,0.0221,0.0000,...,0.0193,0.1545,0.4281,0.4641,0.5150,0.1055,0.1335,0.3371,0.4893,0.7214
6,0.0000,0.0303,0.0000,0.0000,0.0588,0.0196,1.0000,0.0260,0.6667,0.0606,...,0.0000,0.0303,0.0151,0.0179,0.2088,0.0000,0.0000,0.0202,0.0121,0.0000
7,0.1256,0.2870,0.3333,0.0283,0.0855,0.1519,0.0260,1.0000,0.0000,0.3333,...,0.0256,0.4142,0.5372,0.4840,0.1520,0.3715,0.1062,0.4628,0.3667,0.1833
8,0.0000,0.0000,0.0000,0.0000,0.0882,0.0221,0.6667,0.0000,1.0000,0.0000,...,0.0000,0.0000,0.0000,0.0093,0.2320,0.0000,0.0000,0.0000,0.0000,0.0000
9,0.0000,0.4000,0.0000,0.0000,0.0000,0.0000,0.0606,0.3333,0.0000,1.0000,...,0.0000,0.4000,0.1818,0.1053,0.0000,0.0000,0.0000,0.2500,0.1429,0.0000
