## Finding differentially expressed miRNAs between LUAD stages I, II, III, IV vs normal using Logit regression
### Utilize Group Lasso with MISIM miRNA similarity data

In [166]:
# Necessary imports
import os
import pandas
import numpy as np

from definitions import ROOT_DIR

## Load normal and cancer miRNA expression data

In [197]:
from definitions import ROOT_DIR

# Load files into pandas data frames
mirna_tumor_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/tumor_miRNA.csv"))
mirna_normal_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/normal_miRNA.csv"))
clinical_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/clinical/clinical.csv"))
validated_miRNA_csv = pandas.read_csv(os.path.join(ROOT_DIR, 'data/external/validated_luad_miRNAs_miRCancer.csv'))

# Print data frame shapes
print "mirna_tumor_df.shape", mirna_tumor_df.shape, ', nulls:', mirna_tumor_df.isnull().sum().sum()
print "mirna_normal_df.shape", mirna_normal_df.shape, ', nulls:', mirna_normal_df.isnull().sum().sum()
print 'validated_miRNAs.shape', validated_miRNA_csv.shape

# Merge normal and tumor miRNA expression profiles with clinical cancer stage data
mirna_normal = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_normal_df, on='patient_barcode')
mirna_normal['pathologic_stage'] = 'normal'
mirna_tumor = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_tumor_df, on='patient_barcode')

# Map stage IA to stage I, IB to I, etc. ...
pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I', 
                        'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II', 
                        'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}
mirna_tumor.replace({'pathologic_stage': pathologic_stage_map}, inplace=True)

# Store list of all miRNA's in miRNA expression data
mirna_list = list(mirna_tumor.columns)[2:]

# Print quick overview of data
print mirna_normal['pathologic_stage'].value_counts().sort_index(axis=0)
print mirna_tumor['pathologic_stage'].value_counts().sort_index(axis=0)

mirna_tumor_df.shape (513, 1882) , nulls: 0
mirna_normal_df.shape (46, 1882) , nulls: 0
validated_miRNAs.shape (34, 2)
normal    46
Name: pathologic_stage, dtype: int64
Stage I      277
Stage II     121
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


## Load normal & cancer Gene Expression data

In [199]:
gene_exp_tumor_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/processed/gene_expression/tumor/READ__illuminahiseq_rnaseqv2__GeneExp.txt'), 
                                      header=0, delimiter='\t')
gene_exp_normal_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/processed/gene_expression/normal/READ__illuminahiseq_rnaseqv2__GeneExp.txt'), 
                                      header=0, delimiter='\t')

gene_exp_tumor_df.rename(columns=lambda x: x[:12], inplace=True)
gene_exp_normal_df.rename(columns=lambda x: x[:12], inplace=True)

print gene_exp_tumor_df.shape
print gene_exp_normal_df.shape

(20530, 517)
(20530, 61)


## Reshape gene expression data frames to have shape (patients x gene expression) 

In [200]:
print len(list(gene_exp_tumor_df.columns))-2
print len(list(gene_exp_normal_df.columns))-2

# Remove entries with unknown Gene Symbol
gene_exp_tumor_df = gene_exp_tumor_df[gene_exp_tumor_df.GeneSymbol != '?']
gene_exp_normal_df = gene_exp_normal_df[gene_exp_normal_df.GeneSymbol != '?']

# Get list of all gene_symbols
gene_symbols = list(gene_exp_tumor_df['GeneSymbol'])
# Get list of tumor and normal patient_barcode
gene_exp_tumor_patient_barcodes = list(gene_exp_tumor_df.columns)[2:]
gene_exp_normal_patient_barcodes = list(gene_exp_normal_df.columns)[2:]

# Drop EntrezID column
gene_exp_tumor = gene_exp_tumor_df.drop(['EntrezID', 'GeneSymbol'], axis=1)
gene_exp_normal = gene_exp_normal_df.drop(['EntrezID', 'GeneSymbol'], axis=1)

# Reshaping data frame to have columns for GeneSymbols, and rows of patients
gene_exp_tumor = gene_exp_tumor.T
gene_exp_normal = gene_exp_normal.T
gene_exp_tumor.columns = gene_symbols
gene_exp_normal.columns = gene_symbols

# Add column for patients barcode
gene_exp_tumor['patient_barcode'] = gene_exp_tumor.index
gene_exp_normal['patient_barcode'] = gene_exp_normal.index

print "gene_symbols", len(gene_symbols)
print "gene_exp_tumor_patients", len(gene_exp_tumor_patient_barcodes)
print "gene_exp_normal_patients", len(gene_exp_normal_patient_barcodes)

print gene_exp_tumor.shape
print gene_exp_normal.shape

515
59
gene_symbols 20502
gene_exp_tumor_patients 515
gene_exp_normal_patients 59
(515, 20503)
(59, 20503)


## Filter samples with matched Gene Expression data and miRNA data

In [201]:
# Merge normal and tumor miRNA expression profiles with clinical cancer stage data
merged_normal_patients = pandas.merge(gene_exp_normal[['patient_barcode']], mirna_normal, on='patient_barcode')
merged_normal_patients = merged_normal_patients.fillna(merged_normal_patients.mean())[['patient_barcode', 'pathologic_stage']]
merged_tumor_patients = pandas.merge(gene_exp_tumor[['patient_barcode']], mirna_tumor, on='patient_barcode')
merged_tumor_patients = merged_tumor_patients.fillna(merged_tumor_patients.mean())[['patient_barcode', 'pathologic_stage']]

# Print quick overview of data
print merged_tumor_patients['pathologic_stage'].value_counts().sort_index(axis=0)
print merged_normal_patients['pathologic_stage'].value_counts().sort_index(axis=0)

mirna_tumor = mirna_tumor[mirna_tumor['patient_barcode'].isin(merged_tumor_patients['patient_barcode'])]
gene_exp_tumor = gene_exp_tumor[gene_exp_tumor['patient_barcode'].isin(merged_tumor_patients['patient_barcode'])]
mirna_normal = mirna_normal[mirna_normal['patient_barcode'].isin(merged_normal_patients['patient_barcode'])]
gene_exp_normal = gene_exp_normal[gene_exp_normal['patient_barcode'].isin(merged_normal_patients['patient_barcode'])]

mirna_tumor.drop(['patient_barcode', 'pathologic_stage'], 1, inplace=True)
mirna_normal.drop(['patient_barcode', 'pathologic_stage'], 1, inplace=True)

print "mirna_tumor", mirna_tumor.shape, mirna_tumor.isnull().sum().sum()
print "gene_exp_tumor", gene_exp_tumor.shape, gene_exp_tumor.isnull().sum().sum()
print "mirna_normal", mirna_normal.shape, mirna_normal.isnull().sum().sum()
print "gene_exp_normal", gene_exp_normal.shape, gene_exp_normal.isnull().sum().sum()

Stage I      275
Stage II     120
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64
normal    20
Name: pathologic_stage, dtype: int64


mirna_tumor (510, 1881) 0
gene_exp_tumor (510, 20503) 0
mirna_normal (20, 1881) 0
gene_exp_normal (20, 20503) 0


## Build miRNA-target relationship network
## Reference: Xu et al. (xu2011prioritizing)

In [202]:
from src.models.miRNA_target_network import miRNATargetNetwork
import networkx as nx

network = miRNATargetNetwork(threshold=0.6)
network.train(miRNAs_A=mirna_tumor, targets_A=gene_exp_tumor, miRNAs_B=mirna_normal, targets_B=gene_exp_normal)
print nx.bipartite.sets(network.B)

hsa-let-7a-1 - A1BG : 0.168492661018
hsa-let-7a-1 - A1CF : 0.0820435611878
hsa-let-7a-1 - RBFOX1 : 0.0737117868996
hsa-let-7a-1 - GGACT : -0.158622808191
hsa-let-7a-1 - A2M : -0.187444159012
hsa-let-7a-1 - A2ML1 : -0.154625311627
hsa-let-7a-1 - A4GALT : 0.129128218449
hsa-let-7a-1 - A4GNT : 0.0563189713128
hsa-let-7a-1 - NPSR1-AS1 : -0.480084934229
hsa-let-7a-1 - AAAS : 0.166864904934
hsa-let-7a-1 - AACS : 0.134693860012
hsa-let-7a-1 - AACSP1 : -0.611072632945
hsa-let-7a-1 - AADAC : 0.34243666034
hsa-let-7a-1 - AADACL2 : -0.123996523525
hsa-let-7a-1 - AADACL3 : -0.601693319924
hsa-let-7a-1 - AADACL4 : -0.118960790601
hsa-let-7a-1 - AADAT : -0.0616701415646
hsa-let-7a-1 - AAGAB : -0.0309940667089
hsa-let-7a-1 - AAK1 : -0.0288625246876
hsa-let-7a-1 - AAMP : 0.131710987029
hsa-let-7a-1 - AANAT : 0.215920528789
hsa-let-7a-1 - AARS : -0.0199947944412
hsa-let-7a-1 - AARS2 : 0.127825331376
hsa-let-7a-1 - AARSD1 : 0.0118823981682
hsa-let-7a-1 - AASDH : -0.338598199802
hsa-let-7a-1 - AASDHPPT :


hsa-let-7a-1 - ABCB11 : 0.197995873912
hsa-let-7a-1 - ABCB4 : -0.0931764058231
hsa-let-7a-1 - ABCB5 : 0.234827235786
hsa-let-7a-1 - ABCB6 : -0.219823578382
hsa-let-7a-1 - ABCB7 : -0.201716198676
hsa-let-7a-1 - ABCB8 : 0.149108023478
hsa-let-7a-1 - ABCB9 : -0.0902450979418
hsa-let-7a-1 - ABCC1 : 0.162896075662
hsa-let-7a-1 - ABCC10 : 0.21396194755
hsa-let-7a-1 - ABCC11 : 0.166793299203
hsa-let-7a-1 - ABCC12 : -0.1269570194
hsa-let-7a-1 - ABCC13 : -0.0614892446565
hsa-let-7a-1 - ABCC2 : -0.280295537797
hsa-let-7a-1 - ABCC3 : 0.206801875031
hsa-let-7a-1 - ABCC4 : -0.098264440021
hsa-let-7a-1 - ABCC5 : -0.14621505531
hsa-let-7a-1 - ABCC6 : 0.31534001114
hsa-let-7a-1 - ABCC6P1 : -0.0674811878891
hsa-let-7a-1 - ABCC6P2 : -0.07290332712
hsa-let-7a-1 - ABCC8 : 0.0811415709588
hsa-let-7a-1 - ABCC9 : 0.000681059856238
hsa-let-7a-1 - ABCD1 : 0.107618055198
hsa-let-7a-1 - ABCD2 : -0.0818613251349
hsa-let-7a-1 - ABCD3 : -0.0836719457331
hsa-let-7a-1 - ABCD4 : -0.122998515033
hsa-let-7a-1 - ABCE1 :

hsa-let-7a-1 - ABHD6 : -0.442826755005
hsa-let-7a-1 - ABHD8 : 0.274259747345
hsa-let-7a-1 - ABI1 : 0.0982713993068
hsa-let-7a-1 - ABI2 : -0.0408000735034
hsa-let-7a-1 - ABI3 : 0.316741326415
hsa-let-7a-1 - ABI3BP : -0.379971522414
hsa-let-7a-1 - ABL1 : 0.369639437252
hsa-let-7a-1 - ABL2 : 0.0967228001572
hsa-let-7a-1 - ABLIM1 : 0.0150659251174
hsa-let-7a-1 - ABLIM2 : 0.0226804706151
hsa-let-7a-1 - ABLIM3 : -0.340739314811
hsa-let-7a-1 - ABO : 0.0835360572908
hsa-let-7a-1 - AOC1 : -0.111982836188
hsa-let-7a-1 - ABR : 0.376839202099
hsa-let-7a-1 - ABRA : -0.225012825695
hsa-let-7a-1 - ABT1 : -0.172973083269
hsa-let-7a-1 - ABTB1 : 0.319735872488
hsa-let-7a-1 - ABTB2 : 0.515631157695
hsa-let-7a-1 - ACAA1 : 0.402131046885
hsa-let-7a-1 - ACAA2 : 0.118285270512
hsa-let-7a-1 - ACACA : 0.228921841593
hsa-let-7a-1 - ACACB : -0.133620915257
hsa-let-7a-1 - ACAD10 : 0.101798036176
hsa-let-7a-1 - ACAD11 : 0.174158910008
hsa-let-7a-1 - ACAD8 : -0.394238208277
hsa-let-7a-1 - ACAD9 : 0.177853436629
hsa

hsa-let-7a-1 - ACCSL : nan
hsa-let-7a-1 - ACD : 0.0426707045269
hsa-let-7a-1 - ACE : 0.402872968495
hsa-let-7a-1 - ACE2 : -0.0528328235977
hsa-let-7a-1 - ACER1 : 0.249871137394
hsa-let-7a-1 - ACER2 : -0.161946109063
hsa-let-7a-1 - ACER3 : 0.00729565704537
hsa-let-7a-1 - ACHE : 0.0204525112434
hsa-let-7a-1 - ACIN1 : 0.0344351698045
hsa-let-7a-1 - ACLY : 0.037459787234
hsa-let-7a-1 - ACMSD : 0.30526512273
hsa-let-7a-1 - ACN9 : -0.234733114108
hsa-let-7a-1 - ACO1 : -0.0051242547096
hsa-let-7a-1 - ACO2 : 0.207267078505
hsa-let-7a-1 - ACOT1 : -0.0323441624778
hsa-let-7a-1 - ACOT11 : 0.0939763026268
hsa-let-7a-1 - ACOT12 : nan
hsa-let-7a-1 - ACOT13 : 0.0556626551496
hsa-let-7a-1 - ACOT2 : 0.0352847514817
hsa-let-7a-1 - ACOT4 : 0.172938108938
hsa-let-7a-1 - ACOT6 : -0.225579987829
hsa-let-7a-1 - ACOT7 : 0.391713073412
hsa-let-7a-1 - ACOT8 : 0.3749951032
hsa-let-7a-1 - ACOT9 : 0.152038278537
hsa-let-7a-1 - ACOX1 : 0.500073442961
hsa-let-7a-1 - ACOX2 : -0.0947498900207
hsa-let-7a-1 - ACOX3 : 0.


hsa-let-7a-1 - ACSM1 : 0.130710615003
hsa-let-7a-1 - ACSM2A : 0.059177808238
hsa-let-7a-1 - ACSM2B : -0.253358747795
hsa-let-7a-1 - ACSM3 : -0.581931929452
hsa-let-7a-1 - ACSM4 : 0.0717502700719
hsa-let-7a-1 - ACSM5 : 0.104798499067
hsa-let-7a-1 - ACSS1 : 0.0286462381782
hsa-let-7a-1 - ACSS2 : 0.148010770214
hsa-let-7a-1 - ACSS3 : -0.391217306214
hsa-let-7a-1 - ACTA1 : -0.0152919685773
hsa-let-7a-1 - ACTA2 : -0.427032891978
hsa-let-7a-1 - ACTB : 0.224868460762
hsa-let-7a-1 - ACTBL2 : 0.307097991579
hsa-let-7a-1 - ACTC1 : 0.101138633759
hsa-let-7a-1 - ACTG1 : 0.17664015752
hsa-let-7a-1 - ACTG2 : -0.324490468182
hsa-let-7a-1 - ACTL6A : -0.26971444827
hsa-let-7a-1 - ACTL6B : -0.163181610796
hsa-let-7a-1 - ACTL7A : -0.437413259053
hsa-let-7a-1 - ACTL7B : 0.18180636323
hsa-let-7a-1 - ACTL8 : 0.465467581586
hsa-let-7a-1 - ACTL9 : nan
hsa-let-7a-1 - ACTN1 : -0.0012198787234
hsa-let-7a-1 - ACTN2 : 0.212821460582
hsa-let-7a-1 - ACTN3 : 0.129864062797
hsa-let-7a-1 - ACTN4 : 0.3558797387
hsa-let

-0.0391526786058
hsa-let-7a-1 - ADA : -0.0558232343811
hsa-let-7a-1 - ADAD1 : -0.0368283566105
hsa-let-7a-1 - ADAD2 : -0.314500373683
hsa-let-7a-1 - ADAL : -0.19515264526
hsa-let-7a-1 - ADAM10 : -0.266159280642
hsa-let-7a-1 - ADAM11 : -0.149842367617
hsa-let-7a-1 - ADAM12 : -0.214956726693
hsa-let-7a-1 - ADAM15 : -0.178898050359
hsa-let-7a-1 - ADAM17 : 0.171225677377
hsa-let-7a-1 - ADAM18 : nan
hsa-let-7a-1 - ADAM19 : -0.171639202945
hsa-let-7a-1 - ADAM2 : nan
hsa-let-7a-1 - ADAM20 : 0.220003227305
hsa-let-7a-1 - ADAM21 : 0.122991354219
hsa-let-7a-1 - ADAM21P1 : 0.475888031308
hsa-let-7a-1 - ADAM22 : 0.067047858866
hsa-let-7a-1 - ADAM23 : -0.38287662467
hsa-let-7a-1 - ADAM28 : 0.0151523804462
hsa-let-7a-1 - ADAM29 : 0.179672820349
hsa-let-7a-1 - ADAM30 : 0.213208865511
hsa-let-7a-1 - ADAM32 : -0.204195535375
hsa-let-7a-1 - ADAM33 : 0.0417228672215
hsa-let-7a-1 - ADAM3A : nan
hsa-let-7a-1 - ADAM5 : nan
hsa-let-7a-1 - ADAM6 : -0.241465925665
hsa-let-7a-1 - ADAM7 : nan
hsa-let-7a-1 - ADAM


hsa-let-7a-1 - ADAMTSL2 : -0.0425481456091
hsa-let-7a-1 - ADAMTSL3 : -0.192576679466
hsa-let-7a-1 - ADAMTSL4 : 0.361133482793
hsa-let-7a-1 - ADAMTSL5 : 0.525585666592
hsa-let-7a-1 - ADAP1 : 0.373995905666
hsa-let-7a-1 - ADAP2 : 0.129536088868
hsa-let-7a-1 - ADAR : 0.101111953557
hsa-let-7a-1 - ADARB1 : -0.0807565221638
hsa-let-7a-1 - ADARB2 : 0.111975379689
hsa-let-7a-1 - ADAT1 : -0.241310026557
hsa-let-7a-1 - ADAT2 : -0.164584968287
hsa-let-7a-1 - ADAT3 : 0.319811078159
hsa-let-7a-1 - AZIN2 : -0.140754472387
hsa-let-7a-1 - ADCK1 : -0.106655986853
hsa-let-7a-1 - ADCK2 : 0.042809818048
hsa-let-7a-1 - ADCK4 : 0.532375544389
hsa-let-7a-1 - ADCK5 : 0.1563741647
hsa-let-7a-1 - ADCY1 : -0.642465643726
hsa-let-7a-1 - ADCY10 : -0.150901201421
hsa-let-7a-1 - ADCY2 : -0.0476267100145
hsa-let-7a-1 - ADCY3 : 0.169094278279
hsa-let-7a-1 - ADCY4 : -0.135641588353
hsa-let-7a-1 - ADCY5 : -0.106441877213
hsa-let-7a-1 - ADCY6 : 0.0177190374922
hsa-let-7a-1 - ADCY7 : -0.0943224380815
hsa-let-7a-1 - ADCY

 : -0.247855989365
hsa-let-7a-1 - ADNP : -0.192699890432
hsa-let-7a-1 - ADNP2 : 0.0928398060715
hsa-let-7a-1 - ADO : -0.613756095493
hsa-let-7a-1 - ADORA1 : 0.056747226557
hsa-let-7a-1 - ADORA2A : 0.401418243326
hsa-let-7a-1 - ADORA2B : 0.404903412077
hsa-let-7a-1 - ADORA3 : 0.284072095575
hsa-let-7a-1 - ADPGK : 0.0954904312897
hsa-let-7a-1 - ADPRH : -0.0994339987167
hsa-let-7a-1 - ADPRHL1 : 0.0477300130206
hsa-let-7a-1 - ADPRHL2 : 0.369875121755
hsa-let-7a-1 - ADRA1A : -0.0372252597754
hsa-let-7a-1 - ADRA1B : 0.259735334282
hsa-let-7a-1 - ADRA1D : 0.276078327512
hsa-let-7a-1 - ADRA2A : -0.217414784416
hsa-let-7a-1 - ADRA2B : -0.0970778116224
hsa-let-7a-1 - ADRA2C : -0.0745511534
hsa-let-7a-1 - ADRB1 : 0.455273996783
hsa-let-7a-1 - ADRB2 : 0.123539831742
hsa-let-7a-1 - ADRB3 : 0.144007368735
hsa-let-7a-1 - ADRBK1 : -0.0467591766829
hsa-let-7a-1 - ADRBK2 : 0.0930857159919
hsa-let-7a-1 - ADRM1 : 0.270037992299
hsa-let-7a-1 - ADSL : -0.125207112995
hsa-let-7a-1 - ADSS : -0.0534256001494
h

hsa-let-7a-1 - AGAP3 : 0.499932760281
hsa-let-7a-1 - AGAP4 : AGAP4    0.151736
AGAP4    0.164651
dtype: float64


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## Loading miRecords experimentally validated miRNA-targets interactions

In [157]:
# Load the miRecords miRNA-target interaction table (tab-separated) from file
miRecords_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/external/miRecords_version4.tsv'), delimiter='\t')

# Select only homo sapiens miRNA-target pairs and keep the two ID columns.
# .copy() gives an independent frame, so the column assignments below do not
# write into a selection of the loaded table (avoids SettingWithCopyWarning).
is_human_pair = ((miRecords_df["miRNA_species"] == "Homo sapiens") &
                 (miRecords_df["Target gene_species_scientific"] == "Homo sapiens"))
miRecords_df = miRecords_df.loc[is_human_pair, ["miRNA_mature_ID", "Target gene_name"]].copy()

# Standardize miRNA and gene symbols: lower-case miRNA IDs with '*' removed,
# upper-case gene names
miRecords_df['miRNA_mature_ID'] = miRecords_df['miRNA_mature_ID'].str.lower()
miRecords_df['miRNA_mature_ID'] = miRecords_df['miRNA_mature_ID'].str.replace('*', '')
miRecords_df['Target gene_name'] = miRecords_df['Target gene_name'].str.upper()

# Filter miRNA-target pairs to only miRNAs included in miRNA expression data, same for gene targets
# NOTE(review): the expression data uses names such as 'hsa-let-7a-1' while this
# column holds mature IDs -- confirm the two naming schemes actually match.
in_expression_data = (miRecords_df["miRNA_mature_ID"].isin(mirna_list) &
                      miRecords_df["Target gene_name"].isin(gene_symbols))
miRecords_df = miRecords_df[in_expression_data]

# miRecords_df