## Finding differentialy expressed miRNA's between LUAD stages I, II, III, IV vs normal using Logit regression
### Utilize Group Lasso with MISIM miRNA similarity data

In [166]:
# Necessary imports
import os
import pandas
import numpy as np

from definitions import ROOT_DIR

## Load normal and cancer miRNA expression data

In [167]:
from definitions import ROOT_DIR

# Load files into pandas data frames
mirna_tumor_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/tumor_miRNA.csv"))
mirna_normal_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/normal_miRNA.csv"))
clinical_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/clinical/clinical.csv"))
validated_miRNA_csv = pandas.read_csv(os.path.join(ROOT_DIR, 'data/external/validated_luad_miRNAs_miRCancer.csv'))

# Print data frame shapes
print "mirna_tumor_df.shape", mirna_tumor_df.shape, ', nulls:', mirna_tumor_df.isnull().sum().sum()
print "mirna_normal_df.shape", mirna_normal_df.shape, ', nulls:', mirna_normal_df.isnull().sum().sum()
print 'validated_miRNAs.shape', validated_miRNA_csv.shape

# Merge normal and tumor miRNA expression profiles with clinical cancer stage data
mirna_normal = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_normal_df, on='patient_barcode')
mirna_normal['pathologic_stage'] = 'normal'
mirna_tumor = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_tumor_df, on='patient_barcode')

# Map stage IA to stage I, IB to I, etc. ...
pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I', 
                        'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II', 
                        'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}
mirna_tumor.replace({'pathologic_stage': pathologic_stage_map}, inplace=True)

# Store list of all miRNA's in miRNA expression data
mirna_list = list(mirna_tumor.columns)[2:]

# Print quick overview of data
print mirna_normal['pathologic_stage'].value_counts().sort_index(axis=0)
print mirna_tumor['pathologic_stage'].value_counts().sort_index(axis=0)

mirna_tumor_df.shape (513, 1882) , nulls: 0
mirna_normal_df.shape (46, 1882) , nulls: 0
validated_miRNAs.shape (34, 2)
normal    46
Name: pathologic_stage, dtype: int64
Stage I      277
Stage II     121
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


## Load normal & cancer Gene Expression data

In [168]:
gene_exp_tumor_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/processed/gene_expression/tumor/READ__illuminahiseq_rnaseqv2__GeneExp.txt'), 
                                      header=0, delimiter='\t')
gene_exp_normal_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/processed/gene_expression/normal/READ__illuminahiseq_rnaseqv2__GeneExp.txt'), 
                                      header=0, delimiter='\t')

gene_exp_tumor_df.rename(columns=lambda x: x[:12], inplace=True)
gene_exp_normal_df.rename(columns=lambda x: x[:12], inplace=True)

print gene_exp_tumor_df.shape
print gene_exp_normal_df.shape

(20530, 517)
(20530, 61)


## Reshape gene expression data frames to have shape (patients x gene expression) 

In [169]:
print len(list(gene_exp_tumor_df.columns))-2
print len(list(gene_exp_normal_df.columns))-2

# Remove entries with unknown Gene Symbol
gene_exp_tumor_df = gene_exp_tumor_df[gene_exp_tumor_df.GeneSymbol != '?']
gene_exp_normal_df = gene_exp_normal_df[gene_exp_normal_df.GeneSymbol != '?']

# Get list of all gene_symbols
gene_symbols = list(gene_exp_tumor_df['GeneSymbol'])
# Get list of tumor and normal patient_barcode
gene_exp_tumor_patient_barcodes = list(gene_exp_tumor_df.columns)[2:]
gene_exp_normal_patient_barcodes = list(gene_exp_normal_df.columns)[2:]

# Drop EntrezID column
gene_exp_tumor = gene_exp_tumor_df.drop(['EntrezID', 'GeneSymbol'], axis=1)
gene_exp_normal = gene_exp_normal_df.drop(['EntrezID', 'GeneSymbol'], axis=1)

# Reshaping data frame to have columns for GeneSymbols, and rows of patients
gene_exp_tumor = gene_exp_tumor.T
gene_exp_normal = gene_exp_normal.T
gene_exp_tumor.columns = gene_symbols
gene_exp_normal.columns = gene_symbols

# Add column for patients barcode
gene_exp_tumor['patient_barcode'] = gene_exp_tumor.index
gene_exp_normal['patient_barcode'] = gene_exp_normal.index

print "gene_symbols", len(gene_symbols)
print "gene_exp_tumor_patients", len(gene_exp_tumor_patient_barcodes)
print "gene_exp_normal_patients", len(gene_exp_normal_patient_barcodes)

print gene_exp_tumor.shape
print gene_exp_normal.shape

515
59
gene_symbols 20502
gene_exp_tumor_patients 515
gene_exp_normal_patients 59
(515, 20503)
(59, 20503)


## Match Gene Expression data to miRNA data

In [170]:
# Merge normal and tumor miRNA expression profiles with clinical cancer stage data
merged_normal_patients = pandas.merge(gene_exp_normal[['patient_barcode']], mirna_normal, on='patient_barcode')[['patient_barcode', 'pathologic_stage']]
merged_tumor_patients = pandas.merge(gene_exp_tumor[['patient_barcode']], mirna_tumor, on='patient_barcode')[['patient_barcode', 'pathologic_stage']]

# Print quick overview of data
print merged_normal_patients['pathologic_stage'].value_counts().sort_index(axis=0)
print merged_tumor_patients['pathologic_stage'].value_counts().sort_index(axis=0)

normal    20
Name: pathologic_stage, dtype: int64
Stage I      275
Stage II     120
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


## Build miRNA-target relationship network to identify 
## Xu et al. (xu2011prioritizing)

In [171]:
from src.models.miRNA_target_network import miRNATargetNetwork
import networkx as nx

network = miRNATargetNetwork(threshold=0.6)
network.train(miRNAs_A=mirna_tumor, targets_A=gene_exp_tumor, miRNAs_B=mirna_normal, targets_B=gene_exp_normal)
print nx.bipartite.sets(network.B)

TypeError: Could not convert TCGA-53-7813TCGA-49-6745TCGA-44-A47GTCGA-55-7910TCGA-97-7547TCGA-L4-A4E6TCGA-67-4679TCGA-55-8094TCGA-91-6831TCGA-55-7724TCGA-78-7147TCGA-44-6778TCGA-55-8206TCGA-78-7540TCGA-44-5645TCGA-67-6215TCGA-99-AA5RTCGA-73-A9RSTCGA-MP-A4T4TCGA-97-7552TCGA-78-7153TCGA-93-A4JOTCGA-44-6146TCGA-93-A4JPTCGA-93-7348TCGA-MP-A4TCTCGA-80-5607TCGA-J2-8194TCGA-78-7166TCGA-73-7499TCGA-49-6761TCGA-97-A4M5TCGA-62-A46OTCGA-78-7535TCGA-67-3774TCGA-MP-A4TJTCGA-MP-A4TITCGA-55-8299TCGA-62-8397TCGA-55-1594TCGA-86-8359TCGA-55-6972TCGA-44-2668TCGA-55-7574TCGA-05-4420TCGA-64-1679TCGA-97-8176TCGA-62-A46RTCGA-91-6828TCGA-L4-A4E5TCGA-05-4410TCGA-50-5932TCGA-78-8660TCGA-44-6774TCGA-55-8097TCGA-05-4390TCGA-75-6207TCGA-55-7913TCGA-55-8511TCGA-44-8117TCGA-38-4626TCGA-75-7027TCGA-97-7554TCGA-97-8552TCGA-91-6836TCGA-95-7562TCGA-44-2662TCGA-NJ-A55ATCGA-44-6775TCGA-55-6969TCGA-55-A492TCGA-55-A491TCGA-73-4676TCGA-L9-A8F4TCGA-91-A4BDTCGA-L9-A444TCGA-78-7163TCGA-69-7763TCGA-NJ-A4YGTCGA-05-4249TCGA-55-A4DGTCGA-86-7713TCGA-86-7711TCGA-44-A479TCGA-55-6970TCGA-55-6982TCGA-95-8039TCGA-86-8280TCGA-NJ-A4YFTCGA-50-5941TCGA-49-6744TCGA-55-A4DFTCGA-64-5778TCGA-62-8394TCGA-93-8067TCGA-44-2666TCGA-L9-A743TCGA-64-5774TCGA-97-A4M0TCGA-49-AARQTCGA-97-7546TCGA-55-8616TCGA-78-7146TCGA-93-7347TCGA-69-7760TCGA-44-A47FTCGA-78-7145TCGA-44-6776TCGA-05-4424TCGA-55-6981TCGA-49-4514TCGA-86-8669TCGA-05-4397TCGA-44-3919TCGA-44-7667TCGA-69-7765TCGA-MP-A4SVTCGA-05-4389TCGA-75-7025TCGA-50-6591TCGA-78-7536TCGA-99-7458TCGA-55-A48XTCGA-44-4112TCGA-55-8301TCGA-05-4415TCGA-49-4486TCGA-55-1592TCGA-69-7978TCGA-73-7498TCGA-62-A46PTCGA-44-5644TCGA-05-4434TCGA-95-7567TCGA-97-8547TCGA-MN-A4N1TCGA-50-5072TCGA-49-AAQVTCGA-50-5936TCGA-62-A470TCGA-J2-A4ADTCGA-49-AAR2TCGA-MN-A4N4TCGA-97-8172TCGA-50-5930TCGA-62-A472TCGA-91-6829TCGA-78-7150TCGA-55-8620TCGA-44-7671TCGA-50-5068TCGA-95-7948TCGA-69-7974TCGA-MP-A4THTCGA-67-6217TCGA-67-6216TCGA-97-A4M3TCGA-05-4430TCGA-55-7903TCGA-05-4432TCGA-55-8614TCGA-67-3771TCGA-49-4487TCGA-05-4427TCGA-50-5055TCGA-50-6593TCGA-53-7624TCGA-55-7570TCGA-MP-A5C7TCGA-67-3770TCGA-50-5935TCGA-62-A46VTCGA-44-8119TCGA-49-4512TCGA-49-4494TCGA-50-5944TCGA-64-1680TCGA-38-4631TCGA-55-6984TCGA-55-7725TCGA-55-1596TCGA-69-7764TCGA-49-AAR3TCGA-95-7039TCGA-78-7154TCGA-38-7271TCGA-05-4405TCGA-69-7979TCGA-97-8179TCGA-69-7980TCGA-49-6742TCGA-J2-8192TCGA-95-8494TCGA-50-5933TCGA-55-A490TCGA-44-3398TCGA-62-A46STCGA-44-8120TCGA-75-5122TCGA-97-A4M2TCGA-78-7155TCGA-NJ-A7XGTCGA-44-A47ATCGA-35-5375TCGA-L9-A5IPTCGA-55-6971TCGA-38-4630TCGA-55-8508TCGA-38-4629TCGA-55-7576TCGA-55-6712TCGA-05-4250TCGA-55-7281TCGA-62-A46YTCGA-55-8621TCGA-MP-A4T9TCGA-78-7161TCGA-55-7283TCGA-49-4507TCGA-75-6211TCGA-50-6595TCGA-86-8671TCGA-73-4662TCGA-35-3615TCGA-53-7626TCGA-05-5423TCGA-55-7911TCGA-86-7701TCGA-MP-A4T8TCGA-75-7031TCGA-55-7727TCGA-86-7955TCGA-35-4123TCGA-91-6849TCGA-44-5643TCGA-75-5147TCGA-97-7938TCGA-64-5815TCGA-05-4396TCGA-05-5425TCGA-78-7220TCGA-55-8089TCGA-69-7761TCGA-75-5146TCGA-55-8208TCGA-44-6144TCGA-NJ-A55OTCGA-86-8281TCGA-53-A4EZTCGA-38-4632TCGA-O1-A52JTCGA-05-4402TCGA-95-A4VNTCGA-86-A456TCGA-38-6178TCGA-49-AAR0TCGA-91-8499TCGA-55-6978TCGA-78-7158TCGA-35-4122TCGA-55-8505TCGA-71-6725TCGA-86-8668TCGA-62-A46UTCGA-44-A47BTCGA-97-8175TCGA-78-7160TCGA-55-8204TCGA-55-6985TCGA-44-A4SSTCGA-69-8255TCGA-86-7714TCGA-05-4395TCGA-97-8177TCGA-55-6983TCGA-86-A4P8TCGA-MP-A4SYTCGA-50-5942TCGA-MN-A4N5TCGA-80-5611TCGA-55-6986TCGA-97-7941TCGA-55-8205TCGA-MP-A4TFTCGA-S2-AA1ATCGA-62-8395TCGA-MP-A4TATCGA-44-7662TCGA-44-7660TCGA-55-7815TCGA-78-7633TCGA-44-2661TCGA-86-8075TCGA-86-8674TCGA-86-8054TCGA-49-AARETCGA-86-8358TCGA-67-3773TCGA-55-A494TCGA-86-8056TCGA-91-7771TCGA-86-8055TCGA-05-4422TCGA-49-4501TCGA-78-7162TCGA-69-8453TCGA-55-6642TCGA-49-4510TCGA-55-8096TCGA-55-8510TCGA-62-A471TCGA-44-2659TCGA-MP-A4T7TCGA-49-4505TCGA-75-6203TCGA-50-7109TCGA-55-6980TCGA-75-5126TCGA-J2-A4AETCGA-67-3772TCGA-49-AAR9TCGA-50-5931TCGA-49-6767TCGA-05-5715TCGA-64-5775TCGA-50-5045TCGA-55-A493TCGA-38-4627TCGA-44-6148TCGA-78-7152TCGA-55-8514TCGA-44-A4SUTCGA-44-2655TCGA-44-7669TCGA-05-4425TCGA-99-8033TCGA-86-A4JFTCGA-78-8648TCGA-05-4403TCGA-55-8619TCGA-50-5946TCGA-50-5939TCGA-91-8496TCGA-95-A4VKTCGA-05-4382TCGA-05-4418TCGA-55-8207TCGA-95-7043TCGA-86-7953TCGA-49-AAROTCGA-91-6847TCGA-71-8520TCGA-05-4384TCGA-78-8640TCGA-55-A48YTCGA-91-6835TCGA-50-5044TCGA-73-4675TCGA-73-4659TCGA-95-7944TCGA-L9-A50WTCGA-97-7553TCGA-L9-A443TCGA-50-6597TCGA-50-6592TCGA-83-5908TCGA-69-8253TCGA-55-8615TCGA-69-7973TCGA-55-5899TCGA-75-5125TCGA-55-7816TCGA-55-7573TCGA-97-8174TCGA-NJ-A4YITCGA-86-8673TCGA-78-7539TCGA-64-1676TCGA-86-8074TCGA-49-4506TCGA-55-8302TCGA-86-A4P7TCGA-MP-A4TDTCGA-86-8278TCGA-86-8672TCGA-50-8457TCGA-NJ-A4YPTCGA-NJ-A4YQTCGA-78-7156TCGA-05-4426TCGA-97-A4LXTCGA-49-AARNTCGA-55-7284TCGA-MP-A4TETCGA-38-4628TCGA-91-6830TCGA-86-8585TCGA-55-8085TCGA-95-7947TCGA-97-7937TCGA-44-3396TCGA-44-6779TCGA-55-7726TCGA-91-6840TCGA-55-7995TCGA-55-8513TCGA-75-6205TCGA-91-A4BCTCGA-86-8076TCGA-64-5781TCGA-4B-A93VTCGA-50-6590TCGA-50-8459TCGA-55-6987TCGA-05-5428TCGA-86-8073TCGA-93-A4JQTCGA-50-5051TCGA-64-1681TCGA-44-6147TCGA-75-6212TCGA-64-1677TCGA-80-5608TCGA-MP-A4TKTCGA-99-8025TCGA-62-8399TCGA-55-7907TCGA-69-8254TCGA-55-7728TCGA-55-A57BTCGA-50-6673TCGA-44-7670TCGA-86-7954TCGA-55-1595TCGA-05-5420TCGA-49-4488TCGA-50-5049TCGA-44-2656TCGA-05-4398TCGA-44-6145TCGA-55-8090TCGA-55-8203TCGA-55-8506TCGA-05-4244TCGA-91-6848TCGA-49-4490TCGA-05-5429TCGA-49-AARRTCGA-55-8092TCGA-86-8279TCGA-49-AAR4TCGA-93-A4JNTCGA-78-7167TCGA-78-8662TCGA-78-7159TCGA-49-6743TCGA-86-6562TCGA-64-5779TCGA-62-8398TCGA-95-A4VPTCGA-50-8460TCGA-55-7994TCGA-44-6777TCGA-99-8032TCGA-44-3918TCGA-78-8655TCGA-75-6206TCGA-44-2657TCGA-86-A4D0TCGA-55-8512TCGA-78-7542TCGA-44-2665TCGA-78-7143TCGA-05-4433TCGA-55-8087TCGA-L9-A7SVTCGA-J2-A4AGTCGA-44-7659TCGA-NJ-A55RTCGA-38-A44FTCGA-97-8171TCGA-38-4625TCGA-55-7227TCGA-64-1678TCGA-55-6968TCGA-55-6543TCGA-99-8028TCGA-55-8091TCGA-91-8497TCGA-55-A48ZTCGA-75-7030TCGA-75-6214TCGA-78-7149TCGA-05-4417TCGA-97-A4M7TCGA-MP-A4SWTCGA-62-8402TCGA-97-A4M1TCGA-55-8507TCGA-55-7914TCGA-69-A59KTCGA-44-7672TCGA-50-6594TCGA-78-7148TCGA-MP-A4T2TCGA-55-6979TCGA-78-7537TCGA-50-5066TCGA-73-4658TCGA-44-7661TCGA-86-6851TCGA-MP-A4T6TCGA-97-A4M6 to numeric

In [173]:
nx.bipartite.sets(network.B)

(set(),
 {'MTVR2',
  'ATRX',
  'LOC441204',
  'TCOF1',
  'NSRP1',
  'LOC441208',
  'SPPL3',
  'OPA3',
  'OPA1',
  'hsa-mir-4707',
  'hsa-mir-4706',
  'hsa-mir-4705',
  'hsa-mir-4704',
  'hsa-mir-4703',
  'hsa-mir-4701',
  'hsa-mir-4700',
  'ITGA1',
  'ITGA2',
  'ITGA3',
  'ITGA4',
  'ITGA5',
  'hsa-mir-4709',
  'hsa-mir-4708',
  'TRHR',
  'LOC100132288',
  'UFSP1',
  'UFSP2',
  'FAM212B',
  'FAM212A',
  'CHST9',
  'CHST8',
  'CHST1',
  'CHST3',
  'CHST2',
  'CHST5',
  'CHST4',
  'CHST7',
  'CHST6',
  'ITGAX',
  'BCL2A1',
  'ITGAV',
  'PREP',
  'ITGAL',
  'ITGAM',
  'ITGAD',
  'SPR',
  'IGF2R',
  'SLC36A4',
  'SLC36A3',
  'CKS1B',
  'SLC36A1',
  'C14orf119',
  'ART1',
  'ART3',
  'ART5',
  'ART4',
  'hsa-mir-3168',
  'hsa-mir-3169',
  'hsa-mir-3166',
  'hsa-mir-3167',
  'hsa-mir-3164',
  'hsa-mir-3165',
  'hsa-mir-3162',
  'hsa-mir-3163',
  'hsa-mir-3161',
  'MGC57346',
  'CRTAM',
  'CADPS2',
  'RLF',
  'CRTAP',
  'MDH1',
  'HPYR1',
  'CPEB4',
  'CPEB1',
  'CPEB3',
  'SBSPON',
  'SRBD1'

## Loading miRecords experimentally validated miRNA-targets interactions

In [157]:
# Load data frame from file
miRecords_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/external/miRecords_version4.tsv'), delimiter='\t')

# Select only homo sapiens miRNA-target pairs
miRecords_df = miRecords_df[(miRecords_df["miRNA_species"] == "Homo sapiens") &
                            (miRecords_df["Target gene_species_scientific"] == "Homo sapiens")]
miRecords_df = miRecords_df[["miRNA_mature_ID", "Target gene_name"]]

# Standardize miRNA and gene symbols
miRecords_df['miRNA_mature_ID'] = miRecords_df['miRNA_mature_ID'].str.lower()
miRecords_df['miRNA_mature_ID'] = miRecords_df['miRNA_mature_ID'].str.replace('*', '')
miRecords_df['Target gene_name'] = miRecords_df['Target gene_name'].str.upper()

# Filter miRNA-target pairs to only miRNA's included in miRNA expression data, same for gene targets 
miRecords_df = miRecords_df[miRecords_df["miRNA_mature_ID"].isin(mirna_list) &
                            miRecords_df["Target gene_name"].isin(gene_symbols)]

# miRecords_df