## Finding differentially expressed miRNAs between LUAD stages I, II, III, IV vs normal using Logit regression
### Utilize Group Lasso with MISIM miRNA similarity data

In [1]:
# Necessary imports

import os
import pandas
import numpy as np

from definitions import ROOT_DIR

## Load normal and cancer miRNA expression data

In [2]:
from definitions import ROOT_DIR

# Load files into pandas data frames
mirna_tumor_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/tumor_miRNA.csv"))
mirna_normal_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/miRNA/normal_miRNA.csv"))
clinical_df = pandas.read_csv(os.path.join(ROOT_DIR, "data/processed/clinical/clinical.csv"))
validated_miRNA_csv = pandas.read_csv(os.path.join(ROOT_DIR, 'data/external/validated_luad_miRNAs_miRCancer.csv'))

# Print data frame shapes
print "mirna_tumor_df.shape", mirna_tumor_df.shape, ', nulls:', mirna_tumor_df.isnull().sum().sum()
print "mirna_normal_df.shape", mirna_normal_df.shape, ', nulls:', mirna_normal_df.isnull().sum().sum()
print 'validated_miRNAs.shape', validated_miRNA_csv.shape

# Merge normal and tumor miRNA expression profiles with clinical cancer stage data
mirna_normal = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_normal_df, on='patient_barcode')
mirna_normal['pathologic_stage'] = 'normal'
mirna_tumor = pandas.merge(clinical_df[['patient_barcode', 'pathologic_stage']], mirna_tumor_df, on='patient_barcode')

# Map stage IA to stage I, IB to I, etc. ...
pathologic_stage_map = {'Stage IA': 'Stage I', 'Stage IB': 'Stage I', 
                        'Stage IIA': 'Stage II', 'Stage IIB': 'Stage II', 
                        'Stage IIIA': 'Stage III', 'Stage IIIB': 'Stage III'}
mirna_tumor.replace({'pathologic_stage': pathologic_stage_map}, inplace=True)

# Store list of all miRNA's in miRNA expression data
mirna_list = list(mirna_tumor.columns)[2:]

# Print quick overview of data
print mirna_normal['pathologic_stage'].value_counts().sort_index(axis=0)
print mirna_tumor['pathologic_stage'].value_counts().sort_index(axis=0)

mirna_tumor_df.shape (513, 1882) , nulls: 0
mirna_normal_df.shape (46, 1882) , nulls: 0
validated_miRNAs.shape (34, 2)
normal    46
Name: pathologic_stage, dtype: int64
Stage I      277
Stage II     121
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64


## Load normal & cancer Gene Expression data

In [3]:
gene_exp_tumor_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/processed/gene_expression/tumor/READ__illuminahiseq_rnaseqv2__GeneExp.txt'), 
                                      header=0, delimiter='\t')
gene_exp_normal_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/processed/gene_expression/normal/READ__illuminahiseq_rnaseqv2__GeneExp.txt'), 
                                      header=0, delimiter='\t')

gene_exp_tumor_df.rename(columns=lambda x: x[:12], inplace=True)
gene_exp_normal_df.rename(columns=lambda x: x[:12], inplace=True)

print gene_exp_tumor_df.shape
print gene_exp_normal_df.shape

(20530, 517)
(20530, 61)


## Reshape gene expression data frames to have shape (patients x gene expression) 

In [4]:
print len(list(gene_exp_tumor_df.columns))-2
print len(list(gene_exp_normal_df.columns))-2

# Remove entries with unknown Gene Symbol
gene_exp_tumor_df = gene_exp_tumor_df[gene_exp_tumor_df.GeneSymbol != '?']
gene_exp_normal_df = gene_exp_normal_df[gene_exp_normal_df.GeneSymbol != '?']

# Get list of all gene_symbols
gene_symbols = list(gene_exp_tumor_df['EntrezID'])
# Get list of tumor and normal patient_barcode
gene_exp_tumor_patient_barcodes = list(gene_exp_tumor_df.columns)[2:]
gene_exp_normal_patient_barcodes = list(gene_exp_normal_df.columns)[2:]

# Drop EntrezID column
gene_exp_tumor = gene_exp_tumor_df.drop(['EntrezID', 'GeneSymbol'], axis=1)
gene_exp_normal = gene_exp_normal_df.drop(['EntrezID', 'GeneSymbol'], axis=1)

# Reshaping data frame to have columns for GeneSymbols, and rows of patients
gene_exp_tumor = gene_exp_tumor.T
gene_exp_normal = gene_exp_normal.T
gene_exp_tumor.columns = gene_symbols
gene_exp_normal.columns = gene_symbols

# Add column for patients barcode
gene_exp_tumor['patient_barcode'] = gene_exp_tumor.index
gene_exp_normal['patient_barcode'] = gene_exp_normal.index

print "gene_symbols", len(gene_symbols)
print "gene_exp_tumor_patients", len(gene_exp_tumor_patient_barcodes)
print "gene_exp_normal_patients", len(gene_exp_normal_patient_barcodes)

print gene_exp_tumor.shape
print gene_exp_normal.shape

515
59
gene_symbols 20502
gene_exp_tumor_patients 515
gene_exp_normal_patients 59
(515, 20503)
(59, 20503)


## Filter samples with matched Gene Expression data and miRNA data

In [5]:
# Merge normal and tumor miRNA expression profiles with clinical cancer stage data
merged_normal_patients = pandas.merge(gene_exp_normal[['patient_barcode']], mirna_normal, on='patient_barcode')[['patient_barcode', 'pathologic_stage']]
merged_tumor_patients = pandas.merge(gene_exp_tumor[['patient_barcode']], mirna_tumor, on='patient_barcode')[['patient_barcode', 'pathologic_stage']]

# Print quick overview of data
print merged_tumor_patients['pathologic_stage'].value_counts().sort_index(axis=0)
print merged_normal_patients['pathologic_stage'].value_counts().sort_index(axis=0)

# Filter samples
mirna_tumor = mirna_tumor[mirna_tumor['patient_barcode'].isin(merged_tumor_patients['patient_barcode'])]
gene_exp_tumor = gene_exp_tumor[gene_exp_tumor['patient_barcode'].isin(merged_tumor_patients['patient_barcode'])]
mirna_normal = mirna_normal[mirna_normal['patient_barcode'].isin(merged_normal_patients['patient_barcode'])]
gene_exp_normal = gene_exp_normal[gene_exp_normal['patient_barcode'].isin(merged_normal_patients['patient_barcode'])]

# Drop categorical columns
mirna_tumor.drop(['patient_barcode', 'pathologic_stage'], 1, inplace=True)
mirna_normal.drop(['patient_barcode', 'pathologic_stage'], 1, inplace=True)
gene_exp_tumor.drop(['patient_barcode'], 1, inplace=True)
gene_exp_normal.drop(['patient_barcode'], 1, inplace=True)

print "mirna_tumor", mirna_tumor.shape, mirna_tumor.isnull().sum().sum()
print "gene_exp_tumor", gene_exp_tumor.shape, gene_exp_tumor.isnull().sum().sum()
print "mirna_normal", mirna_normal.shape, mirna_normal.isnull().sum().sum()
print "gene_exp_normal", gene_exp_normal.shape, gene_exp_normal.isnull().sum().sum()

Stage I      275
Stage II     120
Stage III     84
Stage IV      24
Name: pathologic_stage, dtype: int64
normal    20
Name: pathologic_stage, dtype: int64
mirna_tumor (510, 1881) 0
gene_exp_tumor (510, 20502)

 0
mirna_normal (20, 1881) 0
gene_exp_normal (20, 20502) 0


## Build miRNA-target relationship network to identify dysregulated miRNA-target interactions
## Following Xu et al. (xu2011prioritizing)

In [6]:
from src.models.miRNA_target_network import miRNATargetNetwork
import networkx as nx

# Fit the miRNA-target network: group A is the tumor data, group B the normal data.
# NOTE(review): threshold=0.8 is presumably the cut-off a miRNA-target score
# must reach to be kept -- confirm against miRNATargetNetwork.
network = miRNATargetNetwork(threshold=0.8)
network.train(
    miRNAs_A=mirna_tumor,
    targets_A=gene_exp_tumor,
    miRNAs_B=mirna_normal,
    targets_B=gene_exp_normal,
)
# print nx.bipartite.sets(network.B)

n_A 510
n_B 20


  ((n_B - 1) * miRNA_B_m_std * np.std(targets_B[t]))
  if np.abs(dys) >= self.threshold:


  ((n_A - 1) * miRNA_A_m_std * np.std(targets_A[t]))


hsa-let-7a-1 : 4


hsa-let-7a-2 : 6


hsa-let-7a-3 : 3


hsa-let-7b : 5


hsa-let-7c : 7


hsa-let-7d : 2


hsa-let-7e : 2


hsa-let-7f-1 : 1


hsa-let-7f-2 : 1


hsa-let-7g : 4


hsa-let-7i : 10


hsa-mir-100 : 2


hsa-mir-101-1 : 1


hsa-mir-101-2 : 1


hsa-mir-103a-1 : 10


hsa-mir-103a-2 : 9


hsa-mir-103b-1 : 1


hsa-mir-103b-2 : 1


hsa-mir-105-1 : 106


hsa-mir-105-2 : 187


hsa-mir-106a : 121


hsa-mir-106b : 1


hsa-mir-107 : 44


hsa-mir-10a : 2


hsa-mir-10b : 36


hsa-mir-1-1 : 1


hsa-mir-1178 : 1


hsa-mir-1179 : 213


hsa-mir-1180 : 2


hsa-mir-1181 : 61


hsa-mir-1182 : 1


hsa-mir-1183 : 1


hsa-mir-1184-1 : 1


hsa-mir-1184-2 : 1


hsa-mir-1184-3 : 1


hsa-mir-1185-1 : 16


hsa-mir-1185-2 : 183


hsa-mir-1193 : 1


hsa-mir-1197 : 50


hsa-mir-1199 : 71


hsa-mir-1-2 : 1


hsa-mir-1200 : 1


hsa-mir-1202 : 1


hsa-mir-1203 : 1


hsa-mir-1204 : 1


hsa-mir-1205 : 1


hsa-mir-1206 : 1


hsa-mir-1207 : 1


hsa-mir-1208 : 1


hsa-mir-122 : 18


hsa-mir-1224 : 146


hsa-mir-1225 : 34


hsa-mir-1226 : 2


hsa-mir-1227 : 2


hsa-mir-1228 : 4


hsa-mir-1229 : 3


hsa-mir-1231 : 1


hsa-mir-1233-1 : 1


hsa-mir-1233-2 : 1


hsa-mir-1234 : 25


hsa-mir-1236 : 1


hsa-mir-1237 : 10


hsa-mir-1238 : 30


hsa-mir-124-1 : 85


hsa-mir-124-2 : 130


hsa-mir-1243 : 512


hsa-mir-124-3 : 55


hsa-mir-1244-1 : 1


hsa-mir-1244-2 : 1


hsa-mir-1244-3 : 1


hsa-mir-1244-4 : 1


hsa-mir-1245a : 4


hsa-mir-1245b : 49


hsa-mir-1246 : 54


hsa-mir-1247 : 1


hsa-mir-1248 : 6


hsa-mir-1249 : 4


hsa-mir-1250 : 71


hsa-mir-1251 : 39


hsa-mir-1252 : 1


hsa-mir-1253 : 1


hsa-mir-1254-1 : 6


hsa-mir-1254-2 : 8


hsa-mir-1255a : 15


hsa-mir-1255b-1 : 1


hsa-mir-1255b-2 : 1


hsa-mir-1256 : 12


hsa-mir-1257 : 1


hsa-mir-1258 : 5


hsa-mir-125a : 2


hsa-mir-125b-1 : 1


hsa-mir-125b-2 : 1


hsa-mir-126 : 34


hsa-mir-1260a : 476


hsa-mir-1260b : 46


hsa-mir-1261 : 1


hsa-mir-1262 : 10


KeyError: 'AGAP4'

## Loading miRecords experimentally validated miRNA-target interactions

In [7]:
# Load data frame from file (tab-separated)
miRecords_df = pandas.read_table(os.path.join(ROOT_DIR, 'data/external/miRecords_version4.tsv'), delimiter='\t')

# Select only homo sapiens miRNA-target pairs and keep the two name columns.
# .copy() makes the selection an independent frame, so the column assignments
# below do not write into a slice of the full table (SettingWithCopy).
is_human_pair = ((miRecords_df["miRNA_species"] == "Homo sapiens") &
                 (miRecords_df["Target gene_species_scientific"] == "Homo sapiens"))
miRecords_df = miRecords_df.loc[is_human_pair, ["miRNA_mature_ID", "Target gene_name"]].copy()

# Standardize miRNA and gene symbols: lower-case miRNA IDs without '*',
# upper-case gene names.
# NOTE(review): '*' is meant as a literal character here; whether pandas reads
# a one-character pattern literally depends on the pandas version -- confirm.
miRecords_df['miRNA_mature_ID'] = miRecords_df['miRNA_mature_ID'].str.lower().str.replace('*', '')
miRecords_df['Target gene_name'] = miRecords_df['Target gene_name'].str.upper()

# Filter miRNA-target pairs to only miRNA's included in miRNA expression data, same for gene targets.
# NOTE(review): gene_symbols is filled from the 'EntrezID' column of the gene
# expression table -- confirm it really holds gene names comparable to
# 'Target gene_name', otherwise this filter keeps nothing.
has_known_mirna = miRecords_df["miRNA_mature_ID"].isin(mirna_list)
has_known_target = miRecords_df["Target gene_name"].isin(gene_symbols)
miRecords_df = miRecords_df[has_known_mirna & has_known_target]

# miRecords_df