# Análise Exploratória de Dados do miRWalk

# Importação das Bibliotecas

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

# Constantes

In [2]:
# Root data folder: the 'data' directory that sits beside the current
# working directory (i.e. under the working directory's parent).
DATA_PATH = Path.cwd().parent.joinpath('data')

# Folder holding the intermediate (interim) data files
INTERIM_DATA_PATH = DATA_PATH.joinpath('interim')

# Carregamento de Dados

In [3]:
# Read the CSV file with the interactions mined from miRWalk.
# INTERIM_DATA_PATH is already a Path, so the joined path is passed to
# read_csv directly instead of being wrapped in Path() a second time.
df_interactions = pd.read_csv(
    filepath_or_buffer=INTERIM_DATA_PATH / 'miRWalk-interactions.csv'
)

# Visualização de uma Amostra de Dados

In [4]:
# Show the names of all columns in the loaded interactions frame
df_interactions.columns

Index(['mirnaid', 'refseqid', 'genesymbol', 'duplex', 'start', 'end',
       'bindingp', 'energy', 'seed', 'accessibility', 'au', 'phylopstem',
       'phylopflank', 'me', 'number_of_pairings', 'binding_region_length',
       'longest_consecutive_pairings', 'position', 'validated', 'TargetScan',
       'miRDB'],
      dtype='object')

In [5]:
# Raise the global pandas display limit to the frame's column count so no
# column is elided, then show the first rows as a sample of the data.
pd.set_option('display.max_columns', len(df_interactions.columns))
df_interactions.head()

Unnamed: 0,mirnaid,refseqid,genesymbol,duplex,start,end,bindingp,energy,seed,accessibility,au,phylopstem,phylopflank,me,number_of_pairings,binding_region_length,longest_consecutive_pairings,position,validated,TargetScan,miRDB
0,hsa-miR-495-3p,NM_001330416,ST8SIA2,AAACAAACATGGTGCACTTCTT#GTGCACCGTT#........((((...,3595,3605,0.846154,-15.3,0,0.029613,0.441,4.187171,5.161961,-19.42388,9,10,9,3UTR,,1,1
1,hsa-miR-495-3p,NM_001330420,LUC7L,AAACAAACATGGTGCACTTCTT#AGAAGTGTACAGAGTTGCTCCTG...,2462,2507,0.846154,-16.4,0,0.004972,0.529,4.36817,5.301501,-9.076038,18,45,10,3UTR,,0,0
2,hsa-miR-495-3p,NM_001330425,NAGK,AAACAAACATGGTGCACTTCTT#GTGACACTATGTGTTGTG#..((...,1699,1717,0.846154,-16.7,1,0.000437,0.5,5.845809,5.747344,-4.957001,15,18,8,3UTR,,0,0
3,hsa-miR-495-3p,NM_001330437,PTPN11,AAACAAACATGGTGCACTTCTT#AGGAGGTGCACCATA#..........,1495,1510,0.846154,-22.0,0,0.001344,0.368,5.191274,4.818038,-19.42388,14,15,14,CDS,,0,0
4,hsa-miR-495-3p,NM_001330438,DDX25,AAACAAACATGGTGCACTTCTT#GAGAAGTGTATCCAGATACTTTG...,3709,3735,0.884615,-18.7,1,0.000172,0.603,-0.342248,0.030155,-7.163349,19,26,10,3UTR,,0,0


# Preparação dos Dados

In [6]:
# Columns of interest for the exploratory analysis
columns = [
    'mirnaid',     # microRNA id
    'refseqid',    # messenger RNA id
    'genesymbol',  # gene id
    # TarPmiR value; the original note calls it a p-value.
    # NOTE(review): values shown range 0.80-1.00 and are later filtered with
    # `>= 0.8`, which looks more like a binding probability -- confirm.
    'bindingp',
    'validated',   # miRTarBase id
    'TargetScan',  # interaction predicted by TargetScanHuman
    'miRDB'        # interaction predicted by miRDB
]

# Keep only the columns of interest (as an independent frame) and append a
# 0/1 'miRTarBase' flag: 1 when 'validated' holds a value, 0 when it is NaN.
df_eda = df_interactions.loc[:, columns].assign(
    miRTarBase=lambda frame: frame['validated'].notna().astype(int)
)

In [7]:
# Display the DataFrame prepared for the exploratory analysis
df_eda

Unnamed: 0,mirnaid,refseqid,genesymbol,bindingp,validated,TargetScan,miRDB,miRTarBase
0,hsa-miR-495-3p,NM_001330416,ST8SIA2,0.846154,,1,1,0
1,hsa-miR-495-3p,NM_001330420,LUC7L,0.846154,,0,0,0
2,hsa-miR-495-3p,NM_001330425,NAGK,0.846154,,0,0,0
3,hsa-miR-495-3p,NM_001330437,PTPN11,0.846154,,0,0,0
4,hsa-miR-495-3p,NM_001330438,DDX25,0.884615,,0,0,0
...,...,...,...,...,...,...,...,...
1449871,hsa-miR-496,NM_001042364,PTPN20,1.000000,,0,0,0
1449872,hsa-miR-496,NM_001042440,CAST,1.000000,,0,0,0
1449873,hsa-miR-496,NM_001042442,CAST,1.000000,,0,0,0
1449874,hsa-miR-496,NM_001042443,CAST,1.000000,,0,0,0


# Análise Exploratória

## MicroRNAs

In [8]:
# Total number of distinct microRNAs (unique 'mirnaid' values)
df_eda['mirnaid'].nunique()

44

In [9]:
# Interactions per microRNA: count the rows for each 'mirnaid' and
# summarise the distribution of those counts
df_eda['mirnaid'].value_counts().describe()

count       44.000000
mean     32951.727273
std      23397.920592
min       1285.000000
25%      10658.250000
50%      30306.500000
75%      56759.250000
max      73687.000000
Name: mirnaid, dtype: float64

## RNAs Mensageiros

In [10]:
# Total number of distinct messenger RNAs (unique 'refseqid' values)
df_eda['refseqid'].nunique()

47560

In [11]:
# Interactions per messenger RNA: count the rows for each 'refseqid' and
# summarise the distribution of those counts
df_eda['refseqid'].value_counts().describe()

count    47560.000000
mean        30.485198
std         11.995171
min          1.000000
25%         22.000000
50%         29.000000
75%         38.000000
max        103.000000
Name: refseqid, dtype: float64

## Genes

In [12]:
# Total number of distinct genes (unique 'genesymbol' values)
df_eda['genesymbol'].nunique()

19341

In [13]:
# Interactions per gene: count the rows for each 'genesymbol' and
# summarise the distribution of those counts
df_eda['genesymbol'].value_counts().describe()

count    19341.000000
mean        74.963859
std         82.356375
min          1.000000
25%         28.000000
50%         49.000000
75%         94.000000
max       1905.000000
Name: genesymbol, dtype: float64

## Interações

In [14]:
# Total number of interactions, i.e. the number of rows in the prepared frame
len(df_eda)

1449876

In [15]:
# Interactions per microRNA / messenger RNA pair: count the rows sharing each
# ('mirnaid', 'refseqid') combination and summarise those counts
df_eda[['mirnaid', 'refseqid']].value_counts().describe()

count    966523.000000
mean          1.500095
std           0.707323
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           5.000000
dtype: float64

In [16]:
# Summary statistics of 'bindingp' across all interactions
# (the notebook's column notes describe it as the TarPmiR p-value)
df_eda['bindingp'].describe()

count    1.449876e+06
mean     9.103747e-01
std      6.120402e-02
min      8.012821e-01
25%      8.461538e-01
50%      9.230769e-01
75%      9.615385e-01
max      1.000000e+00
Name: bindingp, dtype: float64

In [17]:
# Interactions per characterization (validated and/or predicted): count the
# rows for each combination of the miRTarBase, TargetScan and miRDB flags
df_eda[['miRTarBase', 'TargetScan', 'miRDB']].value_counts()

miRTarBase  TargetScan  miRDB
0           0           0        1356245
                        1          50829
            1           0          13898
                        1          12468
1           0           0          12327
                        1           2620
            1           1            873
                        0            616
dtype: int64

In [18]:
# Total number of interactions of interest: rows with bindingp of at least
# 0.8 that are flagged by at least one of miRTarBase, TargetScan or miRDB.
len(
    df_eda.query(
        '(bindingp >= 0.8) and '
        '(miRTarBase == 1 or TargetScan == 1 or miRDB == 1)'
    )
)

93631