In [2]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
import time

In [3]:
# Positive examples: every known virus miRNA.
features_virus_mirnas = pd.read_csv(
    '/content/drive/MyDrive/features_virus_miRNAs.csv'
)

# Negative examples: a random set of hairpin-like sequences from the hsa genome.
features_unlabeled_hairpins = pd.read_csv(
    '/content/drive/MyDrive/features_unlabeled_hairpins.csv'
)

In [4]:
# The sequence identifier is not a feature; remove it from the positive set in place.
features_virus_mirnas.drop(columns=['sequence_names'], inplace=True)

In [5]:
# The sequence identifier is not a feature; remove it from the negative set in place.
features_unlabeled_hairpins.drop(columns=['sequence_names'], inplace=True)

In [6]:
# Label every unlabeled hairpin row as the negative class (0).
features_unlabeled_hairpins['class'] = 0

In [7]:
# Label every virus miRNA row as the positive class (1).
features_virus_mirnas['class'] = 1

In [8]:
# Frames to stack into one dataset: positives first, then negatives.
da = [
    features_virus_mirnas,        # class 1
    features_unlabeled_hairpins,  # class 0
]

In [9]:
# Stack the frames row-wise; each row keeps the index it had in its source frame.
df = pd.concat(da, axis=0)
df.head(5)

Unnamed: 0,nt_proportion0,nt_proportion1,nt_proportion2,nt_proportion3,dinucleotide_proportion0,dinucleotide_proportion1,dinucleotide_proportion2,dinucleotide_proportion3,dinucleotide_proportion4,dinucleotide_proportion5,...,efe,ensemble_frequency,diversity,mfe_efe_difference,dQ,dG,mfei1,mfei2,mfei4,class
0,0.292308,0.169231,0.2,0.338462,0.109375,0.015625,0.078125,0.09375,0.0625,0.03125,...,-27.28,0.457857,1.67,0.007385,0.066635,-0.412308,-1.116667,-0.137436,-1.072,1
1,0.276923,0.215385,0.230769,0.276923,0.125,0.078125,0.046875,0.03125,0.078125,0.0,...,-23.95,0.154283,5.77,0.017692,0.22873,-0.350769,-0.786207,-0.116923,-1.036364,1
2,0.242424,0.287879,0.227273,0.242424,0.061538,0.061538,0.061538,0.046154,0.061538,0.138462,...,-27.02,0.426696,2.62,0.007879,0.107199,-0.401515,-0.779412,-0.100379,-1.204545,1
3,0.2,0.257143,0.3,0.242857,0.028986,0.057971,0.057971,0.057971,0.057971,0.072464,...,-32.11,0.10108,6.03,0.020143,0.214866,-0.438571,-0.787179,-0.14619,-1.228,1
4,0.258065,0.193548,0.225806,0.322581,0.098361,0.016393,0.04918,0.081967,0.04918,0.04918,...,-26.28,0.639769,0.94,0.004516,0.040681,-0.419355,-1.0,-0.139785,-1.238095,1


In [10]:
# Replace every missing value in the combined dataset with 0 (in place).
df.fillna(0, inplace=True)

In [11]:
# Shuffle all rows with a fixed seed so that the negatives kept below are the
# same on every run (the original shuffle was unseeded, making the subset random).
df = df.sample(frac=1, random_state=42)

# Keep every positive example and an equally sized slice of the shuffled
# negatives, giving a 1:1 class balance without hard-coding the positive count.
positive_df = df.loc[df['class'] == 1]
negative_df = df.loc[df['class'] == 0].iloc[:len(positive_df)]

# NOTE: despite its name, this frame is class-balanced, not normally distributed;
# the name is kept so later cells that reference it keep working.
normal_distributed_df = pd.concat([positive_df, negative_df])

# Shuffle again (seeded) so the two classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

Unnamed: 0,nt_proportion0,nt_proportion1,nt_proportion2,nt_proportion3,dinucleotide_proportion0,dinucleotide_proportion1,dinucleotide_proportion2,dinucleotide_proportion3,dinucleotide_proportion4,dinucleotide_proportion5,...,efe,ensemble_frequency,diversity,mfe_efe_difference,dQ,dG,mfei1,mfei2,mfei4,class
944862,0.229,0.2,0.343,0.229,0.043,0.029,0.101,0.043,0.072,0.087,...,-24.8,0.103,5.56,0.02,0.285,-0.334,-0.616,-0.111,-1.3,0
458609,0.391,0.195,0.149,0.264,0.105,0.07,0.047,0.163,0.116,0.035,...,-17.19,0.124,8.13,0.015,0.26,-0.183,-0.53,-0.061,-0.662,0
250,0.25,0.196429,0.196429,0.357143,0.036364,0.072727,0.054545,0.090909,0.054545,0.036364,...,-19.67,0.243365,2.72,0.015536,0.131961,-0.335714,-0.854545,-0.111905,-0.895238,1
442867,0.296,0.197,0.07,0.437,0.1,0.086,0.014,0.086,0.086,0.014,...,-7.66,0.13,9.0,0.018,0.404,-0.09,-0.337,-0.03,-0.4,0
7,0.2,0.228571,0.285714,0.285714,0.014493,0.028986,0.057971,0.101449,0.057971,0.057971,...,-34.7,0.121166,3.7,0.018571,0.13255,-0.477143,-0.927778,-0.238571,-1.284615,1


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of the balanced sample.
new_df.describe()

Unnamed: 0,nt_proportion0,nt_proportion1,nt_proportion2,nt_proportion3,dinucleotide_proportion0,dinucleotide_proportion1,dinucleotide_proportion2,dinucleotide_proportion3,dinucleotide_proportion4,dinucleotide_proportion5,...,efe,ensemble_frequency,diversity,mfe_efe_difference,dQ,dG,mfei1,mfei2,mfei4,class
count,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,...,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0
mean,0.257757,0.215111,0.239015,0.288122,0.074153,0.048384,0.066007,0.068849,0.065594,0.053429,...,-27.86696,0.171482,9.269446,0.017003,0.311247,-0.331082,-0.723682,-0.121176,-0.994056,0.5
std,0.081483,0.070648,0.074735,0.080566,0.053489,0.024331,0.030406,0.042239,0.027731,0.041168,...,14.647302,0.161036,7.208106,0.007003,0.2255,0.148483,0.277358,0.091135,0.345199,0.50022
min,0.028986,0.031,0.022,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,...,-152.91,0.000553,0.39,0.001169,0.00631,-0.898462,-2.372727,-0.898462,-2.355882,0.0
25%,0.203,0.167,0.1875,0.234676,0.033333,0.03125,0.045542,0.036036,0.046243,0.024,...,-34.2775,0.051304,4.1125,0.012,0.150962,-0.431549,-0.876241,-0.151067,-1.237448,0.0
50%,0.253,0.209939,0.236055,0.286,0.064051,0.047,0.063225,0.064196,0.064103,0.044776,...,-25.275,0.12158,7.02,0.016854,0.242057,-0.335242,-0.690122,-0.100546,-1.014,0.5
75%,0.307692,0.25,0.286,0.335885,0.103448,0.064077,0.083333,0.094899,0.083833,0.072727,...,-18.5975,0.238916,12.5,0.022,0.41375,-0.209,-0.52125,-0.061,-0.71525,1.0
max,0.519,0.473684,0.507,0.598,0.333,0.163,0.194,0.284,0.169492,0.303571,...,-6.78,0.865094,48.02,0.04,1.221,-0.047,-0.145,0.0,-0.217,1.0


In [12]:
from google.colab import data_table

# Show the summary statistics as an interactive Colab table; max_columns is
# raised to 74 so that no column of the summary is hidden.
summary_stats = new_df.describe()
data_table.DataTable(summary_stats, max_columns=74)

Unnamed: 0,nt_proportion0,nt_proportion1,nt_proportion2,nt_proportion3,dinucleotide_proportion0,dinucleotide_proportion1,dinucleotide_proportion2,dinucleotide_proportion3,dinucleotide_proportion4,dinucleotide_proportion5,...,efe,ensemble_frequency,diversity,mfe_efe_difference,dQ,dG,mfei1,mfei2,mfei4,class
count,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,...,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0,1138.0
mean,0.257757,0.215111,0.239015,0.288122,0.074153,0.048384,0.066007,0.068849,0.065594,0.053429,...,-27.86696,0.171482,9.269446,0.017003,0.311247,-0.331082,-0.723682,-0.121176,-0.994056,0.5
std,0.081483,0.070648,0.074735,0.080566,0.053489,0.024331,0.030406,0.042239,0.027731,0.041168,...,14.647302,0.161036,7.208106,0.007003,0.2255,0.148483,0.277358,0.091135,0.345199,0.50022
min,0.028986,0.031,0.022,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,...,-152.91,0.000553,0.39,0.001169,0.00631,-0.898462,-2.372727,-0.898462,-2.355882,0.0
25%,0.203,0.167,0.1875,0.234676,0.033333,0.03125,0.045542,0.036036,0.046243,0.024,...,-34.2775,0.051304,4.1125,0.012,0.150962,-0.431549,-0.876241,-0.151067,-1.237448,0.0
50%,0.253,0.209939,0.236055,0.286,0.064051,0.047,0.063225,0.064196,0.064103,0.044776,...,-25.275,0.12158,7.02,0.016854,0.242057,-0.335242,-0.690122,-0.100546,-1.014,0.5
75%,0.307692,0.25,0.286,0.335885,0.103448,0.064077,0.083333,0.094899,0.083833,0.072727,...,-18.5975,0.238916,12.5,0.022,0.41375,-0.209,-0.52125,-0.061,-0.71525,1.0
max,0.519,0.473684,0.507,0.598,0.333,0.163,0.194,0.284,0.169492,0.303571,...,-6.78,0.865094,48.02,0.04,1.221,-0.047,-0.145,0.0,-0.217,1.0


In [14]:
# Print the index range, per-column non-null counts and dtypes of the balanced sample.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1138 entries, 944862 to 426886
Data columns (total 74 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0    nt_proportion0             1138 non-null   float64
 1    nt_proportion1             1138 non-null   float64
 2    nt_proportion2             1138 non-null   float64
 3    nt_proportion3             1138 non-null   float64
 4    dinucleotide_proportion0   1138 non-null   float64
 5    dinucleotide_proportion1   1138 non-null   float64
 6    dinucleotide_proportion2   1138 non-null   float64
 7    dinucleotide_proportion3   1138 non-null   float64
 8    dinucleotide_proportion4   1138 non-null   float64
 9    dinucleotide_proportion5   1138 non-null   float64
 10   dinucleotide_proportion6   1138 non-null   float64
 11   dinucleotide_proportion7   1138 non-null   float64
 12   dinucleotide_proportion8   1138 non-null   float64
 13   dinucleotide_proportion9 