# Virus × EVE classification

___Inference___

### Initialization

Imports

In [1]:
import pandas as pd 
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

Parameters

In [2]:
# Root directory of the project sources; the dataset paths used below are built from it.
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
path_root = '/home/small_rna_metavir/src'

## Build Model _(Random Forest Classifier)_

Import data

In [3]:
# Location of the virus × EVE dataset (tab-separated) used to train the classifier.
path_ds_eves = f'{path_root}/ds/virus-eves/ds_virus-eves.tab'

In [4]:
# Load the tab-separated training table, using its 'V1' column as the row index.
df_eves = pd.read_csv(path_ds_eves, sep='\t', index_col='V1')

### Separate: X _(features)_ × Y _(classes)_

In [127]:
# Name of the label column in the EVE dataset.
col_class_eve = 'class'
# Zero-based column positions (4 through 51) that hold the feature values.
x_range_eve = [*range(4, 52)]

In [128]:
# Labels come from the class column; features are selected by position.
y_eves = df_eves[col_class_eve]
X_eves = df_eves.iloc[:, x_range_eve]

# Rows are samples, columns are features.
n_samples, n_features = X_eves.shape

print(f"n_samples: {n_samples} × n_features: {n_features}")
print(f'y.shape ({col_class_eve}): {y_eves.shape}')
print(f'X.shape: {X_eves.shape}')


n_samples: 2315 × n_features: 48
y.shape (class): (2315,)
X.shape: (2315, 48)


In [60]:
# Keep the training feature names; they are compared with the dark-matter
# column names further down before classifying.
feat_eves = X_eves.columns
feat_eves

Index(['X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24',
       'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34',
       'X35', 'X.15', 'X.16', 'X.17', 'X.18', 'X.19', 'X.20', 'X.21', 'X.22',
       'X.23', 'X.24', 'X.25', 'X.26', 'X.27', 'X.28', 'X.29', 'X.30', 'X.31',
       'X.32', 'X.33', 'X.34', 'X.35', 'dens15to18', 'dens20to22',
       'dens25to29', 'ratiosi_pi', 'ratio_si', 'dens18to35'],
      dtype='object')

**Class names**

_Classes are moderately imbalanced (1321 viral vs. 994 EVE sequences), so the train × test split below is stratified by class to keep the same proportions in both subsets._

In [61]:
# Count the samples per class once and derive both helper lists from that
# single result, so that classes[i] always corresponds to class_counts[i].
# Series.unique() returns labels in order of first appearance whereas
# value_counts() sorts by frequency, so building the two lists from separate
# calls can leave them misaligned.
class_distribution = y_eves.value_counts()
classes = list(class_distribution.index)
class_counts = list(class_distribution)
class_distribution

viral    1321
eve       994
Name: class, dtype: int64

### Train

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

Split training × testing data

In [63]:
# Hold out 30% of the samples for testing. The split is seeded for
# reproducibility and stratified on the labels so both subsets keep the
# original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_eves,
    y_eves,
    test_size=0.3,
    random_state=13,
    stratify=y_eves,
)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}')

X_train.shape: (1620, 48), y_train.shape: (1620,)
X_test.shape: (695, 48), y_test.shape: (695,)


Train

In [12]:
# Random forest with 50 trees. The seed is fixed (same value as the
# train/test split) so that re-running the notebook rebuilds the same forest
# and therefore reproduces the accuracy and the predictions reported below;
# without it every run trains a different model.
classifier = RandomForestClassifier(n_estimators=50, random_state=13)
classifier.fit(X_train, y_train)

Check test classification accuracy

In [13]:
# Accuracy on the held-out set: fraction of test samples whose predicted
# class equals the true class.
y_hat = classifier.predict(X_test)
n_correct = (y_hat == y_test).sum()
accuracy = n_correct / len(y_test)
print(f'accuracy: {100 * accuracy}')

accuracy: 92.5179856115108


## Parse Dark Matter Data

Import Data

In [163]:
# Dark-matter dataset files to classify (currently a single entry).
path_ds_libs = [
    f'{path_root}/ds/virus-dark-matter/ds_virus-dark-matter-RKPM44.tab'
]

In [164]:
# Load the first listed dark-matter dataset (tab-separated).
path_ds_dark_matter = path_ds_libs[0]
df_dark_matter = pd.read_table(path_ds_dark_matter)

### EDA

In [31]:
# Preview the first rows to check the column layout (ID, label, then features).
df_dark_matter.head()

Unnamed: 0,Contigs_ID,Similarity_label,15,16,17,18,19,20,21,22,...,-33,-34,-35,dens15to18,dens20to22,dens25to29,ratiosi_pi,ratio_si,dens18to35,length
0,bN_RKPM44_Contig180_179,viral,-0.285242,-0.285242,-0.285242,-0.285242,-0.260587,0.027052,5.467541,0.290036,...,-0.285242,-0.285242,-0.285242,-16.60964,2.159199,-1.889817,4.049016,0.431799,2.380272,315
1,bN_RKPM44_Contig18_17,viral,-0.338402,-0.338402,-0.338402,-0.338402,-0.305252,-0.156079,4.761109,0.25829,...,-0.321827,-0.327352,-0.327352,-16.60964,2.051209,-0.388975,2.440184,0.008159,2.500344,512
2,bN_RKPM44_Contig1_0,viral,-0.402533,-0.402533,-0.402533,-0.402533,-0.37145,-0.196612,5.386557,0.55325,...,-0.379221,-0.336483,-0.355909,-16.60964,2.379971,0.741514,1.638457,0.602814,3.072798,570
3,bN_RKPM44_Contig231_230,viral,-0.449882,-0.449882,-0.449882,-0.449882,-0.420562,-0.039399,5.032997,0.854867,...,-0.449882,-0.449882,-0.449882,-16.60964,1.839535,-0.406625,2.246161,0.595283,2.521359,228
4,bN_RKPM44_Contig2333_2332,viral,-0.353041,-0.353041,-0.353041,-0.352393,-0.345198,-0.314995,0.096574,-0.209996,...,-0.349994,-0.351031,-0.351226,-7.527122,2.823717,6.633881,-3.810164,1.07033,7.003396,2029


In [32]:
# (number of rows, number of columns) of the dark-matter table.
df_dark_matter.shape

(71, 51)

In [33]:
# Per-column non-null counts and dtypes.
df_dark_matter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 51 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Contigs_ID        71 non-null     object 
 1   Similarity_label  71 non-null     object 
 2   15                71 non-null     float64
 3   16                71 non-null     float64
 4   17                71 non-null     float64
 5   18                71 non-null     float64
 6   19                71 non-null     float64
 7   20                71 non-null     float64
 8   21                71 non-null     float64
 9   22                71 non-null     float64
 10  23                71 non-null     float64
 11  24                71 non-null     float64
 12  25                71 non-null     float64
 13  26                71 non-null     float64
 14  27                71 non-null     float64
 15  28                71 non-null     float64
 16  29                71 non-null     float64
 17 

In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df_dark_matter.describe()

Unnamed: 0,15,16,17,18,19,20,21,22,23,24,...,-33,-34,-35,dens15to18,dens20to22,dens25to29,ratiosi_pi,ratio_si,dens18to35,length
count,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,...,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0
mean,-0.379685,-0.379685,-0.379685,-0.378783,-0.348706,-0.246149,1.484773,0.130181,-0.087895,0.17528,...,-0.34064,-0.359726,-0.357947,-14.958188,-4.155694,0.117687,-4.433725,-7.372549,1.568951,621.43662
std,0.127385,0.127385,0.127385,0.126785,0.167522,0.216397,2.426432,0.849369,0.47074,1.252228,...,0.159909,0.106565,0.133845,3.582246,6.58162,2.807269,6.438304,8.985951,2.662602,1036.038261
min,-0.883554,-0.883554,-0.883554,-0.882372,-0.816644,-0.59113,-0.59113,-0.59113,-0.59113,-0.477807,...,-0.665553,-0.666372,-0.743008,-16.60964,-16.60964,-8.005625,-16.60964,-16.60964,-4.420662,200.0
25%,-0.426108,-0.426108,-0.426108,-0.423563,-0.404896,-0.375114,-0.32517,-0.293821,-0.294702,-0.243343,...,-0.412855,-0.414675,-0.416422,-16.60964,-6.945441,-1.172023,-7.151563,-16.60964,-0.06057,227.5
50%,-0.355324,-0.355324,-0.355324,-0.355324,-0.342654,-0.287091,-0.229763,-0.202737,-0.207429,-0.164455,...,-0.343789,-0.345863,-0.349842,-16.60964,-5.121769,-0.215013,-4.807355,0.0,1.019808,260.0
75%,-0.286505,-0.286505,-0.286505,-0.286505,-0.280469,-0.180278,3.996927,0.244589,-0.030907,-0.035318,...,-0.280464,-0.28558,-0.28558,-16.60964,2.078502,0.813902,0.369576,0.590123,3.214124,440.0
max,-0.179646,-0.179646,-0.179646,-0.179646,0.613707,0.597666,6.745262,4.122626,2.376995,6.535886,...,0.530289,-0.152393,0.264124,-2.958421,6.426056,8.269759,6.158616,6.594947,8.895236,6811.0


Check for missing values

In [35]:
# Total number of missing cells in the whole data frame (0 means none).
# For a per-column breakdown use:
#   df_dark_matter.isna().sum().sort_values(ascending=False)
df_dark_matter.isna().sum().sum()

0

### Separate: X _(features)_ × Y _(classes)_ × IDs _(contig IDs)_

In [165]:
# Column holding the similarity-based label of each contig.
col_class_dark = 'Similarity_label'
# Column holding the contig identifier.
col_id_dark = 'Contigs_ID'
# Zero-based column positions (2 through 50) that hold the feature values.
x_range_dark = [*range(2, 51)]

In [166]:
# Split the table into its three parts: feature values (selected by
# position), similarity labels and contig identifiers.
X_dark_matter = df_dark_matter.iloc[:, x_range_dark]
y_dark_matter = df_dark_matter[col_class_dark]
ids = df_dark_matter[col_id_dark]

# Rows are contigs, columns are features.
n_samples, n_features = X_dark_matter.shape

print(f"n_samples: {n_samples} × n_features: {n_features}")
print(f"ID's -> ids.shape {ids.shape} ({ids.unique().shape[0]} are unique)")
print(f'y.shape ({col_class_dark}): {y_dark_matter.shape}')
print(f'X.shape: {X_dark_matter.shape}')

n_samples: 72 × n_features: 49
ID's -> ids.shape (72,) (72 are unique)
y.shape (Similarity_label): (72,)
X.shape: (72, 49)


In [167]:
# Feature names as they appear in the dark-matter table; they are matched
# against the training feature names further down.
feat_dark_matter = X_dark_matter.columns
feat_dark_matter

Index(['15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26',
       '27', '28', '29', '30', '31', '32', '33', '34', '35', '-15', '-16',
       '-17', '-18', '-19', '-20', '-21', '-22', '-23', '-24', '-25', '-26',
       '-27', '-28', '-29', '-30', '-31', '-32', '-33', '-34', '-35',
       'dens15to18', 'dens20to22', 'dens25to29', 'ratiosi_pi', 'ratio_si',
       'dens18to35', 'length'],
      dtype='object')

Check that all feature columns are numeric

_Yes, they are..._

In [168]:
# Distinct dtypes found among the feature columns.
{*X_dark_matter.dtypes}

{dtype('int64'), dtype('float64')}

**Class names**

_We can see that the 3 classes are relatively well balanced..._

In [169]:
# Count the contigs per label once and derive both helper lists from that
# single result, so that classes[i] always corresponds to class_counts[i].
# Series.unique() returns labels in order of first appearance whereas
# value_counts() sorts by frequency, so building the two lists from separate
# calls can leave them misaligned.
class_distribution = y_dark_matter.value_counts()
classes = list(class_distribution.index)
class_counts = list(class_distribution)
class_distribution

nohit       26
viral       25
nonviral    21
Name: Similarity_label, dtype: int64

### Transform data to classify

In [170]:
# Feature names the classifier was trained on (naming scheme: 'X<n>' / 'X.<n>').
feat_eves

Index(['X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24',
       'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34',
       'X35', 'X.15', 'X.16', 'X.17', 'X.18', 'X.19', 'X.20', 'X.21', 'X.22',
       'X.23', 'X.24', 'X.25', 'X.26', 'X.27', 'X.28', 'X.29', 'X.30', 'X.31',
       'X.32', 'X.33', 'X.34', 'X.35', 'dens15to18', 'dens20to22',
       'dens25to29', 'ratiosi_pi', 'ratio_si', 'dens18to35'],
      dtype='object')

In [171]:
# Feature names in the dark-matter table (naming scheme: '<n>' / '-<n>').
feat_dark_matter

Index(['15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26',
       '27', '28', '29', '30', '31', '32', '33', '34', '35', '-15', '-16',
       '-17', '-18', '-19', '-20', '-21', '-22', '-23', '-24', '-25', '-26',
       '-27', '-28', '-29', '-30', '-31', '-32', '-33', '-34', '-35',
       'dens15to18', 'dens20to22', 'dens25to29', 'ratiosi_pi', 'ratio_si',
       'dens18to35', 'length'],
      dtype='object')

Common features

In [172]:
# Collect the features that carry the same name in both datasets, keeping
# the order in which they appear among the training features.
feat_common = []
for feat in feat_eves:
    if feat in feat_dark_matter:
        feat_common.append(feat)
feat_common

['dens15to18',
 'dens20to22',
 'dens25to29',
 'ratiosi_pi',
 'ratio_si',
 'dens18to35']

Build new data frame

In [173]:

# Rebuild the dark-matter table with the column names used by the training
# set, so the fitted classifier can be applied to it.
size_range = range(15, 35 + 1)

# Map each training-style feature name to its source column in the
# dark-matter data: sense 'X<n>' <- '<n>', antisense 'X.<n>' <- '-<n>',
# and the shared features keep their own name.
feat_sources = {f'X{i}': f'{i}' for i in size_range}
feat_sources.update({f'X.{i}': f'-{i}' for i in size_range})
feat_sources.update({feat: feat for feat in feat_common})

# Feature names of the new frame, in training order.
feat_dark2 = list(feat_sources)

# Contig ID and similarity label first, followed by the renamed features.
df_dark2 = pd.DataFrame({
    col_id_dark: df_dark_matter[col_id_dark],
    col_class_dark: df_dark_matter[col_class_dark],
    **{new_name: X_dark_matter[old_name] for new_name, old_name in feat_sources.items()},
})



Check features

In [174]:
# Compare the rebuilt feature list with the training features position by
# position. Pairing the names with zip() keeps the count well-defined even
# if the two lists have different lengths (an element-wise `list == Index`
# comparison raises in that case).
n_match_feats = sum(dark == eve for dark, eve in zip(feat_dark2, feat_eves))

print(f'feat_eves: {len(feat_eves)}')
print(f'feat_dark2: {len(feat_dark2)}')
print(f'n_match_feats: {n_match_feats}')

# Stop here if the columns do not line up exactly: the classifier must be
# fed the same features, in the same order, as it was trained on.
assert list(feat_dark2) == list(feat_eves), (
    'dark-matter features do not match the training features'
)

feat_eves: 48
feat_dark2: 48
n_match_feats: 48


In [175]:
# Preview the renamed table: ID, label, then features named as in the training set.
df_dark2.head()

Unnamed: 0,Contigs_ID,Similarity_label,X15,X16,X17,X18,X19,X20,X21,X22,...,X.32,X.33,X.34,X.35,dens15to18,dens20to22,dens25to29,ratiosi_pi,ratio_si,dens18to35
0,bN_RKPM44_Contig11_10,viral,-0.45808,-0.45808,-0.45808,-0.451239,-0.423021,-0.303308,2.654454,0.147325,...,-0.440123,-0.443543,-0.434992,-0.41789,-7.790348,2.105795,2.058274,0.04752,0.549719,3.81756
1,bN_RKPM44_Contig12_11,viral,-0.338402,-0.338402,-0.338402,-0.338402,-0.305252,-0.156079,4.761109,0.25829,...,-0.321827,-0.321827,-0.327352,-0.327352,-16.60964,2.051209,-0.388975,2.440184,0.008159,2.500344
2,bN_RKPM44_Contig195_194,viral,-0.285242,-0.285242,-0.285242,-0.285242,-0.260587,0.027052,5.467541,0.290036,...,-0.285242,-0.285242,-0.285242,-0.285242,-16.60964,2.159199,-1.889817,4.049016,0.431799,2.380272
3,bN_RKPM44_Contig1_0,viral,-0.402533,-0.402533,-0.402533,-0.402533,-0.37145,-0.196612,5.386557,0.55325,...,-0.352024,-0.379221,-0.336483,-0.355909,-16.60964,2.379971,0.741514,1.638457,0.602814,3.072798
4,bN_RKPM44_Contig2332_2331,viral,-0.353041,-0.353041,-0.353041,-0.352393,-0.345198,-0.314995,0.096574,-0.209996,...,-0.351031,-0.349994,-0.351031,-0.351226,-7.527122,2.823717,6.633881,-3.810164,1.07033,7.003396


## Classify Dark Matter DB sequences

In [176]:
# Classify every dark-matter contig using the renamed feature columns, then
# store the prediction next to the similarity-based label.
y_hat = classifier.predict(df_dark2.loc[:, feat_dark2])
df_dark2[col_class_eve] = y_hat.copy()

In [177]:
# Report columns: similarity-based label, predicted class, contig ID
# (this is also the sort priority used when building the report).
cols_report = [col_class_dark, col_class_eve, col_id_dark]

In [178]:
# Keep only the report columns and order the rows by label, then predicted
# class, then contig ID.
df_report = (
    df_dark2
    .loc[:, cols_report]
    .sort_values(by=cols_report)
)

In [179]:
# For each similarity-based label, count how many contigs fall into each predicted class.
df_report.groupby(col_class_dark)[col_class_eve].value_counts()

Similarity_label  class
nohit             eve      26
nonviral          eve      21
viral             viral    22
                  eve       3
Name: class, dtype: int64

In [180]:
# Contigs labelled 'viral' by similarity but predicted as 'eve' by the classifier.
# The second filter is built from `foo` itself: a boolean mask taken from
# the full `df_report` does not share `foo`'s index, which makes pandas
# reindex the mask and emit a "Boolean Series key will be reindexed" warning.
foo = df_report[df_report[col_class_dark] == 'viral'].copy()
foo[foo[col_class_eve] == 'eve']

  foo[df_report[col_class_eve] == 'eve']


Unnamed: 0,Similarity_label,class,Contigs_ID
4,viral,eve,bN_RKPM44_Contig2332_2331
22,viral,eve,bX_RKPM44_Contig115_114
24,viral,eve,bX_RKPM44_Contig2402_2401
