# Extra Validation Set

Load the required libraries


In [1]:
#| label: loading library

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Combining Data


In [2]:
#| label: loading dataset

# Input files are read from the `datas` directory beside the current working directory
data_path = Path.cwd().parent.joinpath('datas')

# Full annotation table, indexed by accession
df_all = pd.read_csv(data_path.joinpath('seqs_anno.csv')).set_index('Accession')

# Release metadata: parse the dates (unparseable values become NaT) and
# collect the accessions whose release year is 2022 or later
cols_to_read = ["Name", "Accession", "Release_Date"]
remove_data = pd.read_csv(data_path.joinpath('arbovirus_cleaned.csv'), usecols=cols_to_read)
remove_data["Release_Date"] = pd.to_datetime(remove_data["Release_Date"], errors='coerce')
released_2022_onward = remove_data["Release_Date"].dt.year >= 2022
accessions_after_2022 = remove_data.loc[released_2022_onward, "Accession"].tolist()

# Rows annotated as "vector" among those accessions; keep columns 0-32 plus column 36
is_recent_vector = df_all.index.isin(accessions_after_2022) & (df_all["anno"] == "vector")
df_1 = df_all[is_recent_vector].iloc[:, [*range(33), 36]]

# EIBI table, indexed by the `query` column; keep columns 6-38 (33 columns)
eibi_raw = pd.read_csv(data_path.joinpath('seqs_eibi.tsv'), sep='\t', index_col="query")
eibi_all = eibi_raw.iloc[:, [*range(6, 39)]]

# Stack both sources, keep the first row for every repeated index label,
# and replace missing values with 0
stacked = pd.concat([df_1, eibi_all], axis=0)
first_occurrence = ~stacked.index.duplicated(keep='first')
vali_df = stacked[first_occurrence].fillna(0)

vali_df.head()

Unnamed: 0,disable_organ,cytotoxicity,degrade_ecm,induce_inflammation,bacterial_counter_signaling,viral_counter_signaling,resist_complement,counter_immunoglobulin,plant_rna_silencing,resist_oxidative,...,secreted_effector,antibiotic_resistance,develop_in_host,nonviral_adhesion,secretion,toxin_synthase,viral_adhesion,virulence_regulator,size,homo_infected
NC_055408,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,893,0.0
NC_055410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4550,0.0
NC_055409,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6863,0.0
NC_055216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4478,0.0
NC_055217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11816,0.0


## Load model


In [None]:
#| label: loading-xgb

# Trained models are stored in the `models` directory beside the current working directory
model_path = Path.cwd().parent.joinpath('models')

# Start from an empty Booster and restore the saved state from XGB_cli.json
model = xgb.Booster()
model.load_model(model_path.joinpath("XGB_cli.json"))

## Prediction


In [None]:
#| label: prediction

# Score cut-off: rows whose model output exceeds it get Prediction = 1
PRED_THRESHOLD = 0.2
# Ground-truth label column carried in `vali_df` after the feature columns
LABEL_COL = "homo_infected"

# The first 33 columns of `vali_df` are passed to the model as features
vali_D = xgb.DMatrix(vali_df.iloc[:, : 33])

# Raw model output, one value per row of `vali_df`
vali_result = model.predict(vali_D)

# Binarise the scores; reuse `vali_result` instead of running inference a second time
vali_result_pred = (vali_result > PRED_THRESHOLD).astype(int)

vali_result_pred_df = pd.DataFrame(vali_result_pred, columns=['Prediction'], index=vali_df.index)

# Locate the label by name rather than by a hard-coded position: `vali_df` holds
# 33 feature columns followed by the label (positions 0-33), so position 34 is
# out of bounds for `.iloc` and raises IndexError.
# NOTE(review): column 0 is itself a feature column (`disable_organ` in the
# displayed frame) -- confirm it is really wanted in this summary table.
label_pos = vali_df.columns.get_loc(LABEL_COL)
result = pd.concat([vali_df.iloc[:, [0, label_pos]], vali_result_pred_df], axis=1)

result