## Interpretation of logistic regression (LR) and random forest (RF) models trained on the UniRef90 dataset for multiclass classification

In [3]:
import sys
import os
# Expose the three parent directories of the working directory on sys.path so
# that project packages (e.g. `utils`, imported in the next cell) resolve.
# Each entry is added only if it is not already present, so re-running this
# cell no longer piles up duplicate sys.path entries.
for _relative_parts in (("..", "..", ".."), ("..", ".."), ("..",)):
    _parent_dir = os.path.join(os.getcwd(), *_relative_parts)
    if _parent_dir not in sys.path:
        sys.path.append(_parent_dir)
sys.path

['/home/grads/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/interpretation',
 '/home/grads/blessyantony/anaconda3/envs/zoonosis/lib/python310.zip',
 '/home/grads/blessyantony/anaconda3/envs/zoonosis/lib/python3.10',
 '/home/grads/blessyantony/anaconda3/envs/zoonosis/lib/python3.10/lib-dynload',
 '',
 '/home/grads/blessyantony/anaconda3/envs/zoonosis/lib/python3.10/site-packages',
 '/home/grads/blessyantony/anaconda3/envs/zoonosis/lib/python3.10/site-packages/PyQt5_sip-12.11.0-py3.10-linux-x86_64.egg',
 '/home/grads/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/interpretation/../../..',
 '/home/grads/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/interpretation/../..',
 '/home/grads/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/interpretation/..',
 '/home/grads/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/interpretation/../../..',
 '/home/grads/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/interpretation/../..',
 '/home/grads/blessyantony/dev/git/zoon

In [4]:
from utils import kmer_utils, utils
import pandas as pd
import numpy as np
import joblib

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE


In [22]:
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Location of the input CSV file(s) handed to utils.read_dataset.
input_dir = "/home/grads/blessyantony/dev/git/zoonosis/input/data/uniref90/"
input_file_names = ["uniref90_final_msl1114.csv"]

# Display name -> raw host names that are grouped under it. Passed to
# utils.transform_labels through `label_settings`.
# NOTE(review): "*" presumably acts as a catch-all for every other host;
# confirm against utils.transform_labels.
label_groupings = {
    "Human": ["Homo sapiens"],
    "Desert warthog": ["Phacochoerus aethiopicus"],
    "Lesser bandicoot rat": ["Bandicota bengalensis"],
    "Horse": ["Equus caballus"],
    "Goat": ["Capra hircus"],
    "Red junglefowl": ["Gallus gallus"],
    "Wood mouse": ["Apodemus sylvaticus"],
    "Cattle": ["Bos taurus"],
    "Others": ["*"],
}

# Raw host names, listed in the same order as `label_groupings`.
host_classes = [
    "Homo sapiens",
    "Phacochoerus aethiopicus",
    "Bandicota bengalensis",
    "Equus caballus",
    "Capra hircus",
    "Gallus gallus",
    "Apodemus sylvaticus",
    "Bos taurus",
    "Others",
]

# Name of the column that tags each row as "train" or "test".
split_col = "split"

# k-mer length used when computing k-mer features.
k = 3

# Columns holding the sequence and its identifier.
sequence_settings = {
    "sequence_col": "seq",
    "id_col": "uniref90_id",
}

classification_settings = {
    "train_proportion": 0.8,
    "batch_size": 8,
    "max_sequence_length": 1115,
    "pad_sequence_val": 0,
    "truncate": True,
}

label_settings = {
    "label_col": "virus_host_name",
    "exclude_labels": ["nan"],
    "label_groupings": label_groupings,
}

# Hyper-parameter settings for the two baseline models.
# NOTE: the names `lr_model` and `rf_model` are rebound to the fitted
# estimators in the "Load models" cell further down.
lr_model = {
    "C": [0.01, 0.1, 1],
    "multiclass_type": "multinomial",
}

rf_model = {
    "n_estimators": [10, 100, 1000],
    "max_depth": [3, 5],
}

### Load the datasets

In [23]:
def load_dataset(input_dir, input_file_names, seed, train_proportion, kmer_keys=None):
    """Read the sequence dataset, split it into train/test and compute k-mer features.

    Column names are taken from the notebook-level ``sequence_settings`` and
    ``label_settings``; the k-mer length comes from the notebook-level ``k``.

    Parameters
    ----------
    input_dir : str
        Directory containing the input files.
    input_file_names : list of str
        File names (inside ``input_dir``) to read.
    seed : int
        Seed forwarded to ``utils.split_dataset``.
    train_proportion : float
        Proportion forwarded to ``utils.split_dataset``.
    kmer_keys : sequence of str, optional
        Forwarded to ``kmer_utils.compute_kmer_features``.

    Returns
    -------
    tuple
        ``(index_label_map, kmer_df)``: the label map returned by
        ``utils.transform_labels`` and the k-mer feature frame with the
        train/test tag joined on under the ``split_col`` column.
    """
    id_col = sequence_settings["id_col"]
    label_col = label_settings["label_col"]
    sequence_col = sequence_settings["sequence_col"]

    df = utils.read_dataset(input_dir, input_file_names,
                            cols=[id_col, sequence_col, label_col])
    df, index_label_map = utils.transform_labels(df, label_settings, classification_type="multi")

    # Use the caller-supplied proportion; it was previously ignored in favour
    # of classification_settings["train_proportion"].
    train_df, test_df = utils.split_dataset(df, seed, train_proportion, stratify_col=label_col)

    # .assign returns new frames, so the split outputs are never mutated in
    # place (avoids SettingWithCopyWarning if they are views of `df`).
    df = pd.concat([
        train_df.assign(**{split_col: "train"}),
        test_df.assign(**{split_col: "test"}),
    ])
    print(f"Loaded dataset size = {df.shape}")

    # NOTE(review): the captured output of this notebook shows
    # "compute_kmer_features() takes 5 positional arguments but 6 were given",
    # i.e. the installed kmer_utils does not accept `kmer_keys`. That has to be
    # reconciled in kmer_utils; the call is left as written.
    kmer_df = kmer_utils.compute_kmer_features(df, k, id_col, sequence_col, label_col, kmer_keys)
    print(f"kmer_df size = {kmer_df.shape}")

    # DataFrame.join(on=...) matches kmer_df[id_col] against the *index* of the
    # other object, so the split tags must be indexed by id first.
    # Assumes ids are unique; duplicated ids would multiply rows - TODO confirm.
    split_by_id = df.set_index(id_col)[split_col]
    kmer_df = kmer_df.join(split_by_id, on=id_col, how="left")
    print(f"kmer_df size after join with split on id = {kmer_df.shape}")
    return index_label_map, kmer_df

### Load models

In [24]:
# Restore the fitted logistic-regression and random-forest baselines from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts you trust.
print("Loading models")

lr_model_path = (
    "/home/grads/blessyantony/dev/git/zoonosis/output/raw/uniref90/20230705/"
    "host_multi_msl1114_baseline/"
    "kmer_k3_virus_host_name_multi_kmer_k3-loss_ce_itr4_lr_model.joblib"
)
lr_model = joblib.load(lr_model_path)
print(lr_model)

rf_model_path = (
    "/home/grads/blessyantony/dev/git/zoonosis/output/raw/uniref90/20230705/"
    "host_multi_msl1114_baseline/"
    "kmer_k3_virus_host_name_multi_kmer_k3-loss_ce_itr4_rf_model.joblib"
)
rf_model = joblib.load(rf_model_path)
print(rf_model)

Loading models
LogisticRegression(C=1, class_weight='balanced', max_iter=5000,
                   multi_class='multinomial', n_jobs=-1, penalty='l1',
                   solver='saga')
RandomForestClassifier(class_weight='balanced', max_depth=5, n_estimators=1000)


In [25]:
# Inspect the loaded LR model: coefficient matrix shape, the feature names it
# was fitted on, and its parameters.
for _summary_item in (
    lr_model.coef_.shape,
    lr_model.feature_names_in_,
    lr_model.get_params(),
):
    print(_summary_item)

(9, 500)
['PIP' 'FII' 'NIL' 'GIL' 'GPS' 'YNI' 'CLW' 'FFP' 'AAP' 'IPS' 'LLA' 'YSL'
 'WAF' 'GNC' 'GAK' 'RRF' 'GTV' 'SPF' 'AVT' 'HLP' 'FFL' 'PIF' 'PHC' 'MLL'
 'LVV' 'WPK' 'CRT' 'PSC' 'VIW' 'LSP' 'LQS' 'MDD' 'GFR' 'LDS' 'GSS' 'SCC'
 'PLL' 'KNP' 'DGN' 'KTC' 'WMM' 'FPS' 'GGV' 'IPL' 'PKF' 'IPQ' 'ESL' 'RRA'
 'PFL' 'DPA' 'TFG' 'PAG' 'SWA' 'PPP' 'PMG' 'ALY' 'PPL' 'HCL' 'ILT' 'VLL'
 'FYH' 'LFT' 'VFL' 'ICP' 'QFS' 'LSL' 'IWM' 'VNR' 'SMF' 'ASA' 'YKT' 'PAR'
 'FCL' 'QGT' 'AFP' 'SRY' 'LLT' 'FTA' 'VPF' 'FGR' 'QSL' 'LVG' 'LLD' 'PHN'
 'YLW' 'PLH' 'TAS' 'VVR' 'TPP' 'QNL' 'VTN' 'TVP' 'AKS' 'PAP' 'TVW' 'ARV'
 'ILG' 'RVT' 'AQG' 'CSR' 'IFL' 'HDS' 'PTS' 'RYV' 'GML' 'PAS' 'TSL' 'DFS'
 'LTT' 'FMG' 'TSN' 'LGW' 'RKI' 'RFS' 'SSN' 'GVG' 'VLG' 'GIH' 'LGQ' 'AFG'
 'SGF' 'LTN' 'LGL' 'PNL' 'WWT' 'GST' 'MLP' 'TKR' 'LRR' 'FAV' 'NFL' 'SLN'
 'CLR' 'WAS' 'IHL' 'CTK' 'DSC' 'LGG' 'GTS' 'HSP' 'SLT' 'AAM' 'PNK' 'LSW'
 'GQN' 'SSW' 'NSR' 'ARF' 'PID' 'LGI' 'SDG' 'AQF' 'VVD' 'NRP' 'TST' 'RLS'
 'SGL' 'ICS' 'VAR' 'FTS' 'ILL' 'LSS' 'PII'

### Training-based interpretation
#### Encoding visualization - all viruses, all hosts

In [26]:
def compute_dataset_representations(baseline_model, df, label_col, split_column=None):
    """Weight each training row's features by the coefficients of its own class.

    For every row tagged "train", the feature values are multiplied
    element-wise by the row of ``baseline_model.coef_`` that belongs to the
    row's label.

    Parameters
    ----------
    baseline_model : fitted estimator
        Must expose ``coef_``, ``classes_`` and ``feature_names_in_``
        (e.g. the loaded logistic-regression model).
    df : pandas.DataFrame
        Feature frame holding one column per name in ``feature_names_in_``,
        the label column and the split column.
    label_col : str
        Column of ``df`` holding the class label of each row; the values must
        occur in ``baseline_model.classes_``.
    split_column : str, optional
        Column tagging rows as "train"/"test". Defaults to the notebook-level
        ``split_col``.

    Returns
    -------
    pandas.DataFrame
        One row per training row (same index), one column per feature plus
        ``label_col``. Empty when ``df`` has no training rows.

    Raises
    ------
    ValueError
        If a training row carries a label the model was not fitted on.
    """
    if split_column is None:
        split_column = split_col

    train_df = df[df[split_column] == "train"]
    features = list(baseline_model.feature_names_in_)
    coef = np.atleast_2d(np.asarray(baseline_model.coef_))
    labels = train_df[label_col]

    if coef.shape[0] == 1:
        # Only a single coefficient row is available: use it for every sample.
        row_idx = np.zeros(len(train_df), dtype=int)
    else:
        # Rows of coef_ are ordered like classes_, so look each label up there
        # instead of assuming labels are already 0..n_classes-1.
        row_idx = pd.Index(baseline_model.classes_).get_indexer(labels)
        unknown_mask = row_idx < 0
        if unknown_mask.any():
            unknown = pd.unique(labels[unknown_mask]).tolist()
            raise ValueError(f"Labels not present in model classes: {unknown}")

    # Vectorised: one (n_rows, n_features) product instead of a per-row
    # multiply-and-concat loop.
    weighted = train_df[features].to_numpy(dtype=float) * coef[row_idx]
    embed_df = pd.DataFrame(weighted, columns=features, index=train_df.index)
    embed_df[label_col] = labels.to_numpy()

    print("Final embed_df shape = ")
    print(embed_df.shape)
    return embed_df


def visualize_dataset(rep_df, label_col="label", feature_cols=None, random_state=None):
    """Project the representations to 2-D with t-SNE.

    Parameters
    ----------
    rep_df : pandas.DataFrame
        Representation frame: feature columns plus a label column.
    label_col : str, default "label"
        Column of ``rep_df`` holding the labels.
    feature_cols : sequence, optional
        Columns to embed. Defaults to every column other than ``label_col``,
        so the frame is no longer required to have exactly the integer
        columns 0..511.
    random_state : int, optional
        Seed forwarded to ``TSNE`` to make the embedding reproducible.

    Returns
    -------
    tuple
        ``(tsne_model, X_emb)`` where ``X_emb`` holds the two embedding
        columns plus a "label" column.
    """
    if feature_cols is None:
        feature_cols = [col for col in rep_df.columns if col != label_col]
    print(rep_df.columns)

    X = rep_df[list(feature_cols)]
    tsne_model = TSNE(n_components=2, verbose=1, init="pca", learning_rate="auto",
                      random_state=random_state)
    # fit_transform already fits the model; a preceding .fit(X) would run the
    # whole optimisation a second time for nothing.
    X_emb = pd.DataFrame(tsne_model.fit_transform(X))
    print(X_emb.shape)

    # .values drops the original index so labels line up positionally with the
    # freshly indexed embedding frame.
    X_emb["label"] = rep_df[label_col].values
    return tsne_model, X_emb

In [27]:
# Build the k-mer dataset restricted to the k-mers the LR model was fitted on.
dataset = load_dataset(
    input_dir=input_dir,
    input_file_names=input_file_names,
    seed=79221635,
    train_proportion=0.8,
    kmer_keys=lr_model.feature_names_in_,
)

input file: /home/grads/blessyantony/dev/git/zoonosis/input/data/uniref90/uniref90_final_msl1114.csv, size = (18292, 3)
Size of input dataset = (18292, 3)
Grouping labels using config : {'Human': ['Homo sapiens'], 'Desert warthog': ['Phacochoerus aethiopicus'], 'Lesser bandicoot rat': ['Bandicota bengalensis'], 'Horse': ['Equus caballus'], 'Goat': ['Capra hircus'], 'Red junglefowl': ['Gallus gallus'], 'Wood mouse': ['Apodemus sylvaticus'], 'Cattle': ['Bos taurus'], 'Others': ['*']}
label_idx_map={'Cattle': 0, 'Desert warthog': 1, 'Goat': 2, 'Horse': 3, 'Human': 4, 'Lesser bandicoot rat': 5, 'Others': 6, 'Red junglefowl': 7, 'Wood mouse': 8}
idx_label_map={0: 'Cattle', 1: 'Desert warthog', 2: 'Goat', 3: 'Horse', 4: 'Human', 5: 'Lesser bandicoot rat', 6: 'Others', 7: 'Red junglefowl', 8: 'Wood mouse'}
[4 5 6 3 2 1 7 0 8]
Splitting dataset with seed=79221635, train_proportion=0.8, stratify_col=virus_host_name
Size of train_dataset = (14633, 3)
Size of test_dataset = (3659, 3)
Loaded datas

TypeError: compute_kmer_features() takes 5 positional arguments but 6 were given