Data preparation

In [None]:
import json
import numpy as np
import pandas as pd

# Seed NumPy's global RNG for reproducibility.
# NOTE(review): no cell visible in this notebook draws from np.random — confirm whether this seed affects any result.
np.random.seed(0)

In [None]:
# Both input files sit in the same folder (presumably a mounted Google Drive — adjust DATA_DIR for other setups)
DATA_DIR = 'gdrive/MyDrive/DSA4262'
DATA_JSON = f'{DATA_DIR}/data.json' # Path to data.json
DATA_INFO = f'{DATA_DIR}/data.info' # Path to data.info

In [None]:
# Flatten the JSON-lines file into one record per
# (transcript ID, transcript position, nucleotides) combination:
#   [transcript_id, transcript_pos, nucleotides, data]
# where `data` is whatever value the file stores under that nucleotides key.
transcript_lists = []

with open(DATA_JSON) as json_file:
  for line in json_file:
    # Each line is a standalone JSON object: {transcript_id: {position: {nucleotides: data}}}
    for transcript_id, positions in json.loads(line).items():
      for transcript_pos, by_nucleotides in positions.items():
        transcript_lists.extend(
            [transcript_id, transcript_pos, nucleotides, data]
            for nucleotides, data in by_nucleotides.items()
        )

In [None]:
labels_df = pd.read_csv(DATA_INFO)

# Nested lookup table: transcript ID -> transcript position -> label
labels_dict = {}

for _, info_row in labels_df.iterrows():
  # setdefault creates the per-transcript dict the first time an ID is seen
  position_labels = labels_dict.setdefault(info_row['transcript_id'], {})
  position_labels[info_row['transcript_position']] = info_row['label']

In [None]:
# A bag is a unique combination of transcript ID and transcript position.
# bag_instance_idx[k] = [start, end) row range of bag k inside instance_lists
# (end is exclusive, so the pair can be used directly as a slice).
bag_instance_idx = []
bag_label = []      # One label per bag, in the same order as bag_instance_idx
instance_lists = [] # One row per instance: bag keys + instance values + bag label
instance_idx = 0    # Running count of instances emitted so far

for transcript_id, transcript_pos, nucleotides, reads in transcript_lists:
  # Positions are JSON keys (strings) but labels_dict is keyed by integers
  label = labels_dict[transcript_id][int(transcript_pos)]
  bag_end = instance_idx + len(reads)

  bag_label.append(label)
  bag_instance_idx.append([instance_idx, bag_end])
  instance_idx = bag_end

  # Every instance inherits the label of the bag it belongs to
  instance_lists.extend(
      [transcript_id, transcript_pos, nucleotides] + read + [label]
      for read in reads
  )

In [None]:
# One row per instance: three bag-key columns, nine value columns named '0'..'8', then the label
feature_columns = [str(i) for i in range(9)]
complete_df = pd.DataFrame(
    instance_lists,
    columns=['transcript_id', 'transcript_position', 'nucleotides', *feature_columns, 'label'],
)
complete_df.head()

Model training, where the model is supposed to predict the probability that an **instance** belongs to label `1`

In [None]:
!pip install xgboost==1.6.2

In [None]:
import xgboost

In [None]:
# Columns 3..11 of complete_df are the nine per-instance value columns ('0'..'8');
# the last column is the label inherited from the instance's bag.
instance_features = complete_df.iloc[:, 3:12]
instance_labels = complete_df.iloc[:, -1]

xgb_model = xgboost.XGBClassifier(tree_method='gpu_hist') # We can remove `tree_method` parameter if the machine does not have GPU
xgb_model.fit(instance_features, instance_labels)

In [None]:
# Predicting class probabilities of each instance
# (one row per instance; scored on the same rows the model was fitted on)

instance_features = complete_df.iloc[:, 3:12]
xgb_class_proba = xgb_model.predict_proba(instance_features)
xgb_class_proba

Once we have the class probabilities of each instance, we can use the sum rule described below to predict the bag labels, i.e. whether the bag belongs to class `1` or `0`.

Sum rule:

$$p(+|X_i) = (1-n_i)p(+) + \sum_{j=1}^{n_i} p(+|x_{ij})$$
$$p(-|X_i) = (1-n_i)p(-) + \sum_{j=1}^{n_i} p(-|x_{ij})$$

where

$$n_i = Number\;of\;instances\;in\;Bag_i$$
$$X_i = Bag_i$$
$$x_{ij} = Instance_j\;of\;Bag_i$$

In [None]:
# sum_rule function works on an individual bag
# y_instance_proba is of the shape (N_i, 2)

from scipy.special import softmax

def sum_rule(y_instance_proba, pos_prior, neg_prior):
  """Combine the instance probabilities of one bag into bag-level scores.

  Applies the sum rule per class: (1 - n) * prior + sum of the instance
  probabilities, where n is the number of instances in the bag.

  Args:
    y_instance_proba: array indexed as [instance, class]; column 0 holds the
      negative-class probability and column 1 the positive-class probability.
    pos_prior: prior probability of the positive class.
    neg_prior: prior probability of the negative class.

  Returns:
    Array [negative, positive]: the two sum-rule scores divided by their
    total and then passed through a softmax.
  """
  # (1 - n) is the weight the sum rule puts on the class prior
  prior_weight = 1 - len(y_instance_proba)
  neg_score = prior_weight * neg_prior + y_instance_proba[:, 0].sum()
  pos_score = prior_weight * pos_prior + y_instance_proba[:, 1].sum()

  scores = np.array([neg_score, pos_score])
  return softmax(scores / (neg_score + pos_score))

In [None]:
# Predicting bag labels

# Class priors are the label frequencies over bags. np.unique returns the
# labels sorted, so the counts unpack in (negative, positive) order for 0/1 labels.
label_counts = np.unique(bag_label, return_counts=True)[1]
neg_prior, pos_prior = label_counts / len(bag_label)

predictions = []

for bag_start, bag_end in bag_instance_idx:
  # Slice out this bag's instance probabilities and aggregate them
  neg_score, pos_score = sum_rule(xgb_class_proba[bag_start:bag_end], pos_prior, neg_prior)
  # Predict 1 only when the positive score is strictly larger
  predictions.append(int(pos_score > neg_score))

Evaluating the predictions

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_curve, precision_recall_curve, auc
from scipy.stats import mode

def get_roc_auc(y_true, y_pred):
    """Return the area under the ROC curve of `y_pred` against `y_true`."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred)
    return auc(false_pos_rate, true_pos_rate)

def get_pr_auc(y_true, y_pred):
    """Return the area under the precision-recall curve, with label 1 as the positive class."""
    curve = precision_recall_curve(y_true, y_pred, pos_label=1)
    precision_values, recall_values = curve[0], curve[1]
    # Recall goes on the x-axis, precision on the y-axis
    return auc(recall_values, precision_values)

def get_accuracy(y_true, y_pred):
    """Return the *balanced* accuracy of `y_pred` against `y_true`.

    Despite the name, this is balanced_accuracy_score, not plain accuracy_score.
    """
    balanced = balanced_accuracy_score(y_true, y_pred)
    return balanced

In [None]:
# ROC and PR curves need a continuous score per bag. Feeding them the
# thresholded 0/1 `predictions` collapses the ROC curve to a single operating
# point, which is why the ROC AUC recorded from hard labels was identical to
# the balanced accuracy. Use the positive-class output of sum_rule as the score.
bag_scores = [
    sum_rule(xgb_class_proba[start:end], pos_prior, neg_prior)[1]
    for start, end in bag_instance_idx
]

# Values recorded earlier from hard 0/1 labels (ROC AUC 0.7754103959907857,
# PR AUC 0.4789541907111754) no longer apply — re-record after running.
print(f'ROC AUC: {get_roc_auc(bag_label, bag_scores)}')
print(f'PR AUC: {get_pr_auc(bag_label, bag_scores)}')
# Balanced accuracy is a thresholded metric, so it still uses the hard predictions
print(f'Accuracy: {get_accuracy(bag_label, predictions)}') # Accuracy: 0.7754103959907857