Data preparation

In [None]:
import json
import numpy as np
import pandas as pd

# Seed NumPy's global random state (legacy np.random API) for repeatability.
np.random.seed(0)

In [None]:
# Input files. DATA_JSON is read line by line and each line is parsed with
# json.loads; DATA_INFO is read with pd.read_csv and supplies the gene_id,
# transcript_id, transcript_position and label columns.
DATA_JSON = 'gdrive/MyDrive/DSA4262/data.json' # Path to data.json
DATA_INFO = 'gdrive/MyDrive/DSA4262/data.info' # Path to data.info

In [None]:
# Run this cell to load the existing train_gene_id and test_gene_id.
# train_gene_id decides which bags go to the training split further down.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles you created yourself.

import pickle

def _load_pickle(path):
  """Return the object stored in the pickle file at `path`."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)

train_gene_id = _load_pickle('gdrive/MyDrive/DSA4262/gtr.pickle')
test_gene_id = _load_pickle('gdrive/MyDrive/DSA4262/gte.pickle')

In [None]:
# Flatten the nested JSON (one object per line, nested as
# transcript ID -> position -> nucleotides -> data) into flat rows of
# [transcript_id, transcript_pos, nucleotides, data].
transcript_lists = []

with open(DATA_JSON) as f:
  for line in f:
    for transcript_id, positions in json.loads(line).items():
      for transcript_pos, kmers in positions.items():
        transcript_lists.extend(
          [transcript_id, transcript_pos, nucleotides, data]
          for nucleotides, data in kmers.items()
        )

In [None]:
# Build a nested lookup: transcript_id -> transcript_position -> [gene_id, label].
labels_df = pd.read_csv(DATA_INFO)
transcript_id_pos_dict = {}

# Walk the four columns in lockstep instead of DataFrame.iterrows(), which
# constructs a Series for every row and is slow on a large label table.
for transcript_id, transcript_position, gene_id, label in zip(
    labels_df['transcript_id'],
    labels_df['transcript_position'],
    labels_df['gene_id'],
    labels_df['label'],
):
  # setdefault creates the per-transcript dict the first time an ID is seen.
  transcript_id_pos_dict.setdefault(transcript_id, {})[transcript_position] = [gene_id, label]

In [None]:
# A bag is a unique combination of transcript ID and transcript position; its
# instances are the individual feature lists stored in lst[3]. Bags whose gene is in
# train_gene_id go to the training split, every other bag goes to the test split.
train_bag_instance_idx, test_bag_instance_idx = [], [] # [start, end) instance-row range of each bag
train_bag_label, test_bag_label = [], [] # One label per bag
train_instance_lists, test_instance_lists = [], [] # One row per instance
train_instance_idx = 0
test_instance_idx = 0

for lst in transcript_lists:
  # Positions are strings in transcript_lists, hence the int() for the lookup.
  gene_id, bag_label = transcript_id_pos_dict[lst[0]][int(lst[1])]
  n_bag_instances = len(lst[3])
  # Every instance row carries the label of the bag it belongs to.
  instance_rows = [lst[:3] + data + [bag_label] for data in lst[3]]

  if gene_id in train_gene_id:
    train_bag_label.append(bag_label)
    train_bag_instance_idx.append([train_instance_idx, train_instance_idx + n_bag_instances])
    train_instance_idx += n_bag_instances
    train_instance_lists.extend(instance_rows)

  else:
    test_bag_label.append(bag_label)
    test_bag_instance_idx.append([test_instance_idx, test_instance_idx + n_bag_instances])
    test_instance_idx += n_bag_instances
    test_instance_lists.extend(instance_rows)

In [None]:
# One row per training instance: three identifier columns, the nine feature
# columns '0'-'8', then the label.
train_feature_columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8']
train_columns = ['transcript_id', 'transcript_position', 'nucleotides', *train_feature_columns, 'label']

train_df = pd.DataFrame(train_instance_lists, columns=train_columns)
train_df.head()

In [None]:
# One row per test instance: three identifier columns, the nine feature
# columns '0'-'8', then the label.
test_feature_columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8']
test_columns = ['transcript_id', 'transcript_position', 'nucleotides', *test_feature_columns, 'label']

test_df = pd.DataFrame(test_instance_lists, columns=test_columns)
test_df.head()

In [None]:
# Save features, bag labels and bag instance indices so that the prediction
# section at the end of the notebook can reload them from disk.

import pickle

# Columns 3..-1 are the feature columns: the three identifier columns and the
# trailing label column are left out.
test_features = test_df.iloc[:, 3:-1]
test_features.to_csv('gdrive/MyDrive/DSA4262/sum_rule_test_feature.csv', index=False)

with open('gdrive/MyDrive/DSA4262/test_bag_indices.pickle', 'wb') as indices_file:
    pickle.dump(test_bag_instance_idx, indices_file, protocol=pickle.HIGHEST_PROTOCOL)

with open('gdrive/MyDrive/DSA4262/test_bag_labels.pickle', 'wb') as labels_file:
    pickle.dump(test_bag_label, labels_file, protocol=pickle.HIGHEST_PROTOCOL)

Model training, where the model predicts the probability that an **instance** belongs to label `1`

In [None]:
!pip install xgboost==1.6.2

In [None]:
import xgboost

In [None]:
# Fit the instance-level classifier: columns 3..11 are the feature columns and the
# last column is the label attached to each instance.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU runtime — confirm.
train_features = train_df.iloc[:, 3:12]
train_labels = train_df.iloc[:, -1]

xgb_model = xgboost.XGBClassifier(tree_method='gpu_hist')
xgb_model.fit(train_features, train_labels)

In [None]:
# Save model for predicting instance probability
# (it is reloaded later in this notebook through XGBClassifier.load_model)

xgb_model.save_model('gdrive/MyDrive/DSA4262/sum_rule_xgb.model')

In [None]:
# Predicting class probabilities of each test instance.
# sum_rule later reads column 0 as p(-|instance) and column 1 as p(+|instance).

xgb_instance_proba = xgb_model.predict_proba(test_df.iloc[:, 3:12])
xgb_instance_proba

Once we have the class probabilities of each instance, we can use the sum rule described below to predict the bag labels, i.e. whether a bag belongs to class `1` or `0`.

Sum rule:

$$p(+|X_i) = (1-n_i)p(+) + \sum_{j=1}^{n_i} p(+|x_{ij})$$
$$p(-|X_i) = (1-n_i)p(-) + \sum_{j=1}^{n_i} p(-|x_{ij})$$

where

$$n_i = Number\;of\;instances\;in\;Bag_i$$
$$X_i = Bag_i$$
$$x_{ij} = Instance_j\;of\;Bag_i$$

In [None]:
# sum_rule function works on an individual bag
# y_instance_proba is of the shape (N_i, 2)

from scipy.special import softmax

def sum_rule(y_instance_proba, pos_prior, neg_prior):
  """Combine the instance probabilities of one bag with the sum rule.

  y_instance_proba: array of shape (n_instances, 2); column 0 holds p(-|x) and
    column 1 holds p(+|x) for each instance of the bag.
  pos_prior, neg_prior: class priors p(+) and p(-).

  Returns a length-2 array [negative, positive]: the two sum-rule scores are
  divided by their total and then passed through softmax.
  """
  prior_weight = 1 - len(y_instance_proba)
  neg_score = prior_weight * (neg_prior) + np.sum(y_instance_proba[:, 0])
  pos_score = prior_weight * (pos_prior) + np.sum(y_instance_proba[:, 1])
  scores = np.array([neg_score, pos_score])

  return softmax(scores / (neg_score + pos_score))

In [None]:
# Predicting bag labels

# Class priors are the class frequencies among the training bags. np.unique returns
# the classes in sorted order, so the two counts unpack as (negative, positive).
# NOTE(review): this assumes the training labels contain exactly two classes — confirm.
class_values, class_counts = np.unique(train_bag_label, return_counts=True)
neg_prior, pos_prior = class_counts / len(train_bag_label)

predictions = []
for start, end in test_bag_instance_idx:
  # Rows start:end of xgb_instance_proba are the instances of one bag.
  neg_proba, pos_proba = sum_rule(xgb_instance_proba[start:end], pos_prior, neg_prior)
  predictions.append(int(pos_proba > neg_proba))

Evaluating the predictions

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_curve, precision_recall_curve, auc
from scipy.stats import mode

def get_roc_auc(y_true, y_pred):
    """Return the area under the ROC curve of `y_pred` measured against `y_true`."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred)
    return auc(false_pos_rate, true_pos_rate)

def get_pr_auc(y_true, y_pred):
    """Return the area under the precision-recall curve, treating label 1 as positive."""
    precision_values, recall_values, _thresholds = precision_recall_curve(y_true, y_pred, pos_label=1)
    return auc(recall_values, precision_values)

def get_accuracy(y_true, y_pred):
    """Return the balanced accuracy of `y_pred` (despite the name, not the plain accuracy)."""
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    return balanced_accuracy

In [None]:
# ROC and PR curves need a continuous score per bag. Feeding them the hard 0/1
# predictions collapses each curve to a single operating point (the ROC AUC then
# simply equals the balanced accuracy), so the bags are scored with p(+|bag).
# Values recorded earlier from the hard labels, for reference:
#   ROC AUC: 0.7629566318353611, PR AUC: 0.47212478019287896
bag_scores = [
  sum_rule(xgb_instance_proba[start:end], pos_prior, neg_prior)[1]
  for start, end in test_bag_instance_idx
]

print(f'ROC AUC: {get_roc_auc(test_bag_label, bag_scores)}')
print(f'PR AUC: {get_pr_auc(test_bag_label, bag_scores)}')
# Balanced accuracy is a thresholded metric, so it still uses the hard labels.
print(f'Accuracy: {get_accuracy(test_bag_label, predictions)}') # Accuracy: 0.7629566318353611

Create a class for making predictions using the above model

In [None]:
class SumRule:
  """Bag-level classifier that applies the sum rule to an instance-level model.

  `model` must expose predict_proba(features) returning one row per instance with
  the negative-class probability in column 0 and the positive-class probability
  in column 1. `bag_indices` is an iterable of (start, end) pairs; rows start:end
  of the instance probabilities form one bag.

  NOTE(review): the default priors are presumably the class frequencies of the
  training bags the saved model was fitted on — confirm before reusing them with
  another model.
  """
  # Imported in the class body so the class is self-contained; methods reach the
  # modules through self.numpy / self.scipy. `import scipy.special` is required:
  # a bare `import scipy` does not guarantee that the `special` submodule is loaded.
  import numpy
  import scipy.special

  def __init__(self, model):
    self.model = model

  def sum_rule(self, y_instance_proba, pos_prior, neg_prior):
    """Return [negative, positive] probabilities for one bag of shape (n_instances, 2)."""
    n_instances = len(y_instance_proba)
    p_neg = (1 - n_instances) * (neg_prior) + self.numpy.sum(y_instance_proba[:, 0])
    p_pos = (1 - n_instances) * (pos_prior) + self.numpy.sum(y_instance_proba[:, 1])

    return self.scipy.special.softmax(self.numpy.array([p_neg, p_pos]) / (p_neg + p_pos))

  def _bag_probabilities(self, features, bag_indices, pos_prior, neg_prior):
    """Run the instance model once and return the sum-rule output of every bag."""
    instance_proba = self.model.predict_proba(features)
    return [
      self.sum_rule(instance_proba[start:end], pos_prior, neg_prior)
      for start, end in bag_indices
    ]

  def predict(self, features, bag_indices, pos_prior=0.04427960226597472, neg_prior=0.9557203977340253):
    """Return an array with one hard label (0 or 1) per bag."""
    bag_proba = self._bag_probabilities(features, bag_indices, pos_prior, neg_prior)
    return self.numpy.array([int(proba[1] > proba[0]) for proba in bag_proba])

  def predict_proba(self, features, bag_indices, pos_prior=0.04427960226597472, neg_prior=0.9557203977340253):
    """Return an array with the positive-class probability of each bag."""
    bag_proba = self._bag_probabilities(features, bag_indices, pos_prior, neg_prior)
    return self.numpy.array([proba[1] for proba in bag_proba])

Using the created class to make predictions

In [None]:
import pandas as pd
import pickle
from xgboost import XGBClassifier

# Reload the artefacts written earlier in this notebook: test features, bag row
# ranges, bag labels and the trained XGBoost model.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles you created yourself.
features = pd.read_csv('gdrive/MyDrive/DSA4262/sum_rule_test_feature.csv')

with open('gdrive/MyDrive/DSA4262/test_bag_indices.pickle', 'rb') as indices_file:
  bag_indices = pickle.load(indices_file)

with open('gdrive/MyDrive/DSA4262/test_bag_labels.pickle', 'rb') as labels_file:
  bag_labels = pickle.load(labels_file)

xgb_model = XGBClassifier()
xgb_model.load_model('gdrive/MyDrive/DSA4262/sum_rule_xgb.model')

In [None]:
# Wrap the reloaded XGBoost model; bag_indices tells SumRule which feature rows
# form each bag.
sum_rule_model = SumRule(xgb_model)

# predict gives one hard 0/1 label per bag, predict_proba the positive-class
# probability per bag. Each call runs the instance model over `features` again.
predictions = sum_rule_model.predict(features, bag_indices)
prediction_proba = sum_rule_model.predict_proba(features, bag_indices)