Data preparation

In [1]:
import json
import pandas as pd
import numpy as np

np.random.seed(0)

In [None]:
DATA_JSON = 'gdrive/MyDrive/DSA4262/data.json' # Path to data.json
DATA_INFO = 'gdrive/MyDrive/DSA4262/data.info' # Path to data.info

In [None]:
labels_df = pd.read_csv(DATA_INFO)
labels_dict = {}

for _, row in labels_df.iterrows():
  labels_dict[row['transcript_id']] = labels_dict.get(row['transcript_id'], {})
  labels_dict[row['transcript_id']][row['transcript_position']] = row['label']

In [None]:
unique_gene_id = list(labels_df.gene_id.unique())

unique_gene_id

In [None]:
# Split gene ID for training and testing

n = len(unique_gene_id)
test_gene_id = unique_gene_id[:int(0.2 * len(unique_gene_id))]
train_gene_id = unique_gene_id[int(0.2 * len(unique_gene_id)):]

In [None]:
train_labels_df = labels_df[labels_df['gene_id'].isin(train_gene_id)]
test_labels_df = labels_df[labels_df['gene_id'].isin(test_gene_id)]

train0, train1 = train_labels_df["label"].value_counts()
test0, test1 = test_labels_df["label"].value_counts()

In [None]:
print(f'Train class ratio {int(train0/train1)}:1') # 21:1
print(f'Test class ratio {int(test0/test1)}:1') # 20:1

In [None]:
instance_lists = []

with open(DATA_JSON) as f:
  for transcript_json in f:
    transcript_dict = json.loads(transcript_json)
    for transcript_id, transcript_pos_dict in transcript_dict.items():
      for transcript_pos, nucleotides_dict in transcript_pos_dict.items():
        for nucleotides, data in nucleotides_dict.items():
          for row in data:
            instance_lists.append([transcript_id, transcript_pos, nucleotides] + row + [labels_dict[transcript_id][int(transcript_pos)]])

In [None]:
complete_df = pd.DataFrame(instance_lists, columns=['transcript_id', 'transcript_position', 'nucleotides', '0', '1', '2', '3', '4', '5', '6', '7', '8','label'])

complete_df.head()

In [None]:
complete_df_mean = complete_df.groupby(by=['transcript_id', 'transcript_position', 'nucleotides']).mean().reset_index()

complete_df_mean.head()

In [None]:
complete_df_min = complete_df.groupby(by=['transcript_id', 'transcript_position', 'nucleotides']).min().reset_index()
complete_df_min.columns = ['transcript_id', 'transcript_position', 'nucleotides', '9', '10', '11', '12', '13', '14', '15', '16', '17', 'label']

complete_df_min.head()

In [None]:
complete_df_max = complete_df.groupby(by=['transcript_id', 'transcript_position', 'nucleotides']).max().reset_index()
complete_df_max.columns = ['transcript_id', 'transcript_position', 'nucleotides', '18', '19', '20', '21', '22', '23', '24', '25', '26', 'label']

complete_df_max.head()

In [None]:
complete_df_all = complete_df_mean.drop(columns=['label']).merge(complete_df_min.drop(columns=['label'])).merge(complete_df_max.drop(columns=['label']))
complete_df_all['transcript_position'] = complete_df_all['transcript_position'].astype('int')

complete_df_all.head()

In [None]:
complete_df_all = complete_df_all.merge(labels_df, on=['transcript_id', 'transcript_position'])

complete_df_all.head()

In [None]:
label_counts = labels_df.label.value_counts()
label_ratio = label_counts[0]/label_counts[1]

label_ratio

In [None]:
# Save features and labels dataframes to csv

complete_df_all.iloc[:, 3:-2].to_csv('gdrive/MyDrive/DSA4262/xgb3_feature.csv', index=False)
complete_df_all.iloc[:, -1].to_csv('gdrive/MyDrive/DSA4262/xgb3_label.csv', index=False)

Model training and evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_curve, precision_recall_curve, auc
from scipy.stats import mode

In [None]:
def get_roc_auc(y_true, y_pred):
    fpr, tpr, _  = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    return roc_auc


def get_pr_auc(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred, pos_label=1)
    pr_auc = auc(recall, precision)
    return pr_auc


def get_accuracy(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)

In [None]:
!pip install xgboost==1.6.2

In [None]:
from xgboost import XGBClassifier
from math import ceil

In [None]:
# To make sure our model is robust against unseen data, we perform multiple iterations of model training and evaluation
# Each time, we split the data into training and evaluation set by gene ID

roc_auc = []
pr_auc = []
accuracy = []

for i in range(len(unique_gene_id)):
  print(f'Inspecting gene {i+1}/{len(unique_gene_id)}')
  gene_id = unique_gene_id[i]
  train_df = complete_df_all[complete_df_all['gene_id'] != gene_id].iloc[:, 3:-2]
  train_label = complete_df_all[complete_df_all['gene_id'] != gene_id].iloc[:, -1]
  eval_df = complete_df_all[complete_df_all['gene_id'] != gene_id].iloc[:, 3:-2]
  eval_label = complete_df_all[complete_df_all['gene_id'] != gene_id].iloc[:, -1]
  
  xgb_model = XGBClassifier(
      objective = 'binary:logistic',
      scale_pos_weight = ceil(label_ratio),
      max_delta_step = 1,
      seed = 0,
      tree_method = 'gpu_hist'
  )

  xgb_model.fit(train_df, train_label)
  predictions = xgb_model.predict(eval_df)

  roc_auc.append(get_roc_auc(eval_label.to_numpy(), predictions))
  pr_auc.append(get_pr_auc(eval_label.to_numpy(), predictions))
  accuracy.append(get_accuracy(eval_label.to_numpy(), predictions))

In [None]:
print(f'ROC AUC: {np.mean(roc_auc)}') # ROC AUC: 0.9392459265323052
print(f'PR AUC: {np.mean(pr_auc)}') # PR AUC: 0.6521694199992294
print(f'Accuracy: {np.mean(accuracy)}') # Accuracy: 0.9392459265323052

In [None]:
# Do the same thing now, but we use mean only to compare the performances of the two models

complete_df_mean['transcript_position'] = complete_df_mean['transcript_position'].astype('int')
complete_df_mean = complete_df_mean.drop(columns=['label']).merge(labels_df, on=['transcript_id', 'transcript_position'])

complete_df_mean.head()

In [None]:
# Mean only

roc_auc = []
pr_auc = []
accuracy = []

for i in range(len(unique_gene_id)):
  print(f'Inspecting gene {i+1}/{len(unique_gene_id)}')
  gene_id = unique_gene_id[i]
  train_df = complete_df_mean[complete_df_mean['gene_id'] != gene_id].iloc[:, 3:-2]
  train_label = complete_df_mean[complete_df_mean['gene_id'] != gene_id].iloc[:, -1]
  eval_df = complete_df_mean[complete_df_mean['gene_id'] != gene_id].iloc[:, 3:-2]
  eval_label = complete_df_mean[complete_df_mean['gene_id'] != gene_id].iloc[:, -1]
  
  xgb_model = XGBClassifier(
      objective = 'binary:logistic',
      scale_pos_weight = ceil(label_ratio),
      max_delta_step = 1,
      seed = 0,
      tree_method = 'gpu_hist'
  )

  xgb_model.fit(train_df, train_label)
  predictions = xgb_model.predict(eval_df)

  roc_auc.append(get_roc_auc(eval_label.to_numpy(), predictions))
  pr_auc.append(get_pr_auc(eval_label.to_numpy(), predictions))
  accuracy.append(get_accuracy(eval_label.to_numpy(), predictions))

In [None]:
print(f'ROC AUC: {np.mean(roc_auc)}') # ROC AUC: 0.913483812474404
print(f'ROC AUC: {np.mean(pr_auc)}') # PR AUC: 0.6115034727496196
print(f'ROC AUC: {np.mean(accuracy)}') # Accuracy: 0.913483812474404

In [None]:
# Train model with the full dataset and save it

xgb_model = XGBClassifier(
    objective = 'binary:logistic',
    scale_pos_weight = ceil(label_ratio),
    max_delta_step = 1,
    seed = 0,
    tree_method = 'gpu_hist'
)

xgb_model.fit(complete_df_all.iloc[:, 3:-2], complete_df_all.iloc[:, -1])
predictions = xgb_model.predict(complete_df_all.iloc[:, 3:-2])

print(f'ROC AUC: {get_roc_auc(complete_df_all.iloc[:, -1].to_numpy(), predictions)}')
print(f'PR AUC: {get_pr_auc(complete_df_all.iloc[:, -1].to_numpy(), predictions)}')
print(f'Accuracy: {get_accuracy(complete_df_all.iloc[:, -1].to_numpy(), predictions)}')

xgb_model.save_model('gdrive/MyDrive/DSA4262/xgb6.model')