Notebook for predicting Protein Stability based on Graph Centrality. 
Fine-tuned for the purpose of determining Tm of TCR constructs.

In [4]:
!pip install biopandas
import pandas as pd
from biopandas.pdb import PandasPdb
import numpy as np
import math

import matplotlib.pyplot as plt
import matplotlib.colors
import os



In [12]:
import pandas as pd
Novozyme_data_dir="/home/lilian/TCR_Graphs/Novozyme_data"

df_train = pd.read_csv(os.path.join(Novozyme_data_dir,"train.csv"), index_col="seq_id")
df_train_updates = pd.read_csv(os.path.join(Novozyme_data_dir,"train_updates_20220929.csv"), index_col="seq_id")

all_features_nan = df_train_updates.isnull().all("columns")

drop_indices = df_train_updates[all_features_nan].index
df_train = df_train.drop(index=drop_indices)

swap_ph_tm_indices = df_train_updates[~all_features_nan].index
df_train.loc[swap_ph_tm_indices, ["pH", "tm"]] = df_train_updates.loc[swap_ph_tm_indices, ["pH", "tm"]]



df_test = pd.read_csv(os.path.join(Novozyme_data_dir,"test.csv"), index_col="seq_id")

In [14]:
# Install graphein if necessary
!pip install graphein 




In [2]:
import torch
from graphein.ml import ProteinGraphDataset
import graphein.protein as gp

g_lab_map = {"wildtype": 1}

# Create the dataset
ds = ProteinGraphDataset(
    root = "/home/lilian/TCR_Graphs/Novozyme_data",
    graphein_config=gp.ProteinGraphConfig()
)

AttributeError: 'ProteinGraphDataset' object has no attribute 'structures'

In [None]:
!pip install biopandas -q
!pip install dgl dglgo -f https://data.dgl.ai/wheels/repo.html -q
import numpy as np
from biopandas.pdb import PandasPdb
import torch
import dgl
def get_distance_matrix(coords):
    diff_tensor = np.expand_dims(coords, axis=1) - np.expand_dims(coords, axis=0)
    distance_matrix = np.sqrt(np.sum(np.power(diff_tensor, 2), axis=-1))
    return distance_matrix

def pdb_to_graph(pdb_path, distance_threshold=6.0, contain_b_factor=True):
    atom_df = PandasPdb().read_pdb(pdb_path)
    atom_df = atom_df.df['ATOM']
    residue_df = atom_df.groupby('residue_number', as_index=False)[['x_coord', 'y_coord', 'z_coord', 'b_factor']].mean().sort_values('residue_number')
    coords = residue_df[['x_coord', 'y_coord', 'z_coord']].values
    distance_matrix = get_distance_matrix(coords)
    adj = distance_matrix < distance_threshold
    u, v = np.nonzero(adj)
    u, v = torch.from_numpy(u), torch.from_numpy(v)
    graph = dgl.graph((u, v), num_nodes=len(coords))
    if contain_b_factor:
        b_factor = torch.from_numpy(residue_df['b_factor'].values)
        graph.ndata['b_factor'] = b_factor
    return graph


graph = pdb_to_graph('.pdb')

