# Imports

In [1]:
from biopandas.pdb.pandas_pdb import PandasPdb
import os

# Testing BioPandas

In [51]:
parser = PandasPdb()
pdb = parser.read_pdb("d2aeva_.pdb")
df_atoms = pdb.df["ATOM"]
df_hetero = pdb.df["HETATM"]
df_others = pdb.df["OTHERS"]
df_aniso = pdb.df["ANISOU"]

In [56]:
df_atoms[3180:3240]
print(df_atoms["atom_name"][3183][0])

O


In [38]:
df_hetero.head(10)

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,blank_3,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx


In [39]:
df_others.head(10)

Unnamed: 0,record_name,entry,line_idx
0,HEADER,,0
1,TITLE,MDANALYSIS FRAME 0: Created by PDBWriter,1
2,CRYST1,1.000 1.000 1.000 90.00 90.00 90....,2
3,REMARK,285 UNITARY VALUES FOR THE UNIT CELL AUTO...,3
4,REMARK,285 BY MDANALYSIS PDBWRITER BECAUSE UNIT ...,4
5,REMARK,285 WAS MISSING.,5
6,REMARK,285 PROTEIN DATA BANK CONVENTIONS REQUIRE...,6
7,REMARK,"285 CRYST1 RECORD IS INCLUDED, BUT THE VA...",7
8,REMARK,285 THIS RECORD ARE MEANINGLESS.,8
9,END,,2274


In [40]:
df_aniso.head(10)

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,blank_3,"U(1,1)","U(2,2)","U(3,3)","U(1,2)","U(1,3)","U(2,3)",blank_4,element_symbol,charge,line_idx


# Check what elements are present

In [None]:
def check_for_elements(pdb_directory, parser):
    elements = list()
    for i in pdb_directory:
        biopdb = parser.read_pdb(i)
        df = biopdb.df["ATOM"]
        for j in df["atom_name"]:
            j = j[0]
            if j in elements:
                continue
            elif j not in elements:
                elements.append(j)
        print(f"Parsed {i}. Current elements: {elements}")
    return None

directory = os.listdir()
for i in directory:
    if i.endswith(".pdb") == False:
        directory.remove(i)

parser = PandasPdb()
check_for_elements(directory, parser)

# Creating Dataset

In [2]:
import torch
import copy
import numpy as np

### Tensor design:

##### Tensors are 10,000 (max atoms) by 4 (atom type, x, y, z).

If a pdb has less than the max atoms the remaining atoms will be considered a NoneType, with x, y, and z values of 999.999

##### There are 7 atom types, one-hot encoded:

None, Hydrogen, Carbon, Nitrogen, Oxygen, Phosphorous, Sulfur

In [38]:
def pdbToTorchFile(pdb_loc, parser, save_loc):
    print(f"Working on {pdb_loc}")
    array = np.zeros((10_000, 10))
    array[:, 7:] = 999.9999
    
    biopdb = parser.read_pdb(pdb_loc)
    df = biopdb.df["ATOM"]
    for i in range(len(df)):
        array[i, 7] = df["x_coord"][i]
        array[i, 8] = df["y_coord"][i]
        array[i, 9] = df["z_coord"][i]

        curAtm = df["atom_name"][i]
        
        if curAtm[0] == "H":
            array[i, 0] = 0
            array[i, 1] = 1
        elif curAtm[0] == "C":
            array[i, 0] = 0
            array[i, 2] = 1
        elif curAtm[0] == "N":
            array[i, 0] = 0
            array[i, 3] = 1
        elif curAtm[0] == "O":
            array[i, 0] = 0
            array[i, 4] = 1
        elif curAtm[0] == "P":
            array[i, 0] = 0
            array[i, 5] = 1
        elif curAtm[0] == "S":
            array[i, 0] = 0
            array[i, 6] = 1

    tens = torch.from_numpy(array)
    torch.save(tens, save_loc)
    return None

In [43]:
os.chdir("/home/taylor/UCDThesis")

parser = PandasPdb()

pdb_files = os.listdir("aligned")
pdb_files.sort()
for i in pdb_files:
    if i.endswith(".pdb") == False:
        pdb_files.remove(i)
    
names = copy.deepcopy(pdb_files)
for i in range(len(names)):
    new_name = names[i]
    new_name = new_name[:-4]
    names[i] = new_name

for i in range(len(pdb_files)):
    cur_file = pdb_files[i]
    pdb_files[i] = "aligned/" + cur_file
    
saveLoc = copy.deepcopy(names)
for i in range(len(saveLoc)):
    cur_file = saveLoc[i]
    saveLoc[i] = "machineLearning/simpleTorchInputs/" + cur_file + ".pt"

In [None]:
for i, j in zip(pdb_files, saveLoc):
    pdbToTorchFile(i, parser, j)

In [41]:
test = torch.load("simpleTorchInputs/d4pv4a1.pt")

In [42]:
print(test)

tensor([[  0.0000,   0.0000,   0.0000,  ...,   9.7580, -15.9190,  -3.4930],
        [  0.0000,   0.0000,   1.0000,  ...,   8.4760, -16.6050,  -3.7490],
        [  0.0000,   0.0000,   1.0000,  ...,   8.1140, -16.3040,  -5.1630],
        ...,
        [  0.0000,   0.0000,   0.0000,  ..., 999.9999, 999.9999, 999.9999],
        [  0.0000,   0.0000,   0.0000,  ..., 999.9999, 999.9999, 999.9999],
        [  0.0000,   0.0000,   0.0000,  ..., 999.9999, 999.9999, 999.9999]],
       dtype=torch.float64)


In [None]:
os.chdir("/home/taylor/UCDThesis")