## Create the data structure for an RNA sequence

Intended to be a series of 4 dataframes, obtained by reading the top and pdb files:
- bonds, a 3 * Nbonds matrix, obtained from SECTION BONDS
- angles, a 4 * Nangles matrix, obtained from SECTION ANGLES
- torsions, a 5 * Ntors matrix, obtained from SECTION DIHEDRALS
- sequence, an Nparticle * 6 matrix, containing:
  - particle type (number in range 1:11)
  - mass
  - charge
  - coordinates (x,y,z)


In [35]:
import pandas as pd
import numpy as np
from Bio.PDB import *
import os

In [36]:
rootpath = '/home/flechenault/Documents/Gianluca/Structure_DB_Amber_HiRE'
filename = '/Prep_pureHire/Output/2g1w/2g1w_CG.pdb'

# Parse the PDB file and keep only the first model of the structure.
parser = PDBParser()
model = parser.get_structure('Y', f"{rootpath}{filename}")[0]

# Stack the coordinates of every atom into a single array, one row per atom.
coords = np.array([atom.get_coord() for atom in model.get_atoms()])
print(coords.shape)

(142, 3)


In [37]:
# Top file parser to extract relevant information

# Section names of interest (kept for reference; not used by the code below).
sections = ('PARTICLE_MASSES','PARTICLE_TYPE','CHARGES','BONDS','ANGLES','DIHEDRALS')

# Read the whole topology file once; every quantity below is the text found
# between two "SECTION ..." headers, split on whitespace and converted.
with open(rootpath+'/Prep_pureHire/Output/2g1w/'+'parameters.top', 'r') as f:
    reader = f.read()

# Masses: between PARTICLE_MASSES and PARTICLE_TYPE.
text = reader.split("SECTION PARTICLE_MASSES")[1].split("SECTION PARTICLE_TYPE")[0].strip()
mass = list(map(float, text.split()))

# Particle types: between PARTICLE_TYPE and CHARGES.
text = reader.split("SECTION PARTICLE_TYPE")[1].split("SECTION CHARGES")[0].strip()
atom_type = list(map(int, text.split()))

# Charges: between CHARGES and BOND_FORCE_CONSTANT.
text = reader.split("SECTION CHARGES")[1].split("SECTION BOND_FORCE_CONSTANT")[0].strip()
charge = list(map(float, text.split()))

# Bonds: flat list of integers between BONDS and ANGLES.
text = reader.split("SECTION BONDS")[1].split("SECTION ANGLES")[0].strip()
bonds = list(map(int, text.split()))

# Angles: flat list of integers between ANGLES and DIHEDRALS.
text = reader.split("SECTION ANGLES")[1].split("SECTION DIHEDRALS")[0].strip()
angles = list(map(int, text.split()))

# Dihedrals: everything after the DIHEDRALS header, as a flat list of integers.
text = reader.split("SECTION DIHEDRALS")[1].strip()
tors = list(map(int, text.split()))

In [38]:
# obtain energies from Amber

def is_float(n):
    """Return True if ``float(n)`` succeeds, False otherwise.

    Only the exceptions that a failed conversion raises are treated as
    "not a float": ValueError (unparsable string), TypeError (unsupported
    type such as None) and OverflowError (integer too large for a float).
    Anything else, e.g. KeyboardInterrupt, is allowed to propagate instead
    of being silently swallowed.
    """
    try:
        float(n)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Keep every whitespace-separated token of the energy file that parses as a
# float, converted to float; all other tokens are dropped.
with open(rootpath+'/FA_PDB/Amber_energies/2g1w_energy', 'r') as f:
    energy = list(map(float, filter(is_float, f.read().split())))

In [39]:
# create Series with pandas, with all info about the RNA sequence

# Every entry stores a whole sequence (a list or a 1-D array), so the
# resulting Series has dtype=object. The coordinate array is split into its
# three columns.
data = dict(
    atom_type=atom_type,
    mass=mass,
    charge=charge,
    x=coords[:, 0],
    y=coords[:, 1],
    z=coords[:, 2],
    bonds=bonds,
    angles=angles,
    torsions=tors,
    energy=energy,
)
ds = pd.Series(data)
print(ds)

atom_type    [2, 1, 4, 5, 6, 7, 3, 2, 1, 4, 5, 6, 7, 3, 2, ...
mass         [16.0, 12.01, 20.0, 12.0, 75.0, 75.0, 36.97, 1...
charge       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,...
x            [-3.051, -3.76, -4.308, -5.478, -5.461, -6.871...
y            [-9.645, -9.267, -7.866, -6.666, -7.28, -5.735...
z            [-13.68, -14.861, -14.729, -13.092, -10.543, -...
bonds        [6, 18, 1, 6, 9, 2, 9, 12, 3, 12, 15, 7, 3, 6,...
angles       [6, 9, 12, 3, 9, 12, 15, 6, 0, 3, 6, 8, 3, 6, ...
torsions     [6, 9, 12, 15, 1, 6, 9, 12, 15, 2, 6, 12, 15, ...
energy       [45.101535, 154.8894705, 621.3587353, 167.3106...
dtype: object


In [40]:
# Write the assembled Series to disk as a pickle file.
pickle_path = '/home/flechenault/Documents/Gianluca/2g1w.pkl'
ds.to_pickle(pickle_path)

In [41]:
# Reload the pickled Series and check the Python type of its first entry.
# NOTE: unpickling can execute arbitrary code; only load pickles created locally.
ds2 = pd.read_pickle('/home/flechenault/Documents/Gianluca/2g1w.pkl')
# The Series is indexed by strings, so positional access must go through
# .iloc: treating an integer key in ds2[0] as a position is deprecated in pandas.
print(type(ds2.iloc[0]))

<class 'list'>


In [42]:
# find all pdb files in a directory
path = '/home/flechenault/Documents/Gianluca/Structure_DB_Amber_HiRE/Prep_pureHire/Input'
# Print every entry ending in ".pdb", cut at its first dot.
for name in filter(lambda entry: entry.endswith(".pdb"), os.listdir(path)):
    print(name.partition('.')[0])

2o32
1m5l
2g1g
2mbj
1f85
1r4h
1f7h
1p5n
1bgz
1yn1
1d0u
2tob
1ato
1y26
1nem
1q8n
1tfn
1ik1
1pjy
1i4c
1kp7
2gis
1ow9
2fdt
1p5p
2a43
1f9l
1ymo
377d
1zig
1nc0
1rng
1r2p
1jo7
2ixz
1roq
2aht
255d
1atw
1mfy
1hs2
1yne
1jp0
1scl
1hs8
1c0o
2hem
1s2f
1mfk
1i46
2b7g
1zih
1i4b
1fyp
1slp
1cq5
480d
1kaj
1jur
1ebq
2u2a
1byj
1r7z
1s9s
1nbr
1z30
1z31
1wks
1slo
1xst
1txs
6tna
1xwp
1idv
483d
28sp
28sr
1jtw
1z2j
1szy
1jwc
1qwb
1bzu
1mt4
1t28
1oq0
1b36
1hwq
2g1w
1ebr
1ie2
413d
2au4
1yn2
1qd3
2o33
1tbk
1lvj
1n8x
1hs3
1f7g
1hlx
1f79
2f87
1p79
2gv3
1fqz
1xsu
1f6x
1k6g
1l1w
1q75
1k5i
2euy
1bz2
1ju7
1ei2
1tjz
WTFSE
1p5o
2fey
1bz3
2es5
1e95
1uuu
1k4b
1l2x
2b57
2a9l
1mfj
1k2g
2g9c
1na2
1jtj
1hs1
1f7f
1jzc
1tlr
1yng
1u3k
387d
2gip
283d
1d0t
1syz
1ebs
1u2a
1atv
1lux
1uui
1o15
1zc5
1s34
2f88
1bn0
1lc6
1f78
2gio
1zif
1p5m
1xhp
1xsh
1kka
2ixy
1uts
1esh
2k96
1ie1
1k4a
1m82
1f7i
1i3x
1kks
1ysv
1luu
1xsg
2ho7
2kx8
1cql
1rnk
1sy4
1rfr
1xwu
1f6z
1nz1
1feq
1ync
1fyo
2tpk
1fhk
3dig
1ylg
1k6h
1hs4
1qwa
1rht
1vop
1j4y
1jox
1r7w

In [43]:
# extract pars from scale_RNA.dat

# The second whitespace-separated field of each line is read as a float.
with open(rootpath+'/Prep_pureHire/scale_RNA.dat', 'r') as f:
    pars = [float(fields[1]) for fields in map(str.split, f)]
    print(pars)

[2.608, 2.073, 1.519, 2.355, 4.19, 4.698, 4.824, 5.636, 2.13, 1.307, 15.223, 1.0, 2.8, 2.505, 1.826, 3.932, 4.309, 4.775, 4.546, 2.821, 3.813, 3.01, 0.908, 3.0, 4.0, 2.257, 0.48, 0.5, 4.247, 10.816, 11.121, 5.819, 0.501, 0.73, 0.331, 0.257, 0.224, 0.207, 1.2, 1.5, 0.4, 1.8, 0.8, 142.306, 1.0, 0.0, 1.483]


In [44]:
# Obtain bond_type, angle_type, tors_type from top file

with open(rootpath+'/Prep_pureHire/Output/2g1w/'+'parameters.top', 'r') as f:
    reader = f.read()

# For each table: take the text between two "SECTION ..." headers, keep only
# the tokens that parse as floats, cut the flat list into equal-length rows
# (2 or 3 of them) and transpose, so each row of the result holds one value
# from every original row.
# NOTE(review): the meaning of each column is not visible here - confirm
# against the .top format.

text = reader.split("SECTION BOND_FORCE_CONSTANT")[1].split("SECTION ANGLE_FORCE_CONSTANT")[0].strip()
bond_type = np.array(list(map(float, filter(is_float, text.split())))).reshape(2, -1).T

text = reader.split("SECTION ANGLE_FORCE_CONSTANT")[1].split("SECTION DIHEDRAL_FORCE_CONSTANT")[0].strip()
angle_type = np.array(list(map(float, filter(is_float, text.split())))).reshape(2, -1).T

text = reader.split("SECTION DIHEDRAL_FORCE_CONSTANT")[1].split("SECTION BONDS")[0].strip()
tors_type = np.array(list(map(float, filter(is_float, text.split())))).reshape(3, -1).T


In [45]:
import torch
import LocalEnergy as le

In [46]:
## compute bond, angle and torsion energies from the pickled structure

# NOTE: unpickling can execute arbitrary code; only load pickles created locally.
ds = pd.read_pickle('/home/flechenault/Documents/Gianluca/2g1w.pkl')
print(ds)

# Entries are read by label: the Series has a string index, and treating
# integer keys such as ds[0] as positions is deprecated in pandas.
atom = ds['atom_type']
mass = ds['mass']
charge = ds['charge']

# Stack x, y, z as columns: one row per particle.
coords = torch.tensor(np.array((ds['x'], ds['y'], ds['z'])).transpose())
print(coords.shape)

# The flat index lists are cut into records of 3 / 4 / 5 integers and then
# transposed, giving arrays of shape (3, Nbonds), (4, Nangles), (5, Ntors).
bonds = np.array(ds['bonds']).reshape(-1,3).transpose()
angles = np.array(ds['angles']).reshape(-1,4).transpose()
tors = np.array(ds['torsions']).reshape(-1,5).transpose()

# bond_type, angle_type, tors_type and pars must already be defined by the
# earlier parsing cells.
E_bonds = le.bonds_energy(coords,bonds,bond_type,pars)
E_angles = le.angles_energy(atom,coords,angles,angle_type,pars)
E_tors = le.torsions_energy(atom,coords,tors,tors_type,pars)
print(E_bonds, E_angles, E_tors)

atom_type    [2, 1, 4, 5, 6, 7, 3, 2, 1, 4, 5, 6, 7, 3, 2, ...
mass         [16.0, 12.01, 20.0, 12.0, 75.0, 75.0, 36.97, 1...
charge       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,...
x            [-3.051, -3.76, -4.308, -5.478, -5.461, -6.871...
y            [-9.645, -9.267, -7.866, -6.666, -7.28, -5.735...
z            [-13.68, -14.861, -14.729, -13.092, -10.543, -...
bonds        [6, 18, 1, 6, 9, 2, 9, 12, 3, 12, 15, 7, 3, 6,...
angles       [6, 9, 12, 3, 9, 12, 15, 6, 0, 3, 6, 8, 3, 6, ...
torsions     [6, 9, 12, 15, 1, 6, 9, 12, 15, 2, 6, 12, 15, ...
energy       [45.101535, 154.8894705, 621.3587353, 167.3106...
dtype: object
torch.Size([142, 3])
tensor(59.4831) tensor(240403.1250) tensor(1440.0828)


In [47]:
# Quick check of the cross product on two small integer vectors.
a = torch.tensor([1,2,3])
b = torch.tensor([-1,-2,-6])
# Pass dim explicitly: calling torch.cross without it is deprecated.
# For these 1-D, length-3 tensors the last dimension is the only one.
torch.cross(a, b, dim=-1)

tensor([-6,  3,  0])

In [48]:
path = '/home/flechenault/Documents/Gianluca/Structure_DB_Amber_HiRE/FA_PDB/Source_PDB/'

# All directory entries ending in ".pdb", in the order os.listdir returns them.
filelist = [entry for entry in os.listdir(path) if entry.endswith(".pdb")]
print(filelist)

# Show the entry that comes right after '1ymo.pdb' in that listing.
i = filelist.index('1ymo.pdb')
print(filelist[i+1])

['2o32.pdb', '1m5l.pdb', '2g1g.pdb', '2mbj.pdb', '1f85.pdb', '1r4h.pdb', '1f7h.pdb', '1p5n.pdb', '1bgz.pdb', '1yn1.pdb', '1d0u.pdb', '2tob.pdb', '1ato.pdb', '1y26.pdb', '1nem.pdb', '1q8n.pdb', '1tfn.pdb', '1ik1.pdb', '1pjy.pdb', '1i4c.pdb', '1kp7.pdb', '2gis.pdb', '1ow9.pdb', '2fdt.pdb', '1p5p.pdb', '2a43.pdb', '1f9l.pdb', '1ymo.pdb', '377d.pdb', '1zig.pdb', '1nc0.pdb', '1rng.pdb', '1r2p.pdb', '1jo7.pdb', '2ixz.pdb', '1roq.pdb', '2aht.pdb', '255d.pdb', '1atw.pdb', '1mfy.pdb', '1hs2.pdb', '1yne.pdb', '1jp0.pdb', '1scl.pdb', '1hs8.pdb', '1c0o.pdb', '2hem.pdb', '1s2f.pdb', '1mfk.pdb', '1i46.pdb', '2b7g.pdb', '1zih.pdb', '1i4b.pdb', '1fyp.pdb', '1slp.pdb', '1cq5.pdb', '480d.pdb', '1kaj.pdb', '1jur.pdb', '1ebq.pdb', '2u2a.pdb', '1byj.pdb', '1r7z.pdb', '1s9s.pdb', '1nbr.pdb', '1z30.pdb', '1z31.pdb', '1wks.pdb', '1slo.pdb', '1xst.pdb', '1txs.pdb', '6tna.pdb', '1xwp.pdb', '1idv.pdb', '483d.pdb', '28sp.pdb', '28sr.pdb', '1jtw.pdb', '1z2j.pdb', '1szy.pdb', '1jwc.pdb', '1qwb.pdb', '1bzu.pdb', '1m