In [1]:
import numpy as np
import math
import json

In [2]:
# Parse Protein Data
# Reads the AlphaFold CSV into `proteins`: the first column selects the protein
# and every data row becomes one residue entry keyed "<column 1>@<column 2>".
proteins = { }
header = True
headerValues = [ ]
lineValues = [ ]
with open( "./data/BioVis-challenge-alphafold-data.csv", "r" ) as alphafoldData :
    for line in alphafoldData :
        fields = line.strip( ).split( "," )
        if header :
            # The first line only supplies the column names.
            headerValues = fields
            header = False
            continue
        lineValues = fields
        # Create the protein entry the first time its identifier is seen.
        protein = proteins.setdefault( lineValues[ 0 ], { "residues": { } } )
        # Columns 4..15 are converted to float; empty fields become the string 'null'.
        coordinateValues = [ ]
        for rawValue in lineValues[ 4:16 ] :
            coordinateValues.append( 'null' if rawValue == '' else float( rawValue ) )
        residueEntry = {
            "coordinates": dict( zip( headerValues[ 4:16 ], coordinateValues ) ),
            # Columns 16..17 are stored verbatim as strings.
            "structureInformation": dict( zip( headerValues[ 16:18 ], lineValues[ 16:18 ] ) ),
            "ptm": [ ],
            "contacts": [ ]
        }
        protein[ "residues" ][ lineValues[ 1 ] + "@" + lineValues[ 2 ] ] = residueEntry

In [3]:
# Enrich Protein Data with PTMs
# Every data row of the test CSV is attached to the residue "<column 1>@<column 2>"
# of the protein named in column 0. Both the protein and the residue must already
# exist in `proteins`; an unknown one raises KeyError rather than being dropped silently.
header = True
headerValues = [ ]
lineValues = [ ]
with open( "./data/BioVis-challenge-test-data.csv", "r" ) as testData :
    for line in testData :
        if not line.strip( ) :
            continue # Skip blank lines, e.g. a trailing empty line at the end of the file.
        if header :
            header = False
            headerValues = line.strip( ).split( "," )
            continue
        lineValues = line.strip( ).split( "," )
        ACC = lineValues[ 0 ]
        # The classification is taken from the first row seen for each protein.
        if "classification" not in proteins[ ACC ] :
            proteins[ ACC ][ "classification" ] = lineValues[ 4 ]
        residue = proteins[ ACC ][ "residues" ][ lineValues[ 1 ] + "@" + lineValues[ 2 ] ]
        residue[ "ptm" ].append( lineValues[ 3 ] )
        # Residue entries are created without a "predicted_class" list, so it is
        # created on first use; indexing it directly raised KeyError.
        residue.setdefault( "predicted_class", [ ] ).append( lineValues[ 7 ] ) # Adds class for artefact catching

In [4]:
# Infer close contacts, i.e. residue pairs whose CA atoms are at most 5 Angstrom apart
CONTACT_CUTOFF = 5.0 # Inclusive distance threshold between two CA atoms.
for ACC in proteins :
    residues = proteins[ ACC ][ "residues" ]
    # Look up every residue's CA position once instead of once per pair. Residues whose
    # CA coordinates are not numeric (e.g. the 'null' placeholder stored for empty CSV
    # fields) are skipped, because math.dist raises TypeError on non-numeric input.
    caPositions = [ ]
    for key in residues :
        coordinates = residues[ key ][ "coordinates" ]
        ca = ( coordinates[ "x_coord_ca" ], coordinates[ "y_coord_ca" ], coordinates[ "z_coord_ca" ] )
        if all( isinstance( c, ( int, float ) ) for c in ca ) :
            caPositions.append( ( key, ca ) )
    # The distance is symmetric, so each unordered pair is evaluated once and recorded
    # on both residues; every contact list still ends up in residue iteration order.
    for i, ( ri, ri_ca ) in enumerate( caPositions ) :
        for rj, rj_ca in caPositions[ i + 1: ] :
            # Both points use their own z coordinate. Reading rj's z from ri made the
            # z difference always 0 and reported pairs that are only close in the xy-plane.
            if math.dist( ri_ca, rj_ca ) <= CONTACT_CUTOFF :
                residues[ ri ][ "contacts" ].append( rj )
                residues[ rj ][ "contacts" ].append( ri )

In [5]:
# Write the assembled `proteins` dictionary to disk as JSON (indent of 1).
outPath = "./data/proteinData.json"
with open( outPath, "w+" ) as outFile :
    json.dump( obj = proteins, fp = outFile, indent = 1 )