In [1]:
"""
This file builds processed graphs from the input data files
and stores them in the graphs directory; the output format is GML.
"""

# Data file format: tab-separated columns, zero-based column index in parentheses
"""
	edge-id(0)	
	node_1_id(1)	
	empty(2)
	node_1_swissprot_id(3)		
	node_1_PIR_id(4)	
	node_1_GI_code(5)		
	node_2_id(6)	
	empty(7)
	node_2_swissprot_id(8)		
	node_2_PIR_id(9)	
	node_2_GI_code(10)
"""

# Import here
import networkx as nx
import csv
import os

In [2]:
# Constants here
# Directory that is scanned for input data files (relative to the working directory).
DATA_DIR = "data"
# Names of all entries found in DATA_DIR; evaluated once, when this cell runs.
INPUT_DATA_FILES = os.listdir(DATA_DIR)
# Token inside ABSOLUTE_FILE_PATH that gets swapped for a real file name.
FILENAME_PLACEHOLDER = "FILENAME_PLACEHOLDER"
# Path template for one input file. Despite the name it is built from the
# relative DATA_DIR, so it is only absolute if DATA_DIR is.
ABSOLUTE_FILE_PATH = f'{DATA_DIR}/{FILENAME_PLACEHOLDER}'

# Column indices of the two node ids in each record (node_1_id, node_2_id).
NODE_INDEX = [1, 6]
# Column indices of the identifier kept for each node
# (node_1_swissprot_id, node_2_swissprot_id in the format notes above).
IDENTIFIER_INDEX = [3, 8]


In [3]:
# Function here

def read_file_get_data(filename: str):
    """Read one tab-separated data file and return its rows.

    The path is built by substituting ``filename`` for
    ``FILENAME_PLACEHOLDER`` in the ``ABSOLUTE_FILE_PATH`` template.

    Args:
        filename: Name of the file to read.

    Returns:
        A list with one entry per row of the file; each row is a list of
        string fields. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    file_path = ABSOLUTE_FILE_PATH.replace(FILENAME_PLACEHOLDER, filename)
    # The context manager closes the file even if parsing fails part-way.
    # newline="" lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(file_path, newline="") as file_obj:
        return list(csv.reader(file_obj, delimiter="\t"))


def generate_node_id(raw_data):
    """Number the nodes found in the records and collect their identifiers.

    Node ids are consecutive integers starting at 0, handed out in order of
    first appearance of each value in the ``NODE_INDEX`` columns.

    Args:
        raw_data: Sequence of records (indexable rows). It is traversed
            twice.

    Returns:
        A tuple ``(dip_to_nid, nid_to_pid)``: the first dict maps each raw
        node value to its integer node id, the second maps each node id to
        the value found in the matching ``IDENTIFIER_INDEX`` column. When a
        node occurs in several records, the identifier from the last
        occurrence wins.
    """
    # Pass 1: hand out ids in order of first appearance.
    dip_to_nid = {}
    for row in raw_data:
        for side in (0, 1):
            dip_to_nid.setdefault(row[NODE_INDEX[side]], len(dip_to_nid))

    # Pass 2: attach an identifier to every node id (later rows overwrite).
    nid_to_pid = {}
    for row in raw_data:
        for side in (0, 1):
            node_id = dip_to_nid[row[NODE_INDEX[side]]]
            nid_to_pid[node_id] = row[IDENTIFIER_INDEX[side]]

    return dip_to_nid, nid_to_pid


def preprocess_data(raw_data):
    """Build the node-id mappings and the edge list for the records.

    Args:
        raw_data: Sequence of records (indexable rows).

    Returns:
        A tuple ``(dip_to_nid, nid_to_pid, edges)``. The two dicts come
        straight from ``generate_node_id``. ``edges`` holds, for every
        record, the pair ``[src, dst]`` followed by the reversed pair
        ``[dst, src]``, both expressed in integer node ids.
    """
    dip_to_nid, nid_to_pid = generate_node_id(raw_data)
    edges = []
    for row in raw_data:
        u = dip_to_nid[row[NODE_INDEX[0]]]
        v = dip_to_nid[row[NODE_INDEX[1]]]
        # Each interaction is stored in both directions.
        edges.extend(([u, v], [v, u]))
    return dip_to_nid, nid_to_pid, edges


def generate_nx_graph(raw_data):
    """Build an undirected networkx graph from the records.

    Every node id becomes a graph node carrying its identifier in the
    ``pid`` attribute; the edges come from ``preprocess_data``.

    Args:
        raw_data: Sequence of records (indexable rows).

    Returns:
        A tuple ``(DIP_ID_TO_NID, NID_TO_PID, Graph)``: the two mappings
        produced by ``preprocess_data`` and the populated ``nx.Graph``.
    """
    # Preprocess exactly once; the result was previously computed twice and
    # the first copy thrown away.
    DIP_ID_TO_NID, NID_TO_PID, edgeList = preprocess_data(raw_data)

    Graph = nx.Graph()
    # One (node_id, attribute dict) pair per known node.
    nodes = [(nid, {'pid': NID_TO_PID[nid]}) for nid in DIP_ID_TO_NID.values()]
    Graph.add_nodes_from(nodes)
    Graph.add_edges_from(edgeList)
    return DIP_ID_TO_NID, NID_TO_PID, Graph


def procedure():
    """Convert every input data file into a GML graph file.

    For each name in ``INPUT_DATA_FILES`` that is a regular file inside
    ``DATA_DIR``, the file is read, turned into a graph, written to
    ``graphs/<filename>.gml`` and a one-line summary with the node and edge
    counts is printed.
    """
    # The GML writer opens the target path directly and does not create
    # missing directories, so make sure the output directory exists.
    os.makedirs("graphs", exist_ok=True)
    for filename in INPUT_DATA_FILES:
        # os.listdir also reports sub-directories; only regular files can be
        # parsed, so anything else is skipped instead of crashing the run.
        if not os.path.isfile(os.path.join(DATA_DIR, filename)):
            continue
        # get data
        raw_data = read_file_get_data(filename=filename)
        # generate graph (the id mappings are not needed here)
        _, _, Graph = generate_nx_graph(raw_data)
        # write to gml file
        nx.write_gml(Graph, f"graphs/{filename}.gml")
        print(filename, "nodes", len(Graph.nodes), "edges", len(Graph.edges))


In [4]:
# Driver code
if __name__ == '__main__':
    # Process every input file and export its graph.
    procedure()

Core.tab nodes 2610 edges 6455
Full.tab nodes 4753 edges 15262
