# Molecules

- Course: Pattern Recognition
- Exercise: Exercise 5 - Molecules
- Group: chaussette
- Students: Sergiy Goloviatinski, Ludovic Heberlin, Hina Khadija, Raphaël Margueron

## Requirements
Install the following libraries and their dependencies (a pip requirements file is available):
- networkx (for graph manipulation)

In [4]:
!pip install -r requirements.txt



## Preprocessing

In [5]:
# Directory holding the per-molecule ".gxl" files. load_gxl_file builds paths by
# plain string concatenation, so the trailing slash is required.
GXL_FOLDER = "./MoleculesClassification/gxl/"
# Index files passed to get_gxl_data: one "<file name> <class>" pair per line,
# separated by a single space.
TRAIN_FILE = "./MoleculesClassification/train.txt"
VALIDATION_FILE = "./MoleculesClassification/valid.txt"

In [6]:
import os
import networkx as nx
import xml.etree.ElementTree as ET

# Maps the tag of the value element nested inside a GXL <attr> (e.g. <string>,
# <int>, <float>) to the Python constructor used to convert its text content.
type_table = {
    "string": str,
    "int": int,
    "float": float
}

def _parse_gxl_attrs(element):
    """Return the typed ``<attr>`` children of a GXL element as a dict.

    Each ``<attr name="...">`` is expected to wrap exactly one value element
    whose tag (``string``, ``int``, ``float``) selects the converter in
    ``type_table``; the converted text becomes the value stored under ``name``.

    Raises:
        KeyError: if the value element's tag is not in ``type_table``.
        IndexError: if an ``<attr>`` has no child value element.
    """
    return {
        attr.get("name"): type_table[attr[0].tag](attr[0].text)
        for attr in element.findall("attr")
    }


def load_gxl_file(file_name):
    """Load one GXL file from ``GXL_FOLDER`` into an undirected graph.

    Args:
        file_name: Base name of the file, without the ``.gxl`` extension.

    Returns:
        A ``networkx.Graph`` with one node per ``<node>`` element (keyed by its
        ``id``) and one edge per ``<edge>`` element (``from`` -> ``to``). The
        parsed attributes of each element are passed to networkx through the
        ``attr_dict`` keyword.
        NOTE(review): how ``attr_dict`` is stored (nested under that key vs.
        merged into the element's attributes) depends on the networkx version
        -- confirm against the installed one.
    """
    full_path = GXL_FOLDER + file_name + ".gxl"
    graph = ET.parse(full_path).getroot().find("graph")

    G = nx.Graph()
    for node in graph.findall("node"):
        G.add_node(node.get("id"), attr_dict=_parse_gxl_attrs(node))

    for edge in graph.findall("edge"):
        # Read the attributes of the edge itself. The previous code iterated
        # over ``node.findall("attr")`` here, so every edge received the
        # attributes of the last node parsed (or raised NameError when the
        # graph had no nodes).
        G.add_edge(
            edge.get("from"),
            edge.get("to"),
            attr_dict=_parse_gxl_attrs(edge),
        )

    return G

def get_gxl_data(dataset):
    """Load every graph listed in a dataset index file, with its class label.

    The index file holds one ``<file name> <class>`` pair per line, separated
    by a single space. Lines that do not split into exactly two fields (for
    example the empty string after a trailing newline) are ignored.

    Args:
        dataset: Path of the index file.

    Returns:
        A ``(graphs, classes)`` pair of equally long tuples: the graphs
        returned by ``load_gxl_file`` and the matching class strings, in file
        order. Both tuples are empty when the index has no valid line.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(dataset) as dataset_file:
        rows = [line.split(" ") for line in dataset_file.read().split("\n")]
    rows = [row for row in rows if len(row) == 2]

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError; report an empty dataset explicitly instead.
    if not rows:
        return (), ()

    file_names, classes = zip(*rows)
    graphs = tuple(load_gxl_file(file_name) for file_name in file_names)

    return graphs, classes
    
    
# Load both splits when the cell runs; each call parses every GXL file listed
# in the corresponding index file.
train_graphs, train_classes = get_gxl_data(TRAIN_FILE)
validation_graphs, validation_classes = get_gxl_data(VALIDATION_FILE)