# Molecules

- Course: Pattern Recognition
- Exercise: Exercise 5 - Molecules
- Groupe: chaussette
- Students: Sergiy Goloviatinski, Ludovic Heberlin, Hina Khadija, Raphaël Margueron

## Requirements
Install the following libaries and their dependencies (pip a requirements file is available):
- networkx (for graph manipulation)
- numpy, scipy

In [1]:
!pip install -r requirements.txt



You should consider upgrading via the 'E:\Documents\School\Pattern_Recognition\Exercises\PR_Course_Team\exercise5\venv\Scripts\python.exe -m pip install --upgrade pip' command.




## Preprocessing

In [2]:
import os
import networkx as nx
import xml.etree.ElementTree as ET
import numpy as np
import scipy as sp
import sklearn

In [3]:
GXL_FOLDER = "./MoleculesClassification/gxl/"
TRAIN_FILE = "./MoleculesClassification/train.txt"
VALIDATION_FILE = "./MoleculesClassification/valid.txt"

In [4]:


type_table = {
    "string": str,
    "int": int,
    "float": float
}

def load_gxl_file(file_name):
    full_path = GXL_FOLDER + file_name + ".gxl"
    tree = ET.parse(full_path)
    root = tree.getroot()
    graph = root.find("graph")
    nodes = graph.findall("node")
    edges = graph.findall("edge")
    
    G = nx.Graph()
    for node in nodes:
        node_id = node.get("id")
        attr_dict = {attr.get("name"): type_table[attr[0].tag](attr[0].text) for attr in node.findall("attr")}
        G.add_node(node_id, attr_dict=attr_dict)

    for edge in edges:
        edge_from = edge.get("from")
        edge_to = edge.get("to")
        attr_dict = {attr.get("name"): type_table[attr[0].tag](attr[0].text) for attr in node.findall("attr")}
        G.add_edge(edge_from, edge_to, attr_dict=attr_dict)

    return G

def get_gxl_data(dataset):
    lines = [l.split(" ") for l in open(dataset).read().split("\n")]
    lines = [l for l in lines if len(l) == 2]
    
    file_names, classes = zip(*lines)
    graphs = tuple(load_gxl_file(file_name) for file_name in file_names)

    return graphs, classes
    
    
train_graphs, train_classes = get_gxl_data(TRAIN_FILE)
validation_graphs, validation_classes = get_gxl_data(VALIDATION_FILE)

print(f"Nb of graphs for training : {len(train_graphs)}")
print(f"Nb of graphs for validation : {len(validation_graphs)}")

Nb of graphs for training : 250
Nb of graphs for validation : 250


In [12]:
def ged(G_1, G_2, matrix_form=False, n_approxims=1):
    
    if(matrix_form):
        G_1 = nx.from_scipy_sparse_matrix(G_1)
        G_2 = nx.from_scipy_sparse_matrix(G_2)
        
    ged_generator = nx.optimize_graph_edit_distance(G_1, G_2)
    for i in range(n_approxims):
        val = next(ged_generator)
        
    return val

<class 'networkx.classes.graph.Graph'>
25.0
<class 'networkx.classes.graph.Graph'>


In [9]:
from sklearn.neighbors import NearestNeighbors

params = {
    'n_neighbors':1,
    'algorithm':'ball_tree',
}

metric_params = {
    'matrix_form':True,
    'n_approxims':1,
}

X_train = np.array([nx.to_scipy_sparse_matrix(G) for G in train_graphs])
y_train = train_classes
print(X_train[0])

  (0, 1)	1
  (0, 3)	1
  (0, 4)	1
  (1, 0)	1
  (1, 2)	1
  (1, 9)	1
  (2, 1)	1
  (3, 0)	1
  (4, 0)	1
  (4, 5)	1
  (5, 4)	1
  (5, 6)	1
  (6, 5)	1
  (6, 7)	1
  (6, 8)	1
  (6, 9)	1
  (7, 6)	1
  (8, 6)	1
  (9, 1)	1
  (9, 6)	1


In [10]:
nbrs = NearestNeighbors(n_neighbors=params['n_neighbors'], algorithm=params['algorithm'], 
                        metric=ged, metric_params=metric_params,
                        n_jobs=4)

nbrs.fit(X_train, y_train)

ValueError: setting an array element with a sequence.