# Molecules

- Course: Pattern Recognition
- Exercise: Exercise 5 - Molecules
- Groupe: chaussette
- Students: Sergiy Goloviatinski, Ludovic Heberlin, Hina Khadija, Raphaël Margueron

## Requirements
Install the following libaries and their dependencies (pip a requirements file is available):
- networkx (for graph manipulation)
- numpy, scipy

In [1]:
!pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


## Preprocessing

In [103]:
import os
import networkx as nx
import xml.etree.ElementTree as ET
import numpy as np
import scipy as sp
import sklearn
from tqdm.notebook import tqdm

In [90]:
GXL_FOLDER = "./MoleculesClassification/gxl/"
TRAIN_FILE = "./MoleculesClassification/train.txt"
VALIDATION_FILE = "./MoleculesClassification/valid.txt"

In [91]:


type_table = {
    "string": str,
    "int": int,
    "float": float
}

def load_gxl_file(file_name):
    full_path = GXL_FOLDER + file_name + ".gxl"
    tree = ET.parse(full_path)
    root = tree.getroot()
    graph = root.find("graph")
    nodes = graph.findall("node")
    edges = graph.findall("edge")
    
    G = nx.Graph()
    for node in nodes:
        node_id = node.get("id")
        attr_dict = {attr.get("name"): type_table[attr[0].tag](attr[0].text) for attr in node.findall("attr")}
        G.add_node(node_id, attr_dict=attr_dict)

    for edge in edges:
        edge_from = edge.get("from")
        edge_to = edge.get("to")
        attr_dict = {attr.get("name"): type_table[attr[0].tag](attr[0].text) for attr in node.findall("attr")}
        G.add_edge(edge_from, edge_to, attr_dict=attr_dict)

    return G

def get_gxl_data(dataset):
    lines = [l.split(" ") for l in open(dataset).read().split("\n")]
    lines = [l for l in lines if len(l) == 2]
    
    file_names, classes = zip(*lines)
    graphs = tuple(load_gxl_file(file_name) for file_name in file_names)

    return graphs, classes
    
    
train_graphs, train_classes = get_gxl_data(TRAIN_FILE)
validation_graphs, validation_classes = get_gxl_data(VALIDATION_FILE)

print(f"Nb of graphs for training : {len(train_graphs)}")
print(f"Nb of graphs for validation : {len(validation_graphs)}")

Nb of graphs for training : 250
Nb of graphs for validation : 250


In [92]:
def ged(G_1, G_2, matrix_form=False, n_approxims=1):
    
    if(matrix_form):
        G_1 = nx.from_scipy_sparse_matrix(G_1)
        G_2 = nx.from_scipy_sparse_matrix(G_2)
        
    ged_generator = nx.optimize_graph_edit_distance(G_1, G_2)
    for i in range(n_approxims):
        val = next(ged_generator)
        
    return val

In [133]:
def get_distance_matrix(dataset):
    distance_matrix=[[0 for _ in range(len(dataset))] for _ in range(len(dataset))]

    for i in tqdm(range(len(dataset))):
        for j in range(len(dataset)):
            if i>j:
                distance_matrix[i][j]=ged(dataset[i],dataset[j])

    for i in tqdm(range(len(dataset))):
        for j in range(len(dataset)):
            if i<j:
                distance_matrix[i][j]=distance_matrix[j][i]
    
    return distance_matrix

In [127]:
X_train=get_distance_matrix(train_graphs)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [129]:
y_train = train_classes

In [137]:
from sklearn.neighbors import KNeighborsClassifier

params = {
    'n_neighbors':3,
    'algorithm':'ball_tree',
}

metric_params = {
    'matrix_form':True,
    'n_approxims':1,
}

nbrs = KNeighborsClassifier(n_neighbors=params['n_neighbors'], 
                        metric='precomputed',
                        n_jobs=4)

nbrs.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='precomputed',
                     metric_params=None, n_jobs=4, n_neighbors=3, p=2,
                     weights='uniform')

In [138]:
X_validation=get_distance_matrix(validation_graphs)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [139]:
y_validation=validation_classes

In [140]:
predictions=nbrs.predict(X_validation)

In [142]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_validation, predictions)
print(accuracy)

0.98
