In [20]:
from graph_converter.converter import Converter

In [21]:
## Convert the relational table into a graph and persist it to disk.

converter = Converter()

# Files the converter reads: the tabular dataset and its constraints file.
data_file, fd_file = (
    'data/inputs/hospital_dataset.csv',
    'data/constraints/hospital_constraints.txt',
)

# Files the converter writes: the edge list and the node map.
edges_file, map_file = (
    'data/edges/hospital.edges',
    'data/maps/hospital.map',
)

converter.save_graph(data_file, fd_file, edges_file, map_file)

# The map sends each (value, attr) pair to an integer id. Integer ids are needed
# in the embedding so that identical values appearing in different columns
# remain distinguishable.
graph_map = converter.get_map()

In [22]:
import numpy as np
import pandas as pd

# Read the raw CSV at `data_file` into a DataFrame; later cells sample rows from it.
data = pd.read_csv(filepath_or_buffer=data_file)

In [23]:
import random
from distances.distances import HypDistance

NUM_SAMPLES = 50000
SEED = 0  # fixed seed so the sampled perturbations are reproducible across runs

# Seed both generators used below: `random` drives the column/value choices,
# while DataFrame.sample draws from numpy's global state when no random_state
# is passed.
random.seed(SEED)
np.random.seed(SEED)

hyp_dist = HypDistance('hospital.r3.emb', graph_map)
cols = range(data.shape[1])

# Candidate replacement values for every column, computed once up front instead
# of on every loop iteration. They are stored as plain lists because the
# `random` module expects a real sequence: handing it the ndarray returned by
# `.unique()` raises TypeError on Python 3.
unique_vals = [list(data.iloc[:, col].unique()) for col in cols]

# Parallel lists: changes[k] is the number of columns perturbed in sample k and
# distances[k] is the value returned by hyp_dist.pair_distance for that sample.
changes = list()
distances = list()
for i in range(NUM_SAMPLES):
    # Pick how many columns to perturb (at least one, at most all) and which ones.
    num_changes = random.randint(1, data.shape[1])
    change_cols = random.sample(cols, num_changes)

    # Draw one random row and work on a copy so the original stays intact.
    tup = data.sample(1).values[0]
    modified_tup = np.copy(tup)

    for change_col in change_cols:
        # NOTE(review): the drawn value can coincide with the row's current
        # value, so `num_changes` is an upper bound on the number of cells that
        # actually differ -- confirm this is intended.
        modified_tup[change_col] = random.choice(unique_vals[change_col])

    # Distance between the original row and its perturbed copy, as computed by
    # HypDistance (presumably in the embedding loaded above -- TODO confirm).
    dist = hyp_dist.pair_distance(tup, modified_tup)

    changes.append(num_changes)
    distances.append(dist)

In [24]:
# One row per sample: number of perturbed columns and the resulting distance.
df = pd.DataFrame({'count': changes, 'distance': distances})

In [25]:
# Preview the first rows of the (count, distance) results table.
df.head()

Unnamed: 0,count,distance
0,7,5.333804
1,12,12.795169
2,2,2.440976
3,1,1.167744
4,7,5.956419
