In [1]:
# %cd /data/bruingjde/on-going/SNAM2021-code/

import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.metrics
import sklearn.linear_model
from tqdm.auto import tqdm

In [7]:
# np.arange(1, 1) is an empty range, which left `networks` empty and every
# per-network table built from it without rows. Iterate over indices 1..30
# and leave out 15, 17, 26 and 27.
networks = [network for network in np.arange(1, 31) if network not in [15, 17, 26, 27]]

In [8]:
def get_performance(network: int, nswap_perc: int = 0, feature_set: str = 'II-A', model: str = 'LogisticRegression'):
    """Load one stored score from disk.

    Args:
        network: Network index; zero-padded to two digits in the path.
        nswap_perc: Formatted with an explicit sign and zero-padded to four
            characters (0 becomes ``+000``) to select the sub-directory.
        feature_set: First half of the file name.
        model: Second half of the file name.

    Returns:
        The content of
        ``data/<network>/<nswap_perc>/properties/<feature_set>_<model>.float``
        parsed as a float.
    """
    score_path = f'data/{network:02}/{nswap_perc:+04.0f}/properties/{feature_set}_{model}.float'
    with open(score_path) as handle:
        raw_score = handle.read()
    return float(raw_score)

In [9]:
def read_file(path):
    """Read a single number from a ``.int`` or ``.float`` file.

    Args:
        path: Path of the file as a string. The text after the last ``.``
            selects how the content is parsed.

    Returns:
        The file content converted with ``int`` for ``.int`` files and with
        ``float`` for ``.float`` files.

    Raises:
        Exception: If the extension is neither ``int`` nor ``float``; this
            includes paths that contain no dot at all (reported as
            ``extension=''``).
    """
    # Take the text after the LAST dot. Indexing split('.')[1] picks the piece
    # after the FIRST dot, which is wrong as soon as a directory or the file
    # stem contains a dot (for example './data/01/nodes.int').
    _, dot, extension = path.rpartition('.')
    if not dot:
        extension = ''

    parsers = {'int': int, 'float': float}
    if extension not in parsers:
        raise Exception(f'{extension=}')

    with open(path) as file:
        return parsers[extension](file.read())

def get_stats(network: int):
    """Collect the summary statistics of one network as a table row.

    Reads the stored property files under ``data/<network>/+000/properties/``
    and the record of ``networks.jsonl`` whose ``index`` equals ``network``.

    Args:
        network: Network index; zero-padded to two digits in the data path
            and used as the lookup key in ``networks.jsonl``.

    Returns:
        A dict keyed by column header: the label and category from
        ``networks.jsonl``, the ratio of edges to connected pairs, node and
        edge counts, the density computed from connected pairs, the stored
        assortativity, average clustering and diameter, and a LaTeX
        ``\\cite{...}`` command built from the ``source`` field.
    """
    properties_dir = f'data/{network:02}/+000/properties/'
    # Each file is read once; the stem of the file name becomes the key.
    property_files = ['nodes.int', 'edges.int', 'connected_pairs.int',
                      'assortativity.float', 'average_clustering.float', 'diameter.int']
    properties = {prop.split('.')[0]: read_file(properties_dir + prop)
                  for prop in property_files}
    info = pd.read_json('networks.jsonl', lines=True).set_index('index').loc[network]
    return {
        'Label': info['label'],
        'Domain': info['category'],
        '\\bar e': properties['edges'] / properties['connected_pairs'],
        'Nodes': properties['nodes'],
        'Edges': properties['edges'],
        # Connected pairs out of all n*(n-1)/2 unordered node pairs.
        'Density': 2*properties['connected_pairs'] / (properties['nodes']*(properties['nodes'] - 1)),
        'D.a.': properties['assortativity'],
        'A.c.c': properties['average_clustering'],
        'Diameter': properties['diameter'],
        # Escaped backslash: '\c' is not a valid escape sequence.
        '': '\\cite{' + info['source'] + '}' #type: ignore
    }

In [10]:
# Network metadata, indexed by network id.
info = pd.read_json('networks.jsonl', lines=True).set_index('index')

# One row of summary statistics per selected network.
table = pd.DataFrame({network: get_stats(network) for network in networks}).T

# Metadata, size statistics and one score column per feature set. Rows with
# any missing value are dropped and the rest are ordered by node count.
df = (
    pd.DataFrame({
        'label': info['label'],
        'domain': info['category'],
        '$\\bar e$': table['\\bar e'],
        'Nodes $(n)$': table['Nodes'],
        **{
            feature_set: {
                network: get_performance(network, feature_set=feature_set)
                for network in networks
            }
            for feature_set in ('I', 'II-A', 'II-B', 'III')
        },
    })
    .dropna()
    .sort_values('Nodes $(n)$')
)
df

ValueError: Unexpected character found when decoding 'null'

In [59]:
# Mean of the row-wise difference between the 'II-A' and 'I' columns.
df['II-A'].sub(df['I']).mean()

0.04215327987609877

In [60]:
# Standard deviation of the row-wise difference between the 'II-A' and 'I' columns.
df['II-A'].sub(df['I']).std()

0.034623752733070584