In [None]:
import pandas as pd
import re
import subprocess
import matplotlib as mpl
mpl.use('Agg') 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Rename the jar: `mv Untitled.jar ConnectedComponents.jar`

Running the jar from the project directory worked:

```
badger:Neo4j_meta4 janet$ pwd
/Users/janet/Neo4j_meta4
badger:Neo4j_meta4 janet$ java -jar ConnectedComponents.jar 0.03
``` 

In [None]:
# Show the working directory: the jar and database paths used below are relative to it.
! pwd

In [None]:
# Java executable used for every `java -jar ...` call in this notebook.
# On Waffle, until the default Java is set to 8, use the explicit Java 8 path:
#java="/usr/lib/jvm/java-8-oracle/jre/bin/java"
# On badger, the plain `java` command is used:
java='java'

In [None]:
# Re-check the working directory (same command as the earlier cell).
! pwd

In [None]:
# List the jars in ../jars; the cells below run ConstructNetwork50M.jar and
# ConnectedComponentsFinder50M.jar from that directory.
! ls -l ../jars/*.jar

In [None]:
# Run the network builder once with cutoff 0.06 and keep its raw stdout (bytes)
# so the parsing helper below can be tried on real output.
command = [
    java, '-jar', '../jars/ConstructNetwork50M.jar', '0.06',
]
print(*command)
example_build = subprocess.check_output(command)

In [None]:
# Show the builder's captured stdout as decoded text.
print(str(example_build, 'utf-8'))

In [None]:
def parse_db_building_stdout(stdout):
    """Extract node count, edge count and graph density from the builder's output.

    Parameters
    ----------
    stdout : str or bytes
        Captured standard output of the network-construction jar. ``bytes``
        are decoded as UTF-8 before matching.

    Returns
    -------
    tuple of (str, str, str)
        ``(nodes, edges, density)`` as the matched text; no numeric conversion
        is performed. The density may be in plain decimal or exponent
        notation (for example ``1.5E-4``).

    Raises
    ------
    AssertionError
        If the output does not contain exactly one node/edge report and
        exactly one density report.
    """
    # Match against the decoded text rather than the repr of a bytes object.
    if isinstance(stdout, bytes):
        stdout = stdout.decode('utf-8', errors='replace')
    text = str(stdout)

    nodes_edges = re.findall(
        r'after network construction: (\d+), (\d+)', text)
    print(nodes_edges)
    assert len(nodes_edges) == 1, 'expected one count for nodes and one for edges'
    nodes, edges = nodes_edges[0]

    # The optional exponent keeps values such as "1.5E-4" intact instead of
    # silently truncating them to "1.5".
    density = re.findall(
        r'Graph density: ([-+]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][-+]?\d+)?)', text)
    assert len(density) == 1, 'should report one density.  Found {}'.format(density)
    print(density)
    return nodes, edges, density[0]

In [None]:
# Try the parser on the example build output; the (nodes, edges, density) tuple
# is displayed as the cell result.
parse_db_building_stdout(example_build.decode('utf-8'))

In [None]:
# Run the connected-components finder once on the 0.06 database and keep its
# raw stdout (bytes) for inspection in the next cells.
query = [
    java, '-jar', '../jars/ConnectedComponentsFinder50M.jar',
    '../data_mining_Neo4j_v2_3_2/databases/db_50M_0.060000',
]
print(*query)
example_result = subprocess.check_output(query)

In [None]:
# Show the finder's captured stdout as decoded text.
print(str(example_result, 'utf-8'))

In [None]:
# Pull the summary sentence out of the example output. The text is decoded
# (not str()-ed, which would search the bytes repr) and the decimal point in
# the cutoff is escaped so it only matches a literal ".".
re.findall(r'There are \d+ different connected components for cutoff \d+\.\d+',
           example_result.decode('utf-8'))

### Run the real query

In [None]:
# Cutoffs to process: one database is built per value (recorded as 'pcor cutoff'
# by build_databases) and then searched for connected components.
# NOTE(review): 0.65 is far outside the 0.01-0.07 sweep commented out below --
# confirm it is intended and not a typo.
cutoffs = [0.06, 0.65]
#cutoffs = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07]

In [None]:
def build_databases(cutoffs):
    """Build one network database per cutoff and tabulate its size statistics.

    For each cutoff, ConstructNetwork50M.jar is run with the cutoff as its only
    argument and its stdout is parsed with ``parse_db_building_stdout``.

    Parameters
    ----------
    cutoffs : iterable of float
        Cutoff values, each passed to the jar as ``str(cutoff)``.

    Returns
    -------
    pandas.DataFrame
        One row per cutoff, in input order, with columns ``'pcor cutoff'``,
        ``'nodes'``, ``'edges'`` and ``'density'`` (the last three as the
        strings returned by the parser). Rows are indexed 0..n-1. An empty
        frame is returned when ``cutoffs`` is empty.

    Side effects
    ------------
    Writes ``database_construction_info.tsv`` (tab-separated, matching its
    extension) to the working directory. The file is rewritten after every
    build, so rows for completed builds are on disk even if a later build
    raises. Nothing is written when ``cutoffs`` is empty.
    """
    records = []
    db_info = pd.DataFrame()
    for c in cutoffs:
        print('build db for cutoff = {}'.format(c))
        build_stdout = subprocess.check_output(
            [java, '-jar', '../jars/ConstructNetwork50M.jar', str(c)])
        nodes, edges, density = parse_db_building_stdout(build_stdout.decode('utf-8'))
        records.append(
            {'pcor cutoff': c, 'nodes': nodes, 'edges': edges, 'density': density})
        # Rebuild from the record list rather than concatenating frames, so the
        # index is a plain row number instead of a repeated 0.
        db_info = pd.DataFrame(records)
        db_info.to_csv('database_construction_info.tsv', sep='\t')
    # TODO: plot time for construction w/ different # of nodes.
    return db_info
    

In [None]:
# Build a database for every cutoff; the returned summary table is displayed
# as the cell result.
build_databases(cutoffs)

In [None]:
# List the database directory that the connected-components cells below read from.
! ls -l ../data_mining_Neo4j_v2_3_2/databases/

In [None]:
def connected_components(filepath):
    """Run the connected-components finder on one database and parse its summary.

    Parameters
    ----------
    filepath : str or path-like
        Database path, passed as the only argument to
        ConnectedComponentsFinder50M.jar.

    Returns
    -------
    dict
        ``{'cutoff': [float], 'connected components': [int]}``. Each value is
        a one-element list, so the dict can be passed straight to
        ``pd.DataFrame`` to make a single-row frame.

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status.
    ValueError
        If the output lacks the expected
        "There are N different connected components for cutoff X.Y" sentence.
    """
    print('find connected components for {}'.format(filepath))
    command = [java, '-jar', '../jars/ConnectedComponentsFinder50M.jar', str(filepath)]
    print(" ".join(command))
    output = subprocess.check_output(command).decode('utf-8')

    # One raw-string pattern with capture groups replaces the three separate
    # passes; the decimal point is escaped so it only matches a literal ".".
    match = re.search(
        r'There are (\d+) different connected components for cutoff (\d+\.\d+)',
        output)
    if match is None:
        raise ValueError(
            'no connected-components summary found in the output for {}'.format(filepath))
    print(match.group(0))
    return {'cutoff': [float(match.group(2))],
            'connected components': [int(match.group(1))]}

In [None]:
# Try the helper on the 0.06 database; the returned dict is displayed as the cell result.
connected_components('../data_mining_Neo4j_v2_3_2/databases/db_50M_0.060000')

In [None]:
# Count connected components for every cutoff and collect one row per cutoff.
component_frames = []
for c in cutoffs:
    print("assess cutoff {}".format(c))
    # Database directory names carry the cutoff with six decimals
    # (e.g. db_50M_0.060000). Format it explicitly: appending '0000' to
    # str(c) is only right for two-decimal cutoffs (0.1 -> '0.10000').
    path = '../data_mining_Neo4j_v2_3_2/databases/db_50M_{:.6f}'.format(c)
    component_frames.append(pd.DataFrame(connected_components(path)))
# Concatenate once, with a fresh 0..n-1 index; pd.concat rejects an empty list.
results = (pd.concat(component_frames, axis=0, ignore_index=True)
           if component_frames else pd.DataFrame())
results

In [None]:
def plot(df, x_col, y_col, filename=None,
         title='Number of non-singleton connected components'):
    """Draw ``y_col`` against ``x_col`` as a dashed line with circle markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns.
    x_col, y_col : str
        Column names; also used as the axis labels, and ``y_col`` as the
        legend entry.
    filename : str, optional
        If given (and truthy), the figure is also saved to this path.
    title : str, optional
        Figure title. Defaults to the title this function has always used.

    Returns
    -------
    matplotlib.figure.Figure
        The 4x3 inch figure that was drawn.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    # The line is labelled so that the legend call below has an entry to show.
    ax.plot(df[x_col], df[y_col], linestyle='--', marker='o',
            color='#756bb1', label=y_col)
    ax.set_ylim(bottom=0)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(title)
    fig.tight_layout()
    if filename:
        fig.savefig(filename)
    return fig

In [None]:
# Keep the figure under its own name: binding it to `connected_components`
# would replace the function of that name, so re-running the cutoff loop
# would then fail.
connected_components_fig = plot(results, 'cutoff', 'connected components')