In [1]:
import os
import sys

import numpy as np
import pandas as pd
from neo4j import ServiceUnavailable
from sklearn.utils import shuffle

from CORD19_GraphOfDocs import select
from CORD19_GraphOfDocs.neo4j_wrapper import Neo4jDatabase

In [2]:
def connect_to_the_database(uri=None, user=None, password=None):
    """Open a connection to the Neo4j database, exiting if it is unreachable.

    Each connection setting is resolved in this order: the explicit argument,
    then the ``NEO4J_URI`` / ``NEO4J_USER`` / ``NEO4J_PASSWORD`` environment
    variable, then the local-development value that used to be hard-coded
    here (kept only so existing no-argument calls keep working).

    Args:
        uri: Connection URI; ``None`` means "use the environment or the default".
        user: User name; ``None`` means "use the environment or the default".
        password: Password; ``None`` means "use the environment or the default".

    Returns:
        The ``Neo4jDatabase`` instance that was created.

    Raises:
        SystemExit: With status 1 when ``ServiceUnavailable`` is raised while
            creating the ``Neo4jDatabase``.
    """
    if uri is None:
        uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
    if user is None:
        user = os.environ.get('NEO4J_USER', 'neo4j')
    if password is None:
        # Prefer setting NEO4J_PASSWORD over relying on this development default.
        password = os.environ.get('NEO4J_PASSWORD', '123')

    try:
        return Neo4jDatabase(uri, user, password)
    except ServiceUnavailable:
        # Neo4j server is unavailable: this client app cannot open a connection.
        print('\t* Neo4j database is unavailable.')
        print('\t* Please check the database connection before running this app.')
        input('\t* Press any key to exit the app...')
        sys.exit(1)

def disconnect_from_the_database(database):
    """Close ``database`` by calling its ``close()`` method.

    Args:
        database: Object exposing ``close()``; in this notebook it is the
            value returned by ``connect_to_the_database``.
    """
    database.close()

In [3]:
# Open the database handle that the sampling and feature cells below pass to `select`.
database = connect_to_the_database()

In [4]:
def remove_duplicates(df):
    """Drop rows whose unordered (node1, node2) pair already appeared earlier.

    The two node columns are sorted within each row so that (a, b) and (b, a)
    compare equal; the first occurrence of each pair is kept.

    Args:
        df: DataFrame with at least ``node1`` and ``node2`` columns. Any index
            is accepted.

    Returns:
        The surviving rows of ``df``, in their original order and with their
        original index labels.
    """
    sorted_pairs = np.sort(df[['node1', 'node2']].to_numpy(), axis=1)
    # Use a positional NumPy mask. The helper frame built here has a fresh
    # RangeIndex, so a boolean *Series* taken from it would be aligned to
    # ``df`` by label: that raises for a non 0..n-1 index and silently drops
    # the wrong rows for a permuted (e.g. shuffled) one.
    is_duplicate = pd.DataFrame(sorted_pairs).duplicated().to_numpy()
    return df[~is_duplicate]

In [5]:
number_of_samples = 1000
# Fixed seed: without it every run shuffles differently, so the selected pairs
# (and the CSV files written at the end of the notebook) change between runs.
RANDOM_SEED = 42
columns = ['node1', 'node2', 'label']


def build_balanced_samples(positive_samples, negative_samples, rng):
    """Return de-duplicated, shuffled and class-balanced sample frames.

    Each sample list is loaded into a DataFrame with the ``columns`` layout,
    passed through ``remove_duplicates`` and shuffled; both frames are then
    truncated to the length of the shorter one.

    Args:
        positive_samples: Rows for the positive class, as returned by
            ``select.get_positive_examples``.
        negative_samples: Rows for the negative class, as returned by
            ``select.get_negative_examples``.
        rng: ``numpy.random.RandomState`` that drives both shuffles.

    Returns:
        Tuple ``(positive_df, negative_df)`` of equal length.
    """
    positive_df = shuffle(
        remove_duplicates(pd.DataFrame(positive_samples, columns=columns)),
        random_state=rng,
    )
    negative_df = shuffle(
        remove_duplicates(pd.DataFrame(negative_samples, columns=columns)),
        random_state=rng,
    )
    samples_num = min(len(positive_df), len(negative_df))
    return positive_df.iloc[:samples_num], negative_df.iloc[:samples_num]


# Re-created on every execution of this cell, so re-running it repeats the
# same sequence of shuffles.
rng = np.random.RandomState(RANDOM_SEED)

train_positive_samples = select.get_positive_examples(database, limit=number_of_samples, train_set=True)
train_negative_samples = select.get_negative_examples(database, limit=number_of_samples, train_set=True)

test_positive_samples = select.get_positive_examples(database, limit=number_of_samples, train_set=False)
# NOTE(review): only the test negatives pass min_hops/max_hops; the train
# negatives rely on the helper's defaults. Confirm this asymmetry is intended.
test_negative_samples = select.get_negative_examples(database, limit=number_of_samples, train_set=False, min_hops=2, max_hops=2)

train_positive_samples_df, train_negative_samples_df = build_balanced_samples(
    train_positive_samples, train_negative_samples, rng
)
test_positive_samples_df, test_negative_samples_df = build_balanced_samples(
    test_positive_samples, test_negative_samples, rng
)

# Mix positives and negatives so neither split is ordered by label.
train_df = shuffle(pd.concat([train_positive_samples_df, train_negative_samples_df]), random_state=rng)
test_df = shuffle(pd.concat([test_positive_samples_df, test_negative_samples_df]), random_state=rng)
print(len(train_df))
print(len(test_df))

846
1544


In [6]:
# Sanity check: count the rows of each label in both splits.
train_positive_count = int((train_df['label'] == 1).sum())
train_negative_count = int((train_df['label'] == 0).sum())
test_positive_count = int((test_df['label'] == 1).sum())
test_negative_count = int((test_df['label'] == 0).sum())

print(f'Train df positive examples {train_positive_count}')
print(f'Train df negative examples {train_negative_count}')
print(f'Test df positive examples {test_positive_count}')
print(f'Test df negative examples {test_negative_count}')

Train df positive examples 423
Train df negative examples 423
Test df positive examples 772
Test df negative examples 772


In [7]:
def get_samples_dataframe_with_features(database, original_df, train_set):
    """Build a feature DataFrame for the node pairs in ``original_df``.

    The ``node1``, ``node2`` and ``label`` columns are sent, as a list of rows,
    to ``select.create_graph_features``; its result is wrapped in a new
    DataFrame with the fixed column layout listed below.

    Args:
        database: Database handle forwarded to ``select.create_graph_features``.
        original_df: DataFrame providing ``node1``, ``node2`` and ``label``.
        train_set: Flag forwarded unchanged as the ``train_set`` keyword.

    Returns:
        A new DataFrame with the node pair, the graph feature columns and the label.
    """
    feature_columns = [
        'node1',
        'node2',
        'adamic_adar',
        'common_neighbors',
        'preferential_attachment',
        'total_neighbors',
        'similarity',
        'label',
    ]
    pair_rows = original_df.loc[:, ['node1', 'node2', 'label']].to_numpy().tolist()
    feature_rows = select.create_graph_features(database, pair_rows, train_set=train_set)
    return pd.DataFrame(feature_rows, columns=feature_columns)

# Replace each split with its feature-enriched version (same names are reused).
print('Calculate train df features')
train_df = get_samples_dataframe_with_features(
    database=database,
    original_df=train_df,
    train_set=True,
)
print('Calculate test df features')
test_df = get_samples_dataframe_with_features(
    database=database,
    original_df=test_df,
    train_set=False,
)

Calculate train df features
Calculate test df features


In [8]:
# Missing values in the `similarity` column are replaced by 0; other columns are untouched.
similarity_fill = {'similarity': 0}
train_df = train_df.fillna(similarity_fill)
test_df = test_df.fillna(similarity_fill)

In [9]:
# All queries are done; release the database handle before writing the CSV files.
disconnect_from_the_database(database)

In [10]:
# Persist both splits; the row count is embedded in each file name.
train_csv_path = f'train_{len(train_df)}.csv'
test_csv_path = f'test_{len(test_df)}.csv'
train_df.to_csv(train_csv_path)
test_df.to_csv(test_csv_path)