Skip to content

Commit

Permalink
Merge 42fe149 into 9fd270e
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed Sep 23, 2020
2 parents 9fd270e + 42fe149 commit 556825f
Show file tree
Hide file tree
Showing 15 changed files with 76 additions and 153 deletions.
135 changes: 47 additions & 88 deletions kg_covid_19/edges.py → kg_covid_19/make_holdouts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
import pandas as pd # type: ignore
import numpy as np # type: ignore
from tqdm import tqdm # type: ignore
from ensmallen_graph import EnsmallenGraph # type: ignore


def make_edges(nodes: str, edges: str, output_dir: str,
train_fraction: float, validation: bool,
min_degree: int, check_disconnected_nodes: bool = False,
remove_extra_cols: bool = False) -> None:
"""Prepare positive and negative edges for testing and training (see run.py edges
def make_holdouts(nodes: str, edges: str, output_dir: str,
train_fraction: float, validation: bool, seed=42) -> None:
"""Prepare positive and negative edges for testing and training (see run.py holdouts
command for documentation)
Args:
Expand All @@ -22,55 +21,50 @@ def make_edges(nodes: str, edges: str, output_dir: str,
:param output_dir: directory to output edges and new graph [data/edges/]
:param train_fraction: fraction of edges to emit as training
:param validation: should we make validation edges? [False]
:param min_degree when choosing positive edges, what is the minimum degree
of nodes involved in the edge [2]
:param check_disconnected_nodes: should we check for disconnected nodes (i.e.
nodes with degree of 0) in input graph? [False]
:param remove_extra_cols throw out columns other than ['subject', 'object',
'relation', 'edge_label'][false]
:param seed: random seed [42]
Returns:
None.
"""
logging.info("Loading edge file %s" % edges)
edges_df: pd.DataFrame
if remove_extra_cols:
edges_df = tsv_to_df(edges, usecols=['subject', 'object', 'relation',
'edge_label'])
else:
edges_df = tsv_to_df(edges)
logging.info("Loading node file %s" % nodes)
nodes_df: pd.DataFrame = tsv_to_df(nodes)

# emit warning if there are nodes in nodes tsv not present in edges tsv
logging.info("Check for disconnected nodes: %r" % check_disconnected_nodes)
if check_disconnected_nodes and has_disconnected_nodes(nodes_df, edges_df):
warnings.warn("Graph has disconnected nodes")
logging.info("Loading graph from nodes %s and edges %s files" % (nodes, edges))
graph = EnsmallenGraph.from_csv(
edge_path=edges,
sources_column='subject',
destinations_column='object',
directed=False,
edge_types_column='edge_label',
default_edge_type='biolink:Association',
node_path=nodes,
nodes_column='id',
default_node_type='biolink:NamedThing',
node_types_column='category',
ignore_duplicated_edges=True,
ignore_duplicated_nodes=True,
force_conversion_to_undirected=True
);

os.makedirs(output_dir, exist_ok=True)

# make positive edges
logging.info("Making positive edges")
pos_train_edges: pd.DataFrame
pos_test_edges: pd.DataFrame
pos_valid_edges: pd.DataFrame
pos_train_edges, pos_test_edges = \
make_positive_edges(nodes_df=nodes_df,
edges_df=edges_df,
train_fraction=train_fraction,
min_degree=min_degree)
pos_train_edges, pos_test_edges = graph.random_holdout(seed=42,
train_percentage=train_fraction)
if validation:
pos_valid_edges = pos_test_edges.sample(frac=0.5)
pos_test_edges = pos_test_edges.drop(pos_valid_edges.index)
pos_valid_edges, pos_test_edges = \
pos_test_edges.random_holdout(seed=seed,
train_percentage=0.5)

# make negative edges
logging.info("Making negative edges")
neg_edges_df: pd.DataFrame = make_negative_edges(nodes_df, edges_df)
neg_train_edges: pd.DataFrame = neg_edges_df.sample(frac=train_fraction)
neg_test_edges: pd.DataFrame = neg_edges_df.drop(neg_train_edges.index)
neg_valid_edges: pd.DataFrame

all_negative_edges = \
pos_train_edges.sample_negatives(seed=seed,
negatives_number=graph.get_edges_number(),
allow_selfloops=False)
neg_train_edges, neg_test_edges = \
all_negative_edges.random_holdout(seed=seed, train_percentage=train_fraction)
if validation:
neg_valid_edges = neg_test_edges.sample(frac=0.5)
neg_test_edges = neg_test_edges.drop(neg_valid_edges.index)
neg_test_edges, neg_valid_edges = \
neg_test_edges.random_holdout(seed=seed, train_percentage=0.5)

#
# write out positive edges
Expand All @@ -81,11 +75,12 @@ def make_edges(nodes: str, edges: str, output_dir: str,
pos_train_nodes_outfile = os.path.join(output_dir, "pos_train_nodes.tsv")
pos_test_edges_outfile = os.path.join(output_dir, "pos_test_edges.tsv")
pos_valid_edges_outfile = os.path.join(output_dir, "pos_valid_edges.tsv")
df_to_tsv(df=pos_train_edges, outfile=pos_train_edges_outfile)
df_to_tsv(df=nodes_df, outfile=pos_train_nodes_outfile)
df_to_tsv(df=pos_test_edges, outfile=pos_test_edges_outfile)

pos_train_edges.to_edges_csv(edges_path=pos_train_edges_outfile)
pos_train_edges.to_nodes_csv(nodes_path=pos_train_nodes_outfile)
pos_test_edges.to_edges_csv(edges_path=pos_test_edges_outfile)
if validation:
df_to_tsv(df=pos_valid_edges, outfile=pos_valid_edges_outfile)
pos_valid_edges.to_edges_csv(edges_path=pos_valid_edges_outfile)

#
# write out negative edges
Expand All @@ -94,10 +89,11 @@ def make_edges(nodes: str, edges: str, output_dir: str,
neg_train_edges_outfile = os.path.join(output_dir, "neg_train_edges.tsv")
neg_test_edges_outfile = os.path.join(output_dir, "neg_test_edges.tsv")
neg_valid_edges_outfile = os.path.join(output_dir, "neg_valid_edges.tsv")
df_to_tsv(df=neg_train_edges, outfile=neg_train_edges_outfile)
df_to_tsv(df=neg_test_edges, outfile=neg_test_edges_outfile)

neg_train_edges.to_edges_csv(edges_path=neg_train_edges_outfile)
neg_test_edges.to_edges_csv(edges_path=neg_test_edges_outfile)
if validation:
df_to_tsv(df=neg_valid_edges, outfile=neg_valid_edges_outfile)
neg_valid_edges.to_edges_csv(edges_path=neg_valid_edges_outfile)


def df_to_tsv(df: pd.DataFrame, outfile: str, sep="\t", index=False) -> None:
Expand All @@ -109,7 +105,7 @@ def make_negative_edges(nodes_df: pd.DataFrame,
edge_label: str = 'negative_edge',
relation: str = 'negative_edge'
) -> pd.DataFrame:
"""Given a graph (as nodes and edges pandas dataframes), select num_edges edges that
"""Given a graph (as nodes and edges pandas dataframes), select num_edges holdouts that
are NOT present in the graph
:param nodes_df: pandas dataframe containing node info
Expand Down Expand Up @@ -200,8 +196,7 @@ def _generate_negative_edges(nodes_df: pd.DataFrame,

def make_positive_edges(nodes_df: pd.DataFrame,
edges_df: pd.DataFrame,
train_fraction: float,
min_degree: int) -> List[pd.DataFrame]:
train_fraction: float) -> List[pd.DataFrame]:
"""Positive edges are randomly selected from the edges in the graph, IFF both nodes
participating in the edge have a degree greater than min_degree (to avoid creating
disconnected components). This edge is then removed in the output graph. Negative
Expand All @@ -212,11 +207,10 @@ def make_positive_edges(nodes_df: pd.DataFrame,
:param edges_df: pandas dataframe with edge info, generated from KGX TSV file
:param train_fraction: fraction of input edges to emit as test (and optionally
validation) edges
:param min_degree: the minimum degree of nodes to be selected for positive edges
:return: pandas dataframes:
training_edges_df: a dataframe with training edges with positive edges we
selected for test removed from graph
test_edges_df: a dataframe with training edges with positive edges
test_edges_df: a dataframe with test positive edges
"""
if 'subject' not in list(edges_df.columns) or \
'object' not in list(edges_df.columns):
Expand Down Expand Up @@ -245,13 +239,6 @@ def make_positive_edges(nodes_df: pd.DataFrame,
test_edges = test_edges.merge(obj_degree_df, how='left', on='object')
pbar.update()

pbar.set_description("Removing edges < min_degree")
test_edges.drop(test_edges[test_edges['subj_degree'] < min_degree].index,
inplace=True)
test_edges.drop(test_edges[test_edges['obj_degree'] < min_degree].index,
inplace=True)
pbar.update()

pbar.set_description("Adding edge_label and relation columns")
test_edges = test_edges.sample(frac=(1-train_fraction))
test_edges['edge_label'] = 'positive_edge'
Expand All @@ -270,34 +257,6 @@ def make_positive_edges(nodes_df: pd.DataFrame,
return [train_edges, test_edges]


def has_disconnected_nodes(nodes_df: pd.DataFrame, edges_df: pd.DataFrame,
check_nodes_in_edge_df_not_in_node_df=True) -> bool:
"""Given nodes and edges df, determine if there are nodes that are not present in
edges (disconnected vertices)
:param nodes_df: pandas dataframe with node info
:param edges_df: pandas dataframe with edge info
:param check_nodes_in_edge_df_not_in_node_df: while we're at it, check if
edge df has nodes not mentioned in node df [True]
:return: bool
"""
nodes_in_edge_file = \
np.sort(np.unique(np.concatenate((edges_df.subject, edges_df.object))))
nodes_in_node_file = np.sort(nodes_df.id.unique())

if check_nodes_in_edge_df_not_in_node_df:
diff = len(np.setdiff1d(nodes_in_edge_file, nodes_in_node_file))
if diff != 0:
warnings.warn(
"There are %i nodes in edge file that aren't in nodes file" % diff)

# if setdiff below is zero, odes_in_node_file is a subset of nodes_in_edge_file
if len(np.setdiff1d(nodes_in_node_file, nodes_in_edge_file)) == 0:
return False
else:
return True


def tsv_to_df(tsv_file: str, *args, **kwargs) -> pd.DataFrame:
"""Read in a TSV file and return a pandas dataframe
Expand Down
26 changes: 11 additions & 15 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import click
from kg_covid_19 import download as kg_download
from kg_covid_19 import transform as kg_transform
from kg_covid_19.edges import make_edges
from kg_covid_19.make_holdouts import make_holdouts
from kg_covid_19.merge_utils.merge_kg import load_and_merge
from kg_covid_19.query import run_query, parse_query_yaml, result_dict_to_tsv
from kg_covid_19.transform import DATA_SOURCES
Expand Down Expand Up @@ -114,25 +114,22 @@ def query(yaml: str, output_dir: str,
@cli.command()
@click.option("nodes", "-n", default="data/merged/nodes.tsv", type=click.Path(exists=True))
@click.option("edges", "-e", default="data/merged/edges.tsv", type=click.Path(exists=True))
@click.option("output_dir", "-o", default="data/edges/", type=click.Path())
@click.option("output_dir", "-o", default="data/holdouts/", type=click.Path())
@click.option("train_fraction", "-t", default=0.8, type=float)
@click.option("validation", "-v", is_flag=True, default=False)
@click.option("min_degree", "-m", default=2, type=click.IntRange(min=0, max=None,
clamp=False))
def edges(*args, **kwargs) -> None:
"""Make sets of edges for ML training
def holdouts(*args, **kwargs) -> None:
"""Make holdouts for ML training
Given a graph (from formatted node and edge TSVs), output positive edges and negative
edges for use in machine learning.
To generate positive edges: a set of test positive edges equal in number to
[(1 - train_fraction) * number of edges in input graph] are randomly selected from
the edges in the input graph, such that both nodes participating in the edge have a
degree greater than min_degree (to avoid creating disconnected components). These
edges are emitting as positive test edges. (If -v == true, the test positive edges
are divided equally to yield test and validation positive edges.) These edges are
then removed from the edges of the input graph, and these are emitted as the
training edges.
the edges in the input graph that is not part of a minimal spanning tree, such that
removing the edge does not create new components. These edges are emitting as
positive test edges. (If -v == true, the test positive edges are divided equally to
yield test and validation positive edges.) These edges are then removed from the
edges of the input graph, and these are emitted as the training edges.
Negative edges are selected by randomly selecting pairs of nodes that are not
connected by an edge in the input graph. The number of negative edges emitted is
Expand All @@ -154,10 +151,9 @@ def edges(*args, **kwargs) -> None:
:param output_dir: directory to output edges and new graph [data/edges/]
:param train_fraction: fraction of edges to emit as training [0.8]
:param validation: should we make validation edges? [False]
:param min_degree when choosing edges, what is the minimum degree of nodes
involved in the edge [1]
"""
make_edges(*args, **kwargs)
make_holdouts(*args, **kwargs)


if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def find_version(*file_paths):
'click',
'pyyaml',
'bmt',
'SPARQLWrapper'
'SPARQLWrapper',
'ensmallen_graph'
],
extras_require=extras,
)
File renamed without changes.
File renamed without changes.

0 comments on commit 556825f

Please sign in to comment.