diff --git a/kg_covid_19/edges.py b/kg_covid_19/make_holdouts.py similarity index 66% rename from kg_covid_19/edges.py rename to kg_covid_19/make_holdouts.py index d853db90..3f8a73dd 100644 --- a/kg_covid_19/edges.py +++ b/kg_covid_19/make_holdouts.py @@ -7,13 +7,12 @@ import pandas as pd # type: ignore import numpy as np # type: ignore from tqdm import tqdm # type: ignore +from ensmallen_graph import EnsmallenGraph # type: ignore -def make_edges(nodes: str, edges: str, output_dir: str, - train_fraction: float, validation: bool, - min_degree: int, check_disconnected_nodes: bool = False, - remove_extra_cols: bool = False) -> None: - """Prepare positive and negative edges for testing and training (see run.py edges +def make_holdouts(nodes: str, edges: str, output_dir: str, + train_fraction: float, validation: bool, seed=42) -> None: + """Prepare positive and negative edges for testing and training (see run.py holdouts command for documentation) Args: @@ -22,55 +21,50 @@ def make_edges(nodes: str, edges: str, output_dir: str, :param output_dir: directory to output edges and new graph [data/edges/] :param train_fraction: fraction of edges to emit as training :param validation: should we make validation edges? [False] - :param min_degree when choosing positive edges, what is the minimum degree - of nodes involved in the edge [2] - :param check_disconnected_nodes: should we check for disconnected nodes (i.e. - nodes with degree of 0) in input graph? [False] - :param remove_extra_cols throw out columns other than ['subject', 'object', - 'relation', 'edge_label'][false] + :param seed: random seed [42] Returns: None. 
""" - logging.info("Loading edge file %s" % edges) - edges_df: pd.DataFrame - if remove_extra_cols: - edges_df = tsv_to_df(edges, usecols=['subject', 'object', 'relation', - 'edge_label']) - else: - edges_df = tsv_to_df(edges) - logging.info("Loading node file %s" % nodes) - nodes_df: pd.DataFrame = tsv_to_df(nodes) - - # emit warning if there are nodes in nodes tsv not present in edges tsv - logging.info("Check for disconnected nodes: %r" % check_disconnected_nodes) - if check_disconnected_nodes and has_disconnected_nodes(nodes_df, edges_df): - warnings.warn("Graph has disconnected nodes") + logging.info("Loading graph from nodes %s and edges %s files" % (nodes, edges)) + graph = EnsmallenGraph.from_csv( + edge_path=edges, + sources_column='subject', + destinations_column='object', + directed=False, + edge_types_column='edge_label', + default_edge_type='biolink:Association', + node_path=nodes, + nodes_column='id', + default_node_type='biolink:NamedThing', + node_types_column='category', + ignore_duplicated_edges=True, + ignore_duplicated_nodes=True, + force_conversion_to_undirected=True + ); os.makedirs(output_dir, exist_ok=True) # make positive edges logging.info("Making positive edges") - pos_train_edges: pd.DataFrame - pos_test_edges: pd.DataFrame - pos_valid_edges: pd.DataFrame - pos_train_edges, pos_test_edges = \ - make_positive_edges(nodes_df=nodes_df, - edges_df=edges_df, - train_fraction=train_fraction, - min_degree=min_degree) + pos_train_edges, pos_test_edges = graph.random_holdout(seed=42, + train_percentage=train_fraction) if validation: - pos_valid_edges = pos_test_edges.sample(frac=0.5) - pos_test_edges = pos_test_edges.drop(pos_valid_edges.index) + pos_valid_edges, pos_test_edges = \ + pos_test_edges.random_holdout(seed=seed, + train_percentage=0.5) # make negative edges logging.info("Making negative edges") - neg_edges_df: pd.DataFrame = make_negative_edges(nodes_df, edges_df) - neg_train_edges: pd.DataFrame = 
neg_edges_df.sample(frac=train_fraction) - neg_test_edges: pd.DataFrame = neg_edges_df.drop(neg_train_edges.index) - neg_valid_edges: pd.DataFrame + + all_negative_edges = \ + pos_train_edges.sample_negatives(seed=seed, + negatives_number=graph.get_edges_number(), + allow_selfloops=False) + neg_train_edges, neg_test_edges = \ + all_negative_edges.random_holdout(seed=seed, train_percentage=train_fraction) if validation: - neg_valid_edges = neg_test_edges.sample(frac=0.5) - neg_test_edges = neg_test_edges.drop(neg_valid_edges.index) + neg_test_edges, neg_valid_edges = \ + neg_test_edges.random_holdout(seed=seed, train_percentage=0.5) # # write out positive edges @@ -81,11 +75,12 @@ def make_edges(nodes: str, edges: str, output_dir: str, pos_train_nodes_outfile = os.path.join(output_dir, "pos_train_nodes.tsv") pos_test_edges_outfile = os.path.join(output_dir, "pos_test_edges.tsv") pos_valid_edges_outfile = os.path.join(output_dir, "pos_valid_edges.tsv") - df_to_tsv(df=pos_train_edges, outfile=pos_train_edges_outfile) - df_to_tsv(df=nodes_df, outfile=pos_train_nodes_outfile) - df_to_tsv(df=pos_test_edges, outfile=pos_test_edges_outfile) + + pos_train_edges.to_edges_csv(edges_path=pos_train_edges_outfile) + pos_train_edges.to_nodes_csv(nodes_path=pos_train_nodes_outfile) + pos_test_edges.to_edges_csv(edges_path=pos_test_edges_outfile) if validation: - df_to_tsv(df=pos_valid_edges, outfile=pos_valid_edges_outfile) + pos_valid_edges.to_edges_csv(edges_path=pos_valid_edges_outfile) # # write out negative edges @@ -94,10 +89,11 @@ def make_edges(nodes: str, edges: str, output_dir: str, neg_train_edges_outfile = os.path.join(output_dir, "neg_train_edges.tsv") neg_test_edges_outfile = os.path.join(output_dir, "neg_test_edges.tsv") neg_valid_edges_outfile = os.path.join(output_dir, "neg_valid_edges.tsv") - df_to_tsv(df=neg_train_edges, outfile=neg_train_edges_outfile) - df_to_tsv(df=neg_test_edges, outfile=neg_test_edges_outfile) + + 
neg_train_edges.to_edges_csv(edges_path=neg_train_edges_outfile) + neg_test_edges.to_edges_csv(edges_path=neg_test_edges_outfile) if validation: - df_to_tsv(df=neg_valid_edges, outfile=neg_valid_edges_outfile) + neg_valid_edges.to_edges_csv(edges_path=neg_valid_edges_outfile) def df_to_tsv(df: pd.DataFrame, outfile: str, sep="\t", index=False) -> None: @@ -109,7 +105,7 @@ def make_negative_edges(nodes_df: pd.DataFrame, edge_label: str = 'negative_edge', relation: str = 'negative_edge' ) -> pd.DataFrame: - """Given a graph (as nodes and edges pandas dataframes), select num_edges edges that + """Given a graph (as nodes and edges pandas dataframes), select num_edges holdouts that are NOT present in the graph :param nodes_df: pandas dataframe containing node info @@ -200,8 +196,7 @@ def _generate_negative_edges(nodes_df: pd.DataFrame, def make_positive_edges(nodes_df: pd.DataFrame, edges_df: pd.DataFrame, - train_fraction: float, - min_degree: int) -> List[pd.DataFrame]: + train_fraction: float) -> List[pd.DataFrame]: """Positive edges are randomly selected from the edges in the graph, IFF both nodes participating in the edge have a degree greater than min_degree (to avoid creating disconnected components). This edge is then removed in the output graph. 
Negative @@ -212,11 +207,10 @@ def make_positive_edges(nodes_df: pd.DataFrame, :param edges_df: pandas dataframe with edge info, generated from KGX TSV file :param train_fraction: fraction of input edges to emit as test (and optionally validation) edges - :param min_degree: the minimum degree of nodes to be selected for positive edges :return: pandas dataframes: training_edges_df: a dataframe with training edges with positive edges we selected for test removed from graph - test_edges_df: a dataframe with training edges with positive edges + test_edges_df: a dataframe with test positive edges """ if 'subject' not in list(edges_df.columns) or \ 'object' not in list(edges_df.columns): @@ -245,13 +239,6 @@ def make_positive_edges(nodes_df: pd.DataFrame, test_edges = test_edges.merge(obj_degree_df, how='left', on='object') pbar.update() - pbar.set_description("Removing edges < min_degree") - test_edges.drop(test_edges[test_edges['subj_degree'] < min_degree].index, - inplace=True) - test_edges.drop(test_edges[test_edges['obj_degree'] < min_degree].index, - inplace=True) - pbar.update() - pbar.set_description("Adding edge_label and relation columns") test_edges = test_edges.sample(frac=(1-train_fraction)) test_edges['edge_label'] = 'positive_edge' @@ -270,34 +257,6 @@ def make_positive_edges(nodes_df: pd.DataFrame, return [train_edges, test_edges] -def has_disconnected_nodes(nodes_df: pd.DataFrame, edges_df: pd.DataFrame, - check_nodes_in_edge_df_not_in_node_df=True) -> bool: - """Given nodes and edges df, determine if there are nodes that are not present in - edges (disconnected vertices) - - :param nodes_df: pandas dataframe with node info - :param edges_df: pandas dataframe with edge info - :param check_nodes_in_edge_df_not_in_node_df: while we're at it, check if - edge df has nodes not mentioned in node df [True] - :return: bool - """ - nodes_in_edge_file = \ - np.sort(np.unique(np.concatenate((edges_df.subject, edges_df.object)))) - nodes_in_node_file = 
np.sort(nodes_df.id.unique()) - - if check_nodes_in_edge_df_not_in_node_df: - diff = len(np.setdiff1d(nodes_in_edge_file, nodes_in_node_file)) - if diff != 0: - warnings.warn( - "There are %i nodes in edge file that aren't in nodes file" % diff) - - # if setdiff below is zero, odes_in_node_file is a subset of nodes_in_edge_file - if len(np.setdiff1d(nodes_in_node_file, nodes_in_edge_file)) == 0: - return False - else: - return True - - def tsv_to_df(tsv_file: str, *args, **kwargs) -> pd.DataFrame: """Read in a TSV file and return a pandas dataframe diff --git a/run.py b/run.py index 77ce8ea7..ccda5c17 100644 --- a/run.py +++ b/run.py @@ -5,7 +5,7 @@ import click from kg_covid_19 import download as kg_download from kg_covid_19 import transform as kg_transform -from kg_covid_19.edges import make_edges +from kg_covid_19.make_holdouts import make_holdouts from kg_covid_19.merge_utils.merge_kg import load_and_merge from kg_covid_19.query import run_query, parse_query_yaml, result_dict_to_tsv from kg_covid_19.transform import DATA_SOURCES @@ -114,25 +114,22 @@ def query(yaml: str, output_dir: str, @cli.command() @click.option("nodes", "-n", default="data/merged/nodes.tsv", type=click.Path(exists=True)) @click.option("edges", "-e", default="data/merged/edges.tsv", type=click.Path(exists=True)) -@click.option("output_dir", "-o", default="data/edges/", type=click.Path()) +@click.option("output_dir", "-o", default="data/holdouts/", type=click.Path()) @click.option("train_fraction", "-t", default=0.8, type=float) @click.option("validation", "-v", is_flag=True, default=False) -@click.option("min_degree", "-m", default=2, type=click.IntRange(min=0, max=None, - clamp=False)) -def edges(*args, **kwargs) -> None: - """Make sets of edges for ML training +def holdouts(*args, **kwargs) -> None: + """Make holdouts for ML training Given a graph (from formatted node and edge TSVs), output positive edges and negative edges for use in machine learning. 
To generate positive edges: a set of test positive edges equal in number to [(1 - train_fraction) * number of edges in input graph] are randomly selected from - the edges in the input graph, such that both nodes participating in the edge have a - degree greater than min_degree (to avoid creating disconnected components). These - edges are emitting as positive test edges. (If -v == true, the test positive edges - are divided equally to yield test and validation positive edges.) These edges are - then removed from the edges of the input graph, and these are emitted as the - training edges. + the edges in the input graph that are not part of a minimal spanning tree, such that + removing the edge does not create new components. These edges are emitted as + positive test edges. (If -v == true, the test positive edges are divided equally to + yield test and validation positive edges.) These edges are then removed from the + edges of the input graph, and these are emitted as the training edges. Negative edges are selected by randomly selecting pairs of nodes that are not connected by an edge in the input graph. The number of negative edges emitted is @@ -154,10 +151,9 @@ def edges(*args, **kwargs) -> None: :param output_dir: directory to output edges and new graph [data/edges/] :param train_fraction: fraction of edges to emit as training [0.8] :param validation: should we make validation edges? 
[False] - :param min_degree when choosing edges, what is the minimum degree of nodes - involved in the edge [1] + """ - make_edges(*args, **kwargs) + make_holdouts(*args, **kwargs) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 65cfe157..72335b27 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,8 @@ def find_version(*file_paths): 'click', 'pyyaml', 'bmt', - 'SPARQLWrapper' + 'SPARQLWrapper', + 'ensmallen_graph' ], extras_require=extras, ) diff --git a/tests/resources/edges/bigger_graph_degree_info.txt b/tests/resources/holdouts/bigger_graph_degree_info.txt similarity index 100% rename from tests/resources/edges/bigger_graph_degree_info.txt rename to tests/resources/holdouts/bigger_graph_degree_info.txt diff --git a/tests/resources/edges/bigger_graph_edges.tsv b/tests/resources/holdouts/bigger_graph_edges.tsv similarity index 100% rename from tests/resources/edges/bigger_graph_edges.tsv rename to tests/resources/holdouts/bigger_graph_edges.tsv diff --git a/tests/resources/edges/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv b/tests/resources/holdouts/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv similarity index 100% rename from tests/resources/edges/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv rename to tests/resources/holdouts/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv diff --git a/tests/resources/edges/bigger_graph_nodes.tsv b/tests/resources/holdouts/bigger_graph_nodes.tsv similarity index 100% rename from tests/resources/edges/bigger_graph_nodes.tsv rename to tests/resources/holdouts/bigger_graph_nodes.tsv diff --git a/tests/resources/edges/bigger_graph_nodes_EXTRA_IDS.tsv b/tests/resources/holdouts/bigger_graph_nodes_EXTRA_IDS.tsv similarity index 100% rename from tests/resources/edges/bigger_graph_nodes_EXTRA_IDS.tsv rename to tests/resources/holdouts/bigger_graph_nodes_EXTRA_IDS.tsv diff --git a/tests/resources/edges/bigger_graph_nodes_MISSING_IDS.tsv b/tests/resources/holdouts/bigger_graph_nodes_MISSING_IDS.tsv similarity index 100% rename from 
tests/resources/edges/bigger_graph_nodes_MISSING_IDS.tsv rename to tests/resources/holdouts/bigger_graph_nodes_MISSING_IDS.tsv diff --git a/tests/resources/edges/small_graph_edges.tsv b/tests/resources/holdouts/small_graph_edges.tsv similarity index 100% rename from tests/resources/edges/small_graph_edges.tsv rename to tests/resources/holdouts/small_graph_edges.tsv diff --git a/tests/resources/edges/small_graph_nodes.tsv b/tests/resources/holdouts/small_graph_nodes.tsv similarity index 100% rename from tests/resources/edges/small_graph_nodes.tsv rename to tests/resources/holdouts/small_graph_nodes.tsv diff --git a/tests/resources/edges/small_graph_nodes_EXTRA_IDS.tsv b/tests/resources/holdouts/small_graph_nodes_EXTRA_IDS.tsv similarity index 100% rename from tests/resources/edges/small_graph_nodes_EXTRA_IDS.tsv rename to tests/resources/holdouts/small_graph_nodes_EXTRA_IDS.tsv diff --git a/tests/resources/edges/small_graph_nodes_MISSING_IDS.tsv b/tests/resources/holdouts/small_graph_nodes_MISSING_IDS.tsv similarity index 100% rename from tests/resources/edges/small_graph_nodes_MISSING_IDS.tsv rename to tests/resources/holdouts/small_graph_nodes_MISSING_IDS.tsv diff --git a/tests/test_edges.py b/tests/test_holdouts.py similarity index 83% rename from tests/test_edges.py rename to tests/test_holdouts.py index e0c12ad2..ba24698d 100644 --- a/tests/test_edges.py +++ b/tests/test_holdouts.py @@ -7,15 +7,15 @@ from pandas import np from parameterized import parameterized -from kg_covid_19.edges import make_edges, tsv_to_df, has_disconnected_nodes, \ - make_negative_edges, make_positive_edges, df_to_tsv +from kg_covid_19.make_holdouts import make_holdouts, tsv_to_df, make_negative_edges, \ + make_positive_edges, df_to_tsv class TestEdges(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - cls.nodes_file = 'tests/resources/edges/bigger_graph_nodes.tsv' - cls.edges_file = 'tests/resources/edges/bigger_graph_edges.tsv' + cls.nodes_file = 
'tests/resources/holdouts/bigger_graph_nodes.tsv' + cls.edges_file = 'tests/resources/holdouts/bigger_graph_edges.tsv' cls.edges = tsv_to_df(cls.edges_file) cls.nodes = tsv_to_df(cls.nodes_file) @@ -25,8 +25,7 @@ def setUpClass(cls) -> None: # make positive edges for small graph cls.train_fraction = 0.8 (cls.train_edges, cls.test_edges) = make_positive_edges( - nodes_df=cls.nodes, edges_df=cls.edges, train_fraction= cls.train_fraction, - min_degree=0) + nodes_df=cls.nodes, edges_df=cls.edges, train_fraction= cls.train_fraction) def setUp(self) -> None: pass @@ -46,7 +45,7 @@ def test_df_to_tsv(self): self.assertEqual(df.shape, df_roundtrip.shape) def test_make_edges_exists(self): - self.assertTrue(isinstance(make_edges, object)) + self.assertTrue(isinstance(make_holdouts, object)) # # Test output files @@ -72,9 +71,9 @@ def test_make_edges_check_edge_output_files(self, output_file: str, output_file_with_path = os.path.join(me_output_dir, output_file) input_edges = tsv_to_df(self.edges_file) num_input_edges = input_edges.shape[0] - make_edges(nodes=self.nodes_file, edges=self.edges_file, - output_dir=me_output_dir, train_fraction=0.8, - validation=make_validation, min_degree=1) + make_holdouts(nodes=self.nodes_file, edges=self.edges_file, + output_dir=me_output_dir, train_fraction=0.8, + validation=make_validation) if file_should_exist: self.assertTrue(os.path.isfile(output_file_with_path)) new_edges_df = tsv_to_df(output_file_with_path) @@ -94,9 +93,9 @@ def test_make_edges_check_edge_output_files(self, output_file: str, def test_make_edges_pos_train_test_valid_edges_distinct(self, train, test, valid): output_dir = tempfile.mkdtemp() input_edges = tsv_to_df(self.edges_file) - make_edges(nodes=self.nodes_file, edges=self.edges_file, - output_dir=output_dir, train_fraction=0.8, - validation=True, min_degree=1) + make_holdouts(nodes=self.nodes_file, edges=self.edges_file, + output_dir=output_dir, train_fraction=0.8, + validation=True) input_edges = 
tsv_to_df(self.edges_file)[['subject', 'object']] train_edges = tsv_to_df(os.path.join(output_dir, train))[['subject', 'object']] test_edges = tsv_to_df(os.path.join(output_dir, test))[['subject', 'object']] @@ -120,9 +119,9 @@ def test_make_edges_check_node_output_file(self): output_dir = tempfile.mkdtemp() output_file_with_path = os.path.join(output_dir, 'pos_train_nodes.tsv') input_nodes = tsv_to_df(self.nodes_file) - make_edges(nodes=self.nodes_file, edges=self.edges_file, - output_dir=output_dir, train_fraction=0.8, - validation=False, min_degree=1) + make_holdouts(nodes=self.nodes_file, edges=self.edges_file, + output_dir=output_dir, train_fraction=0.8, + validation=False) self.assertTrue(os.path.isfile(output_file_with_path)) new_nodes_df = tsv_to_df(output_file_with_path) # make sure we get expected @@ -216,41 +215,9 @@ def test_make_positive_edges_test_pos_edges_are_removed_from_train_edges(self): % (overlap_test_train.shape[0], overlap_test_train.to_string())) - def test_make_positive_edges_test_min_degree_gt_zero(self): - train_fraction = 0.90 - degree = 2 - hd_edges_file =\ - 'tests/resources/edges/bigger_graph_edges_HIGHER_DEGREE_NODES.tsv' - hd_edges = tsv_to_df(hd_edges_file) - hd_nodes = ['p1', 'd1', - 'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10', - 'g11', 'g12', 'g13', 'g14', 'g15', 'g16', 'g17', 'g18', 'g19', - 'g20', 'g21', 'g22', 'g23', 'g24', 'g25'] - for _ in range(10): - (train_edges, test_edges) = make_positive_edges( - nodes_df=self.nodes, edges_df=hd_edges, train_fraction=train_fraction, - min_degree=degree) - these_nodes = set(list(test_edges.subject) + list(test_edges.object)) - self.assertTrue(set(these_nodes) < set(hd_nodes), - "Got some nodes with degree < 2: %s" % - " ".join(np.setdiff1d(these_nodes,hd_nodes)[0])) - # # negative edge tests # - def test_has_disconnected_nodes(self): - nodes_extra_ids = tsv_to_df( - 'tests/resources/edges/bigger_graph_nodes_EXTRA_IDS.tsv') - nodes_missing_ids = tsv_to_df( - 
'tests/resources/edges/bigger_graph_nodes_MISSING_IDS.tsv') - self.assertTrue(not has_disconnected_nodes(edges_df=self.edges, - nodes_df=self.nodes)) - with self.assertWarns(Warning): - self.assertTrue(not has_disconnected_nodes(edges_df=self.edges, - nodes_df=nodes_missing_ids)) - self.assertTrue(has_disconnected_nodes(edges_df=self.edges, - nodes_df=nodes_extra_ids)) - def test_make_negative_edges_check_instance_type(self): self.assertTrue(isinstance(self.ne, pd.DataFrame)) diff --git a/tests/test_run.py b/tests/test_run.py index 31ed66a6..c59f0749 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -2,7 +2,7 @@ from click.testing import CliRunner from unittest import mock -from run import download, transform, merge, edges, query +from run import download, transform, merge, holdouts, query class TestRun(TestCase):