From e076b3b923045076be6be272478d3d11e0ffae32 Mon Sep 17 00:00:00 2001 From: Terencio Agozzino Date: Tue, 23 Mar 2021 21:25:44 +0100 Subject: [PATCH] chore: update the doc --- docs/requirements.txt | 5 +++++ poetry.lock | 6 +++--- pyproject.toml | 2 +- pyrdf2vec/connectors.py | 5 ----- pyrdf2vec/embedders/word2vec.py | 7 ++----- pyrdf2vec/graphs/kg.py | 21 --------------------- pyrdf2vec/graphs/vertex.py | 13 ++----------- pyrdf2vec/rdf2vec.py | 20 ++++---------------- pyrdf2vec/samplers/frequency.py | 1 - pyrdf2vec/samplers/pagerank.py | 7 ------- pyrdf2vec/samplers/sampler.py | 12 ------------ pyrdf2vec/samplers/uniform.py | 5 ----- pyrdf2vec/typings.py | 6 +++--- pyrdf2vec/walkers/community.py | 5 ----- pyrdf2vec/walkers/halk.py | 1 - pyrdf2vec/walkers/ngram.py | 5 ----- pyrdf2vec/walkers/walker.py | 19 ++----------------- pyrdf2vec/walkers/weisfeiler_lehman.py | 6 ------ 18 files changed, 22 insertions(+), 124 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 932362a1..bacdf824 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,8 @@ +gensim +rdflib +scikit-learn +sphinx sphinx-autodoc-typehints sphinx-rtd-theme sphinxcontrib-apidoc +tomlkit diff --git a/poetry.lock b/poetry.lock index fad8419e..680b4ff7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -317,7 +317,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "importlib-metadata" -version = "3.7.3" +version = "3.4.0" description = "Read metadata from Python packages" category = "main" optional = false @@ -1513,8 +1513,8 @@ imagesize = [ {file = "imagesize-1.2.0.tar.gz", hash = "sha256:b1f6b5a4eab1f73479a50fb79fcf729514a900c341d8503d62a62dbc4127a2b1"}, ] importlib-metadata = [ - {file = "importlib_metadata-3.7.3-py3-none-any.whl", hash = "sha256:b74159469b464a99cb8cc3e21973e4d96e05d3024d337313fedb618a6e86e6f4"}, - {file = "importlib_metadata-3.7.3.tar.gz", hash = "sha256:742add720a20d0467df2f444ae41704000f50e1234f46174b51f9c6031a1bd71"}, + {file = "importlib_metadata-3.4.0-py3-none-any.whl", hash = "sha256:ace61d5fc652dc280e7b6b4ff732a9c2d40db2c0f92bc6cb74e07b73d53a1771"}, + {file = "importlib_metadata-3.4.0.tar.gz", hash = "sha256:fa5daa4477a7414ae34e95942e4dd07f62adf589143c875c133c1e53c4eff38d"}, ] incremental = [ {file = "incremental-21.3.0-py2.py3-none-any.whl", hash = "sha256:92014aebc6a20b78a8084cdd5645eeaa7f74b8933f70fa3ada2cfbd1e3b54321"}, diff --git a/pyproject.toml b/pyproject.toml index 544945fb..ab663dc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,7 +171,7 @@ deps = sphinxcontrib-apidoc tomlkit commands = - sphinx-build -n -T docs docs/_build/html + sphinx-build -n -T -W docs docs/_build/html python -m doctest README.rst [testenv:lint] diff --git a/pyrdf2vec/connectors.py b/pyrdf2vec/connectors.py index 6ffac1aa..8995dab5 100644 --- a/pyrdf2vec/connectors.py +++ b/pyrdf2vec/connectors.py @@ -24,14 +24,12 @@ class Connector(ABC): endpoint: str = attr.ib( validator=attr.validators.instance_of(str), ) - """The endpoint to execute the queries.""" cache: Cache = attr.ib( kw_only=True, factory=lambda: TTLCache(maxsize=1024, ttl=1200), validator=attr.validators.optional(attr.validators.instance_of(Cache)), ) - """The policy and size cache to use.""" _headers: Dict[str, str] = attr.ib( init=False, @@ -40,16 +38,13 @@ class Connector(ABC): "Accept": "application/sparql-results+json", }, ) - """The HTTP headers to use.""" _asession = attr.ib(init=False, default=None) - """The aiohttp session to use for asynchrone requests.""" _session = attr.ib( init=False, factory=lambda: requests.Session(), ) - """The requests session to use for synchrone requests.""" async def close(self) -> None: """Closes the aiohttp session.""" diff --git a/pyrdf2vec/embedders/word2vec.py b/pyrdf2vec/embedders/word2vec.py index cf8b7c95..a5d30a8b 100644 --- a/pyrdf2vec/embedders/word2vec.py +++ b/pyrdf2vec/embedders/word2vec.py @@ -17,12 +17,9 @@ class Word2Vec(Embedder): """ - kwargs = attr.ib(init=False, default=None) - """The keyword arguments dictionary. - Defaults to {size=500, min_count=0, negative=20}. - """ + # kwargs = attr.ib(init=False, default=None) - _model: W2V = attr.ib(init=False, default=None, repr=False) + # _model = attr.ib(init=False, type=W2V, default=None, repr=False) def __init__(self, **kwargs): self.kwargs = { diff --git a/pyrdf2vec/graphs/kg.py b/pyrdf2vec/graphs/kg.py index 3f1bb7d4..3bf28196 100644 --- a/pyrdf2vec/graphs/kg.py +++ b/pyrdf2vec/graphs/kg.py @@ -28,7 +28,6 @@ class KG: _check_location, ], ) - """The location of the file to load.""" skip_predicates: Set[str] = attr.ib( factory=set, @@ -36,7 +35,6 @@ class KG: member_validator=attr.validators.instance_of(str) ), ) - """The label predicates to skip from the KG.""" literals: List[List[str]] = attr.ib( # type: ignore factory=list, @@ -44,65 +42,46 @@ class KG: member_validator=attr.validators.instance_of(List) ), ) - """The predicate chains to get the literals.""" fmt: Optional[str] = attr.ib( kw_only=True, default=None, validator=attr.validators.optional(attr.validators.instance_of(str)), ) - """The format of the file. - It should be used only if the format can not be determined from source. - """ mul_req: bool = attr.ib( kw_only=True, default=False, validator=attr.validators.instance_of(bool), ) - """True to allow bundling of SPARQL queries, False otherwise. - This attribute accelerates the extraction of walks for remote Knowledge - Graphs. Beware that this may violate the policy of some SPARQL endpoint - server. - """ cache: Cache = attr.ib( kw_only=True, factory=lambda: TTLCache(maxsize=1024, ttl=1200), validator=attr.validators.optional(attr.validators.instance_of(Cache)), ) - """The policy and size cache to use. - Defaults to TTLCache(maxsize=1024, ttl=1200) - """ connector: SPARQLConnector = attr.ib(default=None, init=False, repr=False) - """The connector to use.""" _is_remote: bool = attr.ib( default=False, validator=attr.validators.instance_of(bool) ) - """True if the Knowledge Graph is in remote, False otherwise.""" _inv_transition_matrix: DefaultDict[Vertex, Set[Vertex]] = attr.ib( init=False, repr=False, factory=lambda: defaultdict(set) ) - """Contains the parents of vertices.""" _transition_matrix: DefaultDict[Vertex, Set[Vertex]] = attr.ib( init=False, repr=False, factory=lambda: defaultdict(set) ) - """Contains the children of vertices.""" _entity_hops: Dict[str, List[Hop]] = attr.ib( init=False, repr=False, factory=dict ) - """Caches the results of asynchronous requests.""" _entities: Set[Vertex] = attr.ib(init=False, repr=False, factory=set) - """Stores the entities.""" _vertices: Set[Vertex] = attr.ib(init=False, repr=False, factory=set) - """Stores the vertices.""" def __attrs_post_init__(self): if self.location is not None: diff --git a/pyrdf2vec/graphs/vertex.py b/pyrdf2vec/graphs/vertex.py index d2078507..df19b2d4 100644 --- a/pyrdf2vec/graphs/vertex.py +++ b/pyrdf2vec/graphs/vertex.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Any, Optional import attr @@ -10,20 +8,13 @@ class Vertex: """Represents a vertex in a Knowledge Graph.""" name: str = attr.ib(validator=attr.validators.instance_of(str)) - """The name of vertex.""" - predicate: bool = attr.ib( default=False, validator=attr.validators.instance_of(bool), repr=False, ) - """True if the vertex is a predicate. False, otherwise.""" - - vprev: Optional[Vertex] = attr.ib(default=None, repr=False) - """The previous vertex.""" - - vnext: Optional[Vertex] = attr.ib(default=None, repr=False) - """The next vertex.""" + vprev: Optional["Vertex"] = attr.ib(default=None, repr=False) + vnext: Optional["Vertex"] = attr.ib(default=None, repr=False) def __eq__(self, other: Any) -> bool: """Defines behavior for the equality operator, ==. diff --git a/pyrdf2vec/rdf2vec.py b/pyrdf2vec/rdf2vec.py index 05f30f2b..94a281e7 100644 --- a/pyrdf2vec/rdf2vec.py +++ b/pyrdf2vec/rdf2vec.py @@ -17,14 +17,15 @@ class RDF2VecTransformer: """Transforms nodes in a Knowledge Graph into an embedding.""" - embedder: Embedder = attr.ib( + embedder = attr.ib( factory=lambda: Word2Vec(), + type=Embedder, validator=attr.validators.instance_of(Embedder), # type: ignore ) - """The embedding technique.""" - walkers: Sequence[Walker] = attr.ib( + walkers = attr.ib( factory=lambda: [RandomWalker(2)], # type: ignore + type=Sequence[Walker], validator=attr.validators.deep_iterable( member_validator=attr.validators.instance_of( Walker # type: ignore @@ -32,28 +33,18 @@ class RDF2VecTransformer: iterable_validator=attr.validators.instance_of(list), ), ) - """The walking strategy.""" verbose: int = attr.ib( kw_only=True, default=0, validator=attr.validators.in_([0, 1, 2]) ) - """The verbosity level. - 0: does not display anything; - 1: display of the progress of extraction and training of walks; - 2: debugging. - """ _embeddings: Embeddings = attr.ib(init=False, factory=list) - """All the embeddings of the model.""" _entities: Entities = attr.ib(init=False, factory=list) - """All the entities of the model.""" _literals: Literals = attr.ib(init=False, factory=list) - """All the literals of the model.""" _walks: List[str] = attr.ib(init=False, factory=list) - """All the walks of the model.""" _is_extract_walks_literals = attr.ib( init=False, @@ -61,9 +52,6 @@ class RDF2VecTransformer: default=False, validator=attr.validators.instance_of(bool), ) - """True if the session must be closed after the call to the `transform` - function. False, otherwise. - """ def fit( self, walks: List[str], is_update: bool = False diff --git a/pyrdf2vec/samplers/frequency.py b/pyrdf2vec/samplers/frequency.py index 6667c8d1..2f820f30 100644 --- a/pyrdf2vec/samplers/frequency.py +++ b/pyrdf2vec/samplers/frequency.py @@ -21,7 +21,6 @@ class ObjFreqSampler(Sampler): _counts: DefaultDict[str, int] = attr.ib( init=False, repr=False, factory=lambda: defaultdict(dict) ) - """Counter for vertices.""" def fit(self, kg: KG) -> None: """Fits the sampling strategy by counting the number of available diff --git a/pyrdf2vec/samplers/pagerank.py b/pyrdf2vec/samplers/pagerank.py index 8bcb5ee0..af3d3866 100644 --- a/pyrdf2vec/samplers/pagerank.py +++ b/pyrdf2vec/samplers/pagerank.py @@ -16,11 +16,6 @@ class PageRankSampler(Sampler): nodes are more important than others and hence there will be resources which are more frequent in the walks as others. - Args: - - alpha: The damping for PageRank. - Defaults to 0.85. - """ alpha: float = attr.ib( @@ -28,12 +23,10 @@ class PageRankSampler(Sampler): default=0.85, validator=attr.validators.instance_of(float), ) - """The damping for Page Rank.""" _pageranks: Dict[str, float] = attr.ib( init=False, repr=False, factory=dict ) - """The Page Rank dictionary.""" def fit(self, kg: KG) -> None: """Fits the sampling strategy by running PageRank on a provided KG diff --git a/pyrdf2vec/samplers/sampler.py b/pyrdf2vec/samplers/sampler.py index 4d40bff7..a73ee997 100644 --- a/pyrdf2vec/samplers/sampler.py +++ b/pyrdf2vec/samplers/sampler.py @@ -25,38 +25,26 @@ class Sampler(ABC): inverse: bool = attr.ib( default=False, validator=attr.validators.instance_of(bool) ) - """True if the inverse algorithm must be used, False otherwise.""" split: bool = attr.ib( default=False, validator=attr.validators.instance_of(bool) ) - """True if the split algorithm must be used, False otherwise.""" _is_support_remote: bool = attr.ib(init=False, repr=False, default=False) - """True if the sampling strategy can be used with a remote Knowledge Graph, - False Otherwise. - """ _random_state: Optional[int] = attr.ib( init=False, repr=False, default=None, ) - """The random state to use to keep random determinism with the sampling - strategy. - """ _vertices_deg: Dict[str, int] = attr.ib( init=False, repr=False, factory=dict ) - """The degree of the vertices.""" _visited: Set[Tuple[Hop, int]] = attr.ib( init=False, repr=False, factory=set ) - """Tags vertices that appear at the max depth or of which all their - children are tagged. - """ @abstractmethod def fit(self, kg: KG) -> None: diff --git a/pyrdf2vec/samplers/uniform.py b/pyrdf2vec/samplers/uniform.py index 3e2b1738..b2e50575 100644 --- a/pyrdf2vec/samplers/uniform.py +++ b/pyrdf2vec/samplers/uniform.py @@ -18,17 +18,12 @@ class UniformSampler(Sampler): inverse: bool = attr.ib( init=False, default=False, validator=attr.validators.instance_of(bool) ) - """True if the inverse algorithm must be used, False otherwise.""" split: bool = attr.ib( init=False, default=False, validator=attr.validators.instance_of(bool) ) - """True if the split algorithm must be used, False otherwise.""" _is_support_remote: bool = attr.ib(init=False, repr=False, default=True) - """True if the sampling strategy can be used with a remote Knowledge Graph, - False Otherwise. - """ def fit(self, kg: KG) -> None: """Since the weights are uniform, this function does nothing. diff --git a/pyrdf2vec/typings.py b/pyrdf2vec/typings.py index 9c319ca6..e4eabd7e 100644 --- a/pyrdf2vec/typings.py +++ b/pyrdf2vec/typings.py @@ -1,12 +1,12 @@ -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union if TYPE_CHECKING: from pyrdf2vec.graphs import Vertex # noqa: F401 -Hop = Tuple["Vertex", "Vertex"] +Hop = Tuple[Any, Any] SWalk = Tuple[str, ...] -Walk = Tuple["Vertex", ...] +Walk = Tuple[Any, ...] Embeddings = List[str] diff --git a/pyrdf2vec/walkers/community.py b/pyrdf2vec/walkers/community.py index 7e2e8b0b..d1bccad4 100644 --- a/pyrdf2vec/walkers/community.py +++ b/pyrdf2vec/walkers/community.py @@ -40,17 +40,12 @@ class CommunityWalker(Walker): hop_prob: float = attr.ib( kw_only=True, default=0.1, validator=attr.validators.instance_of(float) ) - """The probability to hop.""" resolution: int = attr.ib( kw_only=True, default=1, validator=attr.validators.instance_of(int) ) - """The resolution to use.""" _is_support_remote: bool = attr.ib(init=False, repr=False, default=False) - """True if the walking strategy can be used with a remote Knowledge Graph, - False Otherwise. - """ def _community_detection(self, kg: KG) -> None: """Converts the knowledge graph to a networkX graph. diff --git a/pyrdf2vec/walkers/halk.py b/pyrdf2vec/walkers/halk.py index 4d582dfa..25264849 100644 --- a/pyrdf2vec/walkers/halk.py +++ b/pyrdf2vec/walkers/halk.py @@ -25,7 +25,6 @@ class HALKWalker(RandomWalker): iterable_validator=attr.validators.instance_of(list), ), ) - """The minimum frequency thresholds of a hop to be kept.""" def _extract(self, kg: KG, instance: Vertex) -> EntityWalks: """Extracts walks rooted at the provided entities which are then each diff --git a/pyrdf2vec/walkers/ngram.py b/pyrdf2vec/walkers/ngram.py index 8c1d48a6..737c04b4 100644 --- a/pyrdf2vec/walkers/ngram.py +++ b/pyrdf2vec/walkers/ngram.py @@ -21,21 +21,16 @@ class NGramWalker(RandomWalker): grams: int = attr.ib( kw_only=True, default=3, validator=attr.validators.instance_of(int) ) - """The N-gram to relabel.""" wildcards: list = attr.ib( kw_only=True, default=None, validator=attr.validators.optional(attr.validators.instance_of(list)), ) - """The wildcards to be used to match sub-sequences with small differences - to be mapped onto the same label. - """ _n_gram_map: Dict[Tuple, str] = attr.ib( init=False, repr=False, factory=dict ) - """Stores the mapping of N-gram.""" def _take_n_grams(self, walk: Walk) -> List[str]: """Takes the N-Grams. diff --git a/pyrdf2vec/walkers/walker.py b/pyrdf2vec/walkers/walker.py index 47f65f88..162118bb 100644 --- a/pyrdf2vec/walkers/walker.py +++ b/pyrdf2vec/walkers/walker.py @@ -31,12 +31,10 @@ class Walker(ABC): """Base class of the walking strategies.""" kg: Optional[KG] = None - """Global KG used later on for the worker process.""" max_depth: int = attr.ib( validator=[attr.validators.instance_of(int), _check_max_depth] ) - """The maximum depth of one walk.""" max_walks: Optional[int] = attr.ib( # type: ignore default=None, @@ -45,13 +43,12 @@ class Walker(ABC): _check_max_walks, ], ) - """The maximum number of walks per entity.""" - sampler: Sampler = attr.ib( + sampler = attr.ib( factory=lambda: UniformSampler(), + type=Sampler, validator=attr.validators.instance_of(Sampler), # type: ignore ) - """The sampling strategy.""" n_jobs: Optional[int] = attr.ib( # type: ignore default=None, @@ -60,32 +57,20 @@ class Walker(ABC): _check_jobs, ], ) - """The number of CPU cores used when parallelizing. - None means 1. -1 means using all processors. - """ with_reverse: Optional[bool] = attr.ib( kw_only=True, default=False, validator=attr.validators.instance_of(bool), ) - """True to extracts children's and parents' walks from the root, - creating (max_walks * max_walks) more walks of 2 * depth, False otherwise. - """ random_state: Optional[int] = attr.ib( kw_only=True, default=None, validator=attr.validators.optional(attr.validators.instance_of(int)), ) - """The random state to use to keep random determinism with the walking - strategy. - """ _is_support_remote: bool = attr.ib(init=False, repr=False, default=True) - """True if the walking strategy can be used with a remote Knowledge Graph, - False Otherwise. - """ def __attrs_post_init__(self): if self.n_jobs == -1: diff --git a/pyrdf2vec/walkers/weisfeiler_lehman.py b/pyrdf2vec/walkers/weisfeiler_lehman.py index 70180821..819439fc 100644 --- a/pyrdf2vec/walkers/weisfeiler_lehman.py +++ b/pyrdf2vec/walkers/weisfeiler_lehman.py @@ -16,22 +16,16 @@ class WLWalker(RandomWalker): wl_iterations: int = attr.ib( kw_only=True, default=4, validator=attr.validators.instance_of(int) ) - """The Weisfeiler Lehman's iteration.""" _is_support_remote: bool = attr.ib(init=False, repr=False, default=False) - """True if the walking strategy can be used with a remote Knowledge Graph, - False Otherwise. - """ _inv_label_map: DefaultDict[ Vertex, Dict[Union[str, int], Union[str, int]] ] = attr.ib(init=False, repr=False, factory=lambda: defaultdict(dict)) - """Stores the mapping of the inverse labels.""" _label_map: DefaultDict[Vertex, Dict[int, str]] = attr.ib( init=False, repr=False, factory=lambda: defaultdict(dict) ) - """Stores the mapping of the labels.""" def _create_label(self, kg: KG, vertex: Vertex, n: int) -> str: """Creates a label according to a vertex and its neighbors.