From ea8aea1beb9c7ef8277293166c3b07ad2fdb7a14 Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Sun, 6 Aug 2017 07:33:08 -0400 Subject: [PATCH] DOCS: Reogranization WIP --- bin/pgdb.py | 2 +- docs/source/acoustics.rst | 16 + docs/source/acoustics_backend.rst | 5 + docs/source/acoustics_encoding.rst | 6 + docs/source/acoustics_querying.rst | 6 + docs/source/api_graph.rst | 10 +- docs/source/enrichment.rst | 19 + docs/source/enrichment_csvs.rst | 76 ++++ docs/source/enrichment_queries.rst | 65 +++ docs/source/enrichment_syllables.rst | 108 +++++ docs/source/enrichment_utterances.rst | 94 +++++ .../{installation.rst => getting_started.rst} | 0 docs/source/{importing.rst => import.rst} | 0 docs/source/index.rst | 10 +- .../source/{graph_queries.rst => queries.rst} | 13 +- docs/source/queries_aggregates.rst | 65 --- docs/source/queries_annotations.rst | 370 ++++++++++++++++++ docs/source/queries_basic.rst | 138 ------- docs/source/queries_discourse.rst | 7 + docs/source/queries_lexicon.rst | 18 + docs/source/queries_ordering.rst | 29 -- docs/source/queries_reference.rst | 122 ++++++ docs/source/queries_speakers.rst | 8 + docs/source/queries_subannotations.rst | 53 --- docs/source/queries_subpaths.rst | 81 ---- docs/source/queries_subsets.rst | 52 --- polyglotdb/corpus/featured.py | 13 + polyglotdb/corpus/lexical.py | 14 + polyglotdb/corpus/spoken.py | 23 ++ polyglotdb/corpus/syllabic.py | 35 +- tests/test_client.py | 5 +- tests/test_enrich.py | 2 +- tests/test_summarized.py | 6 +- 33 files changed, 1016 insertions(+), 455 deletions(-) create mode 100644 docs/source/acoustics.rst create mode 100644 docs/source/acoustics_backend.rst create mode 100644 docs/source/acoustics_encoding.rst create mode 100644 docs/source/acoustics_querying.rst create mode 100644 docs/source/enrichment.rst create mode 100644 docs/source/enrichment_csvs.rst create mode 100644 docs/source/enrichment_queries.rst create mode 100644 docs/source/enrichment_syllables.rst create mode 100644 docs/source/enrichment_utterances.rst rename docs/source/{installation.rst => getting_started.rst} (100%) rename docs/source/{importing.rst => import.rst} (100%) rename docs/source/{graph_queries.rst => queries.rst} (60%) delete mode 100644 docs/source/queries_aggregates.rst create mode 100644 docs/source/queries_annotations.rst delete mode 100644 docs/source/queries_basic.rst create mode 100644 docs/source/queries_discourse.rst create mode 100644 docs/source/queries_lexicon.rst delete mode 100644 docs/source/queries_ordering.rst create mode 100644 docs/source/queries_reference.rst create mode 100644 docs/source/queries_speakers.rst delete mode 100644 docs/source/queries_subannotations.rst delete mode 100644 docs/source/queries_subpaths.rst delete mode 100644 docs/source/queries_subsets.rst diff --git a/bin/pgdb.py b/bin/pgdb.py index ea0342da..25857093 100644 --- a/bin/pgdb.py +++ b/bin/pgdb.py @@ -50,7 +50,7 @@ def save_config(c): TEMP_DIR = os.path.join(CONFIG_DIR, 'downloads') -NEO4J_VERSION = '3.0.7' +NEO4J_VERSION = '3.2.3' INFLUXDB_VERSION = '1.1.0' diff --git a/docs/source/acoustics.rst b/docs/source/acoustics.rst new file mode 100644 index 00000000..3afb3a6e --- /dev/null +++ b/docs/source/acoustics.rst @@ -0,0 +1,16 @@ +.. _acoustics: + +***************** +Acoustic measures +***************** + +TODO blurb + +Contents: + +.. 
toctree:: + :maxdepth: 2 + + acoustics_encoding.rst + acoustics_querying.rst + acoustics_backend.rst diff --git a/docs/source/acoustics_backend.rst b/docs/source/acoustics_backend.rst new file mode 100644 index 00000000..c18d89f8 --- /dev/null +++ b/docs/source/acoustics_backend.rst @@ -0,0 +1,5 @@ +.. _acoustics_backend: + +**************** +Acoustic backend +**************** diff --git a/docs/source/acoustics_encoding.rst b/docs/source/acoustics_encoding.rst new file mode 100644 index 00000000..4c4c68ec --- /dev/null +++ b/docs/source/acoustics_encoding.rst @@ -0,0 +1,6 @@ +.. _acoustics_encoding: + +************************** +Encoding acoustic measures +************************** + diff --git a/docs/source/acoustics_querying.rst b/docs/source/acoustics_querying.rst new file mode 100644 index 00000000..af8abcfc --- /dev/null +++ b/docs/source/acoustics_querying.rst @@ -0,0 +1,6 @@ +.. _acoustics_querying: + +************************** +Querying acoustic measures +************************** + diff --git a/docs/source/api_graph.rst b/docs/source/api_graph.rst index f103cd01..78b8521f 100644 --- a/docs/source/api_graph.rst +++ b/docs/source/api_graph.rst @@ -8,7 +8,7 @@ Graph API Queries ------- -.. currentmodule:: polyglotdb.graph.query +.. currentmodule:: polyglotdb.query.annotations.query .. autosummary:: :toctree: generated/ @@ -20,20 +20,20 @@ Queries Attributes ---------- -.. currentmodule:: polyglotdb.graph.attributes +.. currentmodule:: polyglotdb.query.annotations.attributes.base .. autosummary:: :toctree: generated/ :template: class.rst - Attribute + AnnotationNode AnnotationAttribute .. _graph_clauses_api: Clause elements --------------- -.. currentmodule:: polyglotdb.graph.elements +.. currentmodule:: polyglotdb.query.annotations.elements .. autosummary:: :toctree: generated/ @@ -54,7 +54,7 @@ Clause elements Aggregate functions ------------------- -.. currentmodule:: polyglotdb.graph.func +.. currentmodule:: polyglotdb.query.base.func .. autosummary:: :toctree: generated/ diff --git a/docs/source/enrichment.rst b/docs/source/enrichment.rst new file mode 100644 index 00000000..cc6928b8 --- /dev/null +++ b/docs/source/enrichment.rst @@ -0,0 +1,19 @@ +.. _enrichment: + +********** +Enrichment +********** + +Following import, the corpus is often fairly bare, with just word and phone annotations. An important step in analyzing +corpora is therefore enriching it with other information. Most of the methods here are automatic once a function is called. + + +Contents: + +.. toctree:: + :maxdepth: 2 + + enrichment_syllables.rst + enrichment_utterances.rst + enrichment_csvs.rst + enrichment_queries.rst diff --git a/docs/source/enrichment_csvs.rst b/docs/source/enrichment_csvs.rst new file mode 100644 index 00000000..95f590ea --- /dev/null +++ b/docs/source/enrichment_csvs.rst @@ -0,0 +1,76 @@ +.. _enrichment_csvs: + +************************ +Enrichment via CSV files +************************ + +PolyglotDB supports ways of adding arbitrary information to annotations or metadata about speakers and files by specifying +a local CSV file to add information from. When constructing this CSV file, the first column should be the label used to +identify which element should be enriched, and all subsequent columns are used as properties to add to the corpus. 
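+Once added, these properties can be referred to in queries like any other property. As a minimal sketch (assuming the
+lexicon has been enriched from a CSV containing a `frequency` column, as described below; the column name and the
+threshold are purely illustrative):
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        # 'frequency' is assumed to have been added via enrich_lexicon_from_csv (see below)
+        q = c.query_graph(c.word)
+        q = q.filter(c.word.frequency > 1000)
+        q = q.columns(c.word.label, c.word.frequency.column_name('frequency'))
+        print(q.all())
+
+The expected layout of the CSV file itself is as follows: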
+
+::
+
+    ID_column,property_one,property_two
+    first_item,first_item_value_one,first_item_value_two
+    second_item,,second_item_value_two
+
+Enriching using this file would look up elements based on the `ID_column`, and the one matching `first_item` would get
+both `property_one` and `property_two` (with the respective values). The one matching `second_item` would only get
+`property_two` (because the value for `property_one` is empty).
+
+.. _enrich_lexicon:
+
+Enriching the lexicon
+=====================
+
+.. code-block:: python
+
+    lexicon_csv_path = '/full/path/to/lexicon/data.csv'
+    with CorpusContext(config) as c:
+        c.enrich_lexicon_from_csv(lexicon_csv_path)
+
+
+.. note::
+
+    The function `enrich_lexicon_from_csv` accepts an optional keyword `case_sensitive`, which defaults to `False`.
+    Setting it to `True` makes word lookup respect capitalization.
+
+
+.. _enrich_inventory:
+
+Enriching the phonological inventory
+====================================
+
+The phone inventory can be enriched with arbitrary properties via:
+
+.. code-block:: python
+
+    inventory_csv_path = '/full/path/to/inventory/data.csv'
+    with CorpusContext(config) as c:
+        c.enrich_inventory_from_csv(inventory_csv_path)
+
+.. _enrich_speakers:
+
+Enriching speaker information
+=============================
+
+Speaker information can be added via:
+
+.. code-block:: python
+
+    speaker_csv_path = '/full/path/to/speaker/data.csv'
+    with CorpusContext(config) as c:
+        c.enrich_speakers_from_csv(speaker_csv_path)
+
+.. _enrich_discourses:
+
+Enriching discourse information
+===============================
+
+Metadata about the discourses or sound files can be added via:
+
+.. code-block:: python
+
+    discourse_csv_path = '/full/path/to/discourse/data.csv'
+    with CorpusContext(config) as c:
+        c.enrich_discourses_from_csv(discourse_csv_path)
diff --git a/docs/source/enrichment_queries.rst b/docs/source/enrichment_queries.rst
new file mode 100644
index 00000000..496d8a2a
--- /dev/null
+++ b/docs/source/enrichment_queries.rst
@@ -0,0 +1,65 @@
+.. _enrichment_queries:
+
+**********************
+Enrichment via queries
+**********************
+
+Queries can also be used to set properties and create subsets of elements based on their results.
+
+For instance, to make word-initial phones more easily queryable, you could perform the following:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.phone)
+        q = q.filter(c.phone.begin == c.phone.word.begin)
+        q.create_subset('word-initial')
+
+Once that code completes, a subsequent query can refer to the new subset:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.phone)
+        q = q.filter(c.phone.subset == 'word-initial')
+        print(q.all())
+
+Or instead of a subset, a property could be encoded as:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.phone)
+        q = q.filter(c.phone.begin == c.phone.word.begin)
+        q.set_properties(position='word-initial')
+
+This property can then be exported as a column in a CSV file:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.phone)
+        q = q.columns(c.phone.position)
+        q.to_csv(some_csv_path)
+
+
+Lexicon queries can also be used in the same way to create subsets and encode properties that do not vary on a token-by-token basis.
+
+For instance, a subset for high vowels can be created as follows:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        high_vowels = ['iy', 'ih', 'uw', 'uh']
+        q = c.query_lexicon(c.lexicon_phone)
+        q = q.filter(c.lexicon_phone.label.in_(high_vowels))
+        q.create_subset('high_vowel')
+
+This subset can then be used to query phone annotations:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.phone)
+        q = q.filter(c.phone.subset == 'high_vowel')
+        print(q.all())
diff --git a/docs/source/enrichment_syllables.rst b/docs/source/enrichment_syllables.rst
new file mode 100644
index 00000000..069ab2d2
--- /dev/null
+++ b/docs/source/enrichment_syllables.rst
@@ -0,0 +1,108 @@
+.. _enrichment_syllables:
+
+***********************
+Creating syllable units
+***********************
+
+Syllables are groupings of phones into larger units within words. PolyglotDB enforces a strict hierarchy, with the boundaries
+of words aligning with syllable boundaries (i.e., syllables cannot stretch across words).
+
+At the moment, only one algorithm is supported (`maximal onset`), because its simplicity makes it language-agnostic.
+
+To encode syllables, there are two steps:
+
+1. :ref:`encoding_syllabics`
+2. :ref:`encoding_syllables`
+
+
+.. _encoding_syllabics:
+
+Encoding syllabic segments
+==========================
+
+Syllabic segments are encoded via a specialized function:
+
+
+.. code-block:: python
+
+    syllabic_segments = ['aa', 'ae', 'ih']
+    with CorpusContext(config) as c:
+        c.encode_syllabic_segments(syllabic_segments)
+
+
+Following this code, all phones with labels of `aa, ae, ih` will belong to the subset `syllabic`. This subset can then
+be queried in the future, in addition to allowing syllables to be encoded.
+
+.. _encoding_syllables:
+
+Encoding syllables
+==================
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        c.encode_syllables()
+
+.. note::
+
+    The function `encode_syllables` can be given a keyword argument for `call_back`, which is a function like `print` that
+    allows for progress to be output to the console.
+
+Following encoding, syllables are available to be queried and used like any other linguistic unit. For example, to get a
+list of all the instances of syllables at the beginnings of words:
+
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.syllable).filter(c.syllable.begin == c.syllable.word.begin)
+        print(q.all())
+
+.. _stress_tone:
+
+Encoding syllable properties from syllabics
+===========================================
+
+Corpora often encode information about syllables on the vowels. For instance, if the transcription contains
+stress levels, they will be specified as numbers 0-2 on the vowels (i.e., as in Arpabet). Tone is similarly encoded
+in some transcription systems. This section details functions that strip this information from the vowel and place it on
+the syllable unit instead.
+
+.. note::
+
+    Removing the stress/tone information from the vowel makes queries easier, as getting all `AA` tokens no longer requires
+    specifying that the label is in the set of `AA1, AA2, AA0`. This functionality can be disabled by specifying `clean_phone_label=False`
+    in the two functions that follow.
+
+.. _stress_enrichment:
+
+Encoding stress
+---------------
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+
+        c.encode_stress_to_syllables()
+
+.. note::
+
+    By default, stress is taken to be numbers in the vowel label (i.e., `AA1` would have a stress of `1`).
A different + pattern to use for stress information can be specified through the optional `regex` keyword argument. + + +.. _tone_enrichment: + +Encoding tone +------------- + +.. code-block:: python + + with CorpusContext(config) as c: + + c.encode_tone_to_syllables() + +.. note:: + + As for stress, a different regex can be specified with the `regex` keyword argument. \ No newline at end of file diff --git a/docs/source/enrichment_utterances.rst b/docs/source/enrichment_utterances.rst new file mode 100644 index 00000000..05f8bba1 --- /dev/null +++ b/docs/source/enrichment_utterances.rst @@ -0,0 +1,94 @@ +.. _enrichment_utterances: + +************************ +Creating utterance units +************************ + +Utterances are groups of words that are continuous in some sense. The can be thought of as similar to interpausal units or chunks +in other work. The basic idea is that there are intervals in which there are no speech, a subset of which count as breaks in speech +depending on the length of these non-speech intervals. + +To encode utterances, there are two steps: + +1. :ref:`encoding_pauses` +2. :ref:`encoding_utterances` + + +.. _encoding_pauses: + +Encoding non-speech elements +============================ + +Non-speech elements in PolyglotDB are termed `pause`. Pauses are encoded as follows: + +.. code-block:: python + + nonspeech_words = ['',''] + with CorpusContext(config) as c: + c.encode_pauses(nonspeech_words) + +The function `encode_pauses` takes a list of word labels that should not be considered speech in a discourse and marks them as such. + +.. note:: + + Non-speech words can also be encoded through regular expressions, as in: + + .. code-block:: python + + nonspeech_words = '^[<[{].*' + with CorpusContext(config) as c: + c.encode_pauses(nonspeech_words) + + Where the pattern to be matched is any label that starts with `<` or `[`. + +Once pauses are encoded, aspects of pauses can be queried, as follows: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.pause).filter(c.pause.discourse.name == 'one_discourse') + print(q.all()) + +Additionally, word annotations can have previous and following pauses that can be found: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word).columns(c.word.label, + c.word.following_pause_duration.column_name('pause_duration')) + print(q.all()) + + + +.. note:: + + Once pauses are encoded, accessing an annotation's previous or following word via `c.word.previous` will skip over + any pauses. So for a string like `I go...`, the previous word to the word `go` would be `I` rather than ``. + +.. _encoding_utterances: + +Encoding utterances +=================== + +Once pauses are encoded, utterances can be encoded by specifying the minimum length of non-speech elements that count as +a break between stretches of speech. + +.. code-block:: python + + with CorpusContext(config) as c: + c.encode_utterances(min_pause_length=0.15) + +.. note:: + + The function `encode_utterances` can be given a keyword argument for `call_back`, which is a function like `print` that + allows for progress to be output to the console. + +Following encoding, utterances are available to queried and used as any other linguistic unit. For example, to get a list of +all the instances of words at the beginnings of utterances: + + +.. 
code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word).filter(c.word.begin == c.word.utterance.begin) + print(q.all()) \ No newline at end of file diff --git a/docs/source/installation.rst b/docs/source/getting_started.rst similarity index 100% rename from docs/source/installation.rst rename to docs/source/getting_started.rst diff --git a/docs/source/importing.rst b/docs/source/import.rst similarity index 100% rename from docs/source/importing.rst rename to docs/source/import.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index b284c1be..ae652d6e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,10 +8,14 @@ Contents: :maxdepth: 2 introduction.rst - installation.rst - importing.rst - graph_queries.rst + getting_started.rst + client.rst + import.rst + enrichment.rst + acoustics.rst + queries.rst io.rst + concepts.rst apireference.rst diff --git a/docs/source/graph_queries.rst b/docs/source/queries.rst similarity index 60% rename from docs/source/graph_queries.rst rename to docs/source/queries.rst index adda427b..76aef7c9 100644 --- a/docs/source/graph_queries.rst +++ b/docs/source/queries.rst @@ -1,4 +1,4 @@ -.. _graph_queries: +.. _queries: **************** Querying corpora @@ -13,9 +13,8 @@ Contents: .. toctree:: :maxdepth: 2 - queries_basic.rst - queries_aggregates.rst - queries_ordering.rst - queries_subsets.rst - queries_subannotations.rst - queries_subpaths.rst + queries_annotations.rst + queries_lexicon.rst + queries_speakers.rst + queries_discourse.rst + queries_reference.rst diff --git a/docs/source/queries_aggregates.rst b/docs/source/queries_aggregates.rst deleted file mode 100644 index d737f638..00000000 --- a/docs/source/queries_aggregates.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. _aggregates_and_groups: - -********************* -Aggregates and groups -********************* - -Aggregate functions are available in :code:`polyglotdb.graph.func`. Aggregate -functions available are: - -* Average -* Count -* Max -* Min -* Stdev -* Sum - -In general, these functions take a numeric attribute as an argument. The -only one that does not follow this pattern is :code:`Count`. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label == 'aa') - q = q.filter(c.phone.following.label == 'r') - - result = q.aggregate(Count()) - print(result) - - -Like the :code:`all` function, :code:`aggregate` triggers evaluation of the query. -Instead of returning rows, it will return a single number, which is the -number of rows matching this query. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label == 'aa') - q = q.filter(c.phone.following.label == 'r') - - result = q.aggregate(Average(c.phone.duration)) - print(result) - - -The above aggregate function will return the average duration for all 'aa' -phones followed by 'r' phones. - -Aggregates are particularly useful with grouping. For instance: - -.. 
code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label == 'aa') - q = q.filter(c.phone.following.label.in_(['r','l'])) - q = q.group_by(c.phone.following.label.column_name('following_label')) - - result = q.aggregate(Average(c.phone.duration), Count()) - print(result) - - -The above query will return the average duration and the count of 'aa' -phones grouped by whether they're followed by an 'r' or an 'l'. - -.. note:: In the above example, the :code:`group_by` attribute is supplied with - an alias for output. In the print statment and in the results, the column - will be called 'following_label' instead of the default (more opaque) one. diff --git a/docs/source/queries_annotations.rst b/docs/source/queries_annotations.rst new file mode 100644 index 00000000..85b92fbd --- /dev/null +++ b/docs/source/queries_annotations.rst @@ -0,0 +1,370 @@ + + +.. _annotation_queries: + +******************** +Querying annotations +******************** + +The main way of finding specific annotations is through the :code:`query_graph` method of +:code:`CorpusContext` objects. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word).filter(c.word.label == 'are') + results = q.all() + print(results) + +The above code will find and print all instances of :code:`word` annotations that are +labeled with 'are'. The method :code:`query_graph` takes one argument, which is +an attribute of the context manager corresponding to the name of the +annotation type. + +The primary function for queries is :code:`filter`. This function takes one or more +conditional expressions on attributes of annotations. In the above example, +:code:`word` annotations have an attribute :code:`label` which corresponds to the +orthography. + +Conditional expressions can take on any normal Python conditional (:code:`==`, +:code:`!=`, :code:`<`, :code:`<=`, :code:`>`, :code:`>=`). The Python +operator :code:`in` does not work; a special pattern has to be used: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) + results = q.all() + print(results) + +The :code:`in_` conditional function can take any iterable, including another query: + +.. code-block:: python + + with CorpusContext(config) as c: + sub_q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) + q = c.query_graph(c.phone).filter(c.phone.word.id.in_(sub_q)) + results = q.all() + print(results) + +In this case, it will find all :code:`phone` annotations that are in the words +listed. Using the :code:`id` attribute will use unique identifiers for the filter. +In this particular instance, it does not matter, but it does in the following: + +.. code-block:: python + + with CorpusContext(config) as c: + sub_q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) + sub_q = sub_q.filter_right_aligned(c.word.line) + q = c.query_graph(c.phone).filter(c.phone.word.id.in_(sub_q)) + results = q.all() + print(results) + + +The above query will find all instances of the three words, but only where +they are right-aligned with a :code:`line` annotation. + +.. note:: Queries are lazy evaluated. In the above example, :code:`sub_q` is + not evaluated until :code:`q.all()` is called. This means that filters + can be chained across multiple lines without a performance hit. + +.. 
_following_previous: + +Following and previous annotations +---------------------------------- + +Filters can reference the surrounding local context. For instance: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.phone.following.label == 'r') + results = q.all() + print(results) + + +The above query will find all the 'aa' phones that are followed by an 'r' +phone. Similarly, :code:`c.phone.previous` would provide access to filtering on +preceding phones. + +.. _subsetting: + +Subsetting annotations +---------------------- + +In linguistics, it's often useful to specify subsets of symbols as particular classes. +For instance, phonemes are grouped together by whether they are syllabic, +their manner/place of articulation, and vowel height/backness/rounding, and +words are grouped by their parts of speech. + + +Suppose a subset has been created as in :ref:`caching_subsets`, so that the phones 'aa' and 'ih' have been marked as `+syllabic`. +Once this category is encoded in the database, it can be used in filters. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone) + q = q.filter(c.phone.subset=='+syllabic') + results = q.all() + print(results) + +Another way to specify subsets is on the phone annotations themselves, as follows: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone.filter_by_subset('+syllabic')) + results = q.all() + print(results) + +Both of these queries are identical and will return all instances of 'aa' and 'ih' phones. The benefit of `filter_by_subset` +is generally for use in :ref:`hierarchical_queries`. + +.. note:: Using repeated subsets repeatedly in queries can make them overly + verbose. The objects that the queries use are normal Python objects + and can therefore be assigned to variables for easier use. + + .. code-block:: python + + with CorpusContext(config) as c: + syl = c.phone.filter_by_subset('+syllabic') + q = c.query_graph(syl) + q = q.filter(syl.end == syl.word.end) + results = q.all() + print(results) + + The above query would find all phones marked by '+syllabic' that are + at the ends of words. + + +.. _hierarchical_queries: + +Hierarchical queries +-------------------- + +A key facet of language is that it is hierarchical. Words contain phones, +and can be contained in larger utterances. There are several ways to +query hierarchical information. If we want to find all "aa" phones in the +word "dogs", then we can perform the following query: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.phone.word.label == 'dogs') + results = q.all() + print(results) + +Starting from the word level, we might want to know what phones each word +contains. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word) + q = q.columns(c.word.phone.label.column('phones')) + results = q.all() + print(results) + +In the output of the above query, there would be a column labeled "phones" +that contains a list of the labels of phones that belong to the word +(``['d', 'aa', 'g', 'z']``). Any property of phones can be queried this +way (i.e., 'begin', 'end', 'duration', etc). + +Going down the hierarchy, we can also find all words that contain a certain phone. + +.. 
code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) + q = q.filter(c.word.phone.label == 'aa') + results = q.all() + print(results) + + +In this example, it will find all instances of the three words that contain +an 'aa' phone. + +Special keywords exist for these containment columns. The keyword 'rate' +will return the elements per second for the word (i.e., phones per second). +The keyword 'count' will return the number of elements. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word) + q = q.columns(c.word.phone.rate.column('phones_per_second')) + q = q.columns(c.word.phone.count.column('num_phones')) + results = q.all() + print(results) + +These keywords can also leverage subsets, as above: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.word) + q = q.columns(c.word.phone.rate.column('phones_per_second')) + q = q.columns(c.word.phone.filter_by_subset('+syllabic').count.column('num_syllabic_phones')) + q = q.columns(c.word.phone.count.column('num_phones')) + results = q.all() + print(results) + +Additionally, there is a special keyword can be used to query the position +of a contained element in a containing one. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.word.label == 'dogs') + q = q.columns(c.word.phone.position.column_name('position_in_word')) + results = q.all() + print(results) + +The above query should return ``2`` for the value of 'position_in_word', +as the "aa" phone would be the second phone. + + +.. _subannotations: + +Subannotations +-------------- + +Annotations can have subannotations associated with them. Subannotations +are not independent linguistic types, but have more information associated +with them than just a single property. For instance, voice onset time (VOT) +would be a subannotation of stops (as it has a begin time and an end time +that are of interest). Querying such subannotations would be performed as follows: + + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone) + q = q.columns(c.phone.vot.duration.column_name('vot')) + results = q.all() + print(results) + +In some cases, it may be desirable to have more than one subannotation of +the same type associated with a single annotation. For instance, +voicing during the closure of a stop can take place at both the beginning +and end of closure, with an unvoiced period in the middle. Using a similar +query as above would get the durations of each of these (in the order of +their begin time): + + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone) + q = q.columns(c.phone.voicing_during_closure.duration.column_name('voicing')) + results = q.all() + print(results) + +In some cases, we might like to know the total duration of such subannotations, +rather than the individual durations. To query that information, we can +use an ``aggregate``: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone) + results = q.aggregate(Sum(c.phone.voicing_during_closure.duration).column_name('total_voicing')) + print(results) + + +Miscellaneous +============= + +.. _aggregates_and_groups: + +Aggregates and groups +--------------------- + +Aggregate functions are available in :code:`polyglotdb.query.func`. 
Aggregate +functions available are: + +* Average +* Count +* Max +* Min +* Stdev +* Sum + +In general, these functions take a numeric attribute as an argument. The +only one that does not follow this pattern is :code:`Count`. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.phone.following.label == 'r') + result = q.aggregate(Count()) + print(result) + + +Like the :code:`all` function, :code:`aggregate` triggers evaluation of the query. +Instead of returning rows, it will return a single number, which is the +number of rows matching this query. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.phone.following.label == 'r') + result = q.aggregate(Average(c.phone.duration)) + print(result) + + +The above aggregate function will return the average duration for all 'aa' +phones followed by 'r' phones. + +Aggregates are particularly useful with grouping. For instance: + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.phone.following.label.in_(['r','l'])) + q = q.group_by(c.phone.following.label.column_name('following_label')) + result = q.aggregate(Average(c.phone.duration), Count()) + print(result) + + +The above query will return the average duration and the count of 'aa' +phones grouped by whether they're followed by an 'r' or an 'l'. + +.. note:: In the above example, the :code:`group_by` attribute is supplied with + an alias for output. In the print statment and in the results, the column + will be called 'following_label' instead of the default (more opaque) one. + +.. _ordering: + +Ordering +-------- + +The :code:`order_by` function is used to provide an ordering to the results of +a query. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_graph(c.phone).filter(c.phone.label == 'aa') + q = q.filter(c.phone.following.label.in_(['r','l'])) + q = q.filter(c.phone.discourse == 'a_discourse') + q = q.order_by(c.phone.begin) + results = q.all() + print(results) + + +The results for the above query will be ordered by the timepoint of the +annotation. Ordering by time is most useful for when looking at single +discourses (as including multiple discourses in a query would invalidate the +ordering). + +.. note:: In grouped aggregate queries, ordering is by default by the + first :code:`group_by` attribute. This can be changed by calling :code:`order_by` + before evaluating with :code:`aggregate`. \ No newline at end of file diff --git a/docs/source/queries_basic.rst b/docs/source/queries_basic.rst deleted file mode 100644 index 729446c7..00000000 --- a/docs/source/queries_basic.rst +++ /dev/null @@ -1,138 +0,0 @@ - - -.. _basic_queries: - -************************ -Basic structural queries -************************ - -The main way of accessing discourses is through the :code:`query_graph` method of -:code:`CorpusContext` objects. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.word).filter(c.word.label == 'are') - results = q.all() - print(results) - -The above code will find and print all instances of :code:`word` annotations that are -labeled with 'are'. The method :code:`query_graph` takes one argument, which is -an attribute of the context manager corresponding to the name of the -annotation type. 
- -The primary function for queries is :code:`filter`. This function takes one or more -conditional expressions on attributes of annotations. In the above example, -:code:`word` annotations have an attribute :code:`label` which corresponds to the -orthography. - -Conditional expressions can take on any normal Python conditional (:code:`==`, -:code:`!=`, :code:`<`, :code:`<=`, :code:`>`, :code:`>=`). The Python -operator :code:`in` does not work; a special pattern has to be used: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) - - results = q.all() - print(results) - -The :code:`in_` conditional function can take any iterable, including another query: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - sub_q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) - q = c.query_graph(c.phone).filter(c.word.id.in_(sub_q)) - - results = q.all() - print(results) - -In this case, it will find all :code:`phone` annotations that are in the words -listed. Using the :code:`id` attribute will use unique identifiers for the filter. -In this particular instance, it does not matter, but it does in the following: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - sub_q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) - sub_q = sub_q.filter_right_aligned(c.line) - q = c.query_graph(c.phone).filter(c.word.id.in_(sub_q)) - results = q.all() - print(results) - - -The above query will find all instances of the three words, but only where -they are right-aligned with a :code:`line` annotation. - -.. note:: Queries are lazy evaluated. In the above example, :code:`sub_q` is - not evaluated until :code:`q.all()` is called. This means that filters - can be chained across multiple lines without a performance hit. - -Specialized filters -------------------- - -In addition to :code:`filter`, there are several specialized filter functions -that refer to other types of annotation. The :code:`filter_right_aligned` was -shown above. The full list is: - -* filter_left_aligned -* filter_right_aligned -* filter_contains -* filter_contained_by - -The alignment filters check whether right edges or the left edges of both -annotation types are aligned. The containment filters refer explicitly to -hierarchical structure. The :code:`filter_contains` checks whether the higher -annotation contains a lower annotation that matches the criteria: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.word).filter(c.word.label.in_(['are', 'is','am'])) - q = q.filter_contains(c.phone.label == 'aa') - - results = q.all() - print(results) - - -In this example, it will find all instances of the three words that contain -an 'aa' phone. - -The :code:`filter_contained_by` function does the opposite, checking whether -the annotation is contained by an annotation that matches a condition: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label == 'aa') - q = q.filter_contains(c.word.label.in_(['are', 'is','am'])) - - results = q.all() - print(results) - -The above example finds a similar set of labels as the one above that, -but the returned annotation types are different. - - -.. 
_following_previous: - -Following and previous annotations ----------------------------------- - -Filters can reference the surrounding local context. For instance: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label == 'aa') - q = q.filter(c.phone.following.label == 'r') - - results = q.all() - print(results) - - -The above query will find all the 'aa' phones that are followed by an 'r' -phone. Similarly, :code:`c.phone.previous` would provide access to filtering on -preceding phones. diff --git a/docs/source/queries_discourse.rst b/docs/source/queries_discourse.rst new file mode 100644 index 00000000..b8b28c9a --- /dev/null +++ b/docs/source/queries_discourse.rst @@ -0,0 +1,7 @@ + + +.. _discourse_queries: + +***************** +Discourse queries +***************** diff --git a/docs/source/queries_lexicon.rst b/docs/source/queries_lexicon.rst new file mode 100644 index 00000000..0cd1f673 --- /dev/null +++ b/docs/source/queries_lexicon.rst @@ -0,0 +1,18 @@ + + +.. _lexicon_queries: + +*************** +Lexicon queries +*************** + +Querying the lexicon is in many ways similar to querying annotations in graphs. + +.. code-block:: python + + with CorpusContext(config) as c: + q = c.query_lexicon(c.lexicon_phone).filter(c.lexicon_phone.label == 'aa') + print(q.all()) + +The above query will just return one result (as there is only one phone type with a given label) as opposed to the multiple +results returned when querying annotations. \ No newline at end of file diff --git a/docs/source/queries_ordering.rst b/docs/source/queries_ordering.rst deleted file mode 100644 index 89fce705..00000000 --- a/docs/source/queries_ordering.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. _ordering: - -******** -Ordering -******** - -The :code:`order_by` function is used to provide an ordering to the results of -a query. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label == 'aa') - q = q.filter(c.phone.following.label.in_(['r','l'])) - q = q.filter(c.phone.discourse == 'a_discourse') - q = q.order_by(c.phone.begin) - - results = q.all() - print(results) - - -The results for the above query will be ordered by the timepoint of the -annotation. Ordering by time is most useful for when looking at single -discourses (as including multiple discourses in a query would invalidate the -ordering). - -.. note:: In grouped aggregate queries, ordering is by default by the - first :code:`group_by` attribute. This can be changed by calling :code:`order_by` - before evaluating with :code:`aggregate`. diff --git a/docs/source/queries_reference.rst b/docs/source/queries_reference.rst new file mode 100644 index 00000000..cd46b1f9 --- /dev/null +++ b/docs/source/queries_reference.rst @@ -0,0 +1,122 @@ + + +.. _queries_reference: + +*************** +Query Reference +*************** + + +Getting elements +================ + +:code:`c.phone` +:code:`c.lexicon_phone` +:code:`c.speaker` + + +Attributes +========== + +In addition to any values that get added through enrichment, there are several built in attributes that allow access to +different parts of the database. 
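+For example, several of these attributes can be combined in a single query. The following sketch (the output column
+names are illustrative) returns the durations of word-final 'aa' phones along with the label of the containing word:
+
+.. code-block:: python
+
+    with CorpusContext(config) as c:
+        q = c.query_graph(c.phone).filter(c.phone.label == 'aa')
+        # word-final: the phone's end time is aligned with its word's end time
+        q = q.filter(c.phone.end == c.phone.word.end)
+        q = q.columns(c.phone.word.label.column_name('word'),
+                      c.phone.duration.column_name('duration'))
+        q = q.order_by(c.phone.begin)
+        print(q.all())
+
+The built-in attributes are summarized in the table below.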
+ ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Attribute type | Code | Notes | ++======================================+=================================================+========================================+ +| Label [1]_ | :code:`c.phone.label` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Name [2]_ | :code:`c.speaker.name` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Begin [3]_ | :code:`c.phone.begin` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| End [3]_ | :code:`c.phone.end` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Duration [3]_ | :code:`c.phone.duration` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Previous annotation [3]_ | :code:`c.phone.previous` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Following annotation [3]_ | :code:`c.phone.following` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Previous pause [3]_ | :code:`c.phone.word.previous_pause` | Must be from a `word` annotation | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Following pause [3]_ | :code:`c.phone.word.following_pause` | Must be from a `word` annotation | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Speaker [3]_ | :code:`c.phone.speaker` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Discourse [3]_ | :code:`c.phone.discourse` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Pitch attribute [3]_ | :code:`c.phone.pitch` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Formants attribute [3]_ | :code:`c.phone.formants` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Intensity attribute [3]_ | :code:`c.phone.intensity` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Minimum value [4]_ | :code:`c.phone.pitch.min` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Maximum value [4]_ | :code:`c.phone.pitch.max` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Mean value [4]_ | :code:`c.phone.pitch.mean` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Raw track [4]_ | 
:code:`c.phone.pitch.track` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Sampled track [4]_ | :code:`c.phone.pitch.sampled_track` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Interpolated track [4]_ | :code:`c.phone.pitch.interpolated_track` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ + + + +.. [1] Only available for graph annotations and lexicon annotations +.. [2] Only available for speakers/discourses +.. [3] Only available for graph annotations +.. [4] Only available for acoustic attributes + +Filters +======= + ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Filter type | Code | Notes | ++======================================+=================================================+========================================+ +| Equal | :code:`c.phone.label == 'aa'` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Not equal | :code:`c.phone.label != 'aa'` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Greater than | :code:`c.phone.begin > 0` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Greater than or equal | :code:`c.phone.begin >= 0` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Less than | :code:`c.phone.end < 10` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Less than or equal | :code:`c.phone.end <= 10` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| In | :code:`c.phone.label.in_(['aa','ae'])` | :code:`in_` can also take a query | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Not in |:code:`c.phone.label.not_in_(['aa'])` | :code:`not_in_` can also take a query | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Is null | :code:`c.phone.label == None` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Is not null | :code:`c.phone.label != None` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Regular expression match | :code:`c.phone.label.regex('a,')` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| In subset | :code:`c.phone.subset == 'syllabic'` | | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Not in subset | :code:`c.phone.subset != 'syllabic'` | | 
++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Precedes pause |:code:`c.word.precedes_pause == True` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Does not precede pause |:code:`c.word.precedes_pause == False` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Follows pause |:code:`c.word.follows_pause == True` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Does not follow pause |:code:`c.word.follows_pause == False` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Right aligned |:code:`c.phone.end == c.phone.word.end` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Not right aligned |:code:`c.phone.end != c.phone.word.end`` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Left aligned |:code:`c.phone.begin == c.phone.word.begin` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ +| Not left aligned |:code:`c.phone.begin != c.phone.word.begin` | Only available for graph annotations | ++--------------------------------------+-------------------------------------------------+----------------------------------------+ \ No newline at end of file diff --git a/docs/source/queries_speakers.rst b/docs/source/queries_speakers.rst new file mode 100644 index 00000000..917aa0a2 --- /dev/null +++ b/docs/source/queries_speakers.rst @@ -0,0 +1,8 @@ + + +.. _speaker_queries: + +*************** +Speaker queries +*************** + diff --git a/docs/source/queries_subannotations.rst b/docs/source/queries_subannotations.rst deleted file mode 100644 index 18a344c3..00000000 --- a/docs/source/queries_subannotations.rst +++ /dev/null @@ -1,53 +0,0 @@ - - -.. _subannotations: - - -************** -Subannotations -************** - -Annotations can have subannotations associated with them. Subannotations -are not independent linguistic types, but have more information associated -with them than just a single property. For instance, voice onset time (VOT) -would be a subannotation of stops (as it has a begin time and an end time -that are of interest). Querying such subannotations would be performed as follows: - - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone) - q = q.columns(c.phone.vot.duration.column_name('vot')) - - results = q.all() - print(results) - -In some cases, it may be desirable to have more than one subannotation of -the same type associated with a single annotation. For instance, -voicing during the closure of a stop can take place at both the beginning -and end of closure, with an unvoiced period in the middle. 
Using a similar -query as above would get the durations of each of these (in the order of -their begin time): - - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone) - q = q.columns(c.phone.voicing_during_closure.duration.column_name('voicing')) - - results = q.all() - print(results) - -In some cases, we might like to know the total duration of such subannotations, -rather than the individual durations. To query that information, we can -use an ``aggregate``: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone) - results = q.aggregate(Sum(c.phone.voicing_during_closure.duration).column_name('total_voicing')) - - print(results) diff --git a/docs/source/queries_subpaths.rst b/docs/source/queries_subpaths.rst deleted file mode 100644 index 3aa06583..00000000 --- a/docs/source/queries_subpaths.rst +++ /dev/null @@ -1,81 +0,0 @@ - - -.. _subpaths: - - -******************** -Hierarchical queries -******************** - -A key facet of language is that it is hierarchical. Words contain phones, -and can be contained in larger utterances. There are several ways to -query hierarchical information. If we want to find all "aa" phones in the -word "dogs", then we can perform the following query: - - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = g.query_graph(g.phone).filter(g.phone.label == 'aa') - q = q.filter_contained_by(g.word.label == 'dogs') - - results = q.all() - print(results) - -The ``filter`` function can also be used for implicit containment queries: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = g.query_graph(g.phone).filter(g.phone.label == 'aa') - q = q.filter(g.word.label == 'dogs') - - results = q.all() - print(results) - -Starting from the word level, we might want to know what phones each word -contains. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = g.query_graph(g.word) - q = q.columns(g.word.phone.label.column('phones')) - - results = q.all() - print(results) - -In the output of the above query, there would be a column labeled "phones" -that contains a list of the labels of phones that belong to the word -(``['d', 'aa', 'g', 'z']``). Any property of phones can be queried this -way (i.e., 'begin', 'end', 'duration', etc). - -Special keywords exist for these containment columns. The keyword 'rate' -will return the elements per second for the word (i.e., phones per second). -The keyword 'count' will return the number of elements. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = g.query_graph(g.word) - q = q.columns(g.word.phone.rate.column('phones_per_second')) - q = q.columns(g.word.phone.count.column('num_phones')) - - results = q.all() - print(results) - -Additionally, there is a special keyword can be used to query the position -of a contained element in a containing one. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = g.query_graph(g.phone).filter(g.phone.label == 'aa') - q = q.filter(g.word.label == 'dogs') - q = q.columns(g.word.phone.position.column_name('position_in_word')) - - results = q.all() - print(results) - -The above query should return ``2`` for the value of 'position_in_word', -as the "aa" phone would be the second phone. 
diff --git a/docs/source/queries_subsets.rst b/docs/source/queries_subsets.rst deleted file mode 100644 index ca612a19..00000000 --- a/docs/source/queries_subsets.rst +++ /dev/null @@ -1,52 +0,0 @@ - -.. _subsetting: - - -********************** -Subsetting annotations -********************** - -In linguistics, it's often useful to specify subsets of symbols as particular classes. -For instance, phonemes are grouped together by whether they are syllabic, -their manner/place of articulation, and vowel height/backness/rounding, and -words are grouped by their parts of speech. - -In PolyglotDB, creating a subset is as follows: - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone).filter(c.phone.label.in_(['aa', 'ih'])) - q.set_type('+syllabic') - -After running that code, the phones 'aa' and 'ih' would be marked in the database -as '+syllabic'. The string for the category can contain any characters. -Once this category is encoded in the database, queries can be run just on -those subsets. - -.. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - q = c.query_graph(c.phone.subset('+syllabic')) - - results = q.all() - print(results) - -The above query will return all instances of 'aa' and 'ih' phones. - -.. note:: Using repeated subsets repeatedly in queries can make them overly - verbose. The objects that the queries use are normal Python objects - and can therefore be assigned to variables for easier use. - - .. code-block:: python - - with CorpusContext(corpus_name = 'my_corpus', **graph_db_login) as c: - syl = c.phone.subset('+syllabic') - q = c.query_graph(syl) - q = q.filter(syl.end == c.word.end) - - results = q.all() - print(results) - - The above query would find all phones marked by '+syllabic' that are - at the ends of words. 
diff --git a/polyglotdb/corpus/featured.py b/polyglotdb/corpus/featured.py index 4801a79f..c744e001 100644 --- a/polyglotdb/corpus/featured.py +++ b/polyglotdb/corpus/featured.py @@ -2,9 +2,22 @@ from ..io.importer import feature_data_to_csvs, import_feature_csvs from .lexical import LexicalContext from ..exceptions import SubsetError +from ..io.enrichment.features import enrich_features_from_csv class FeaturedContext(LexicalContext): + def enrich_inventory_from_csv(self, path): + """ + Enriches corpus from a csv file + + Parameters + ---------- + path : str + the path to the csv file + """ + + enrich_features_from_csv(self, path) + def encode_class(self, phones, label): """ encodes phone classes diff --git a/polyglotdb/corpus/lexical.py b/polyglotdb/corpus/lexical.py index 2f69d50d..9906745f 100644 --- a/polyglotdb/corpus/lexical.py +++ b/polyglotdb/corpus/lexical.py @@ -1,4 +1,5 @@ from ..io.importer import lexicon_data_to_csvs, import_lexicon_csvs +from ..io.enrichment.lexical import enrich_lexicon_from_csv from .spoken import SpokenContext @@ -25,3 +26,16 @@ def enrich_lexicon(self, lexicon_data, type_data=None, case_sensitive=False): def reset_lexicon(self): pass + + def enrich_lexicon_from_csv(self, path, case_sensitive=False): + """ + Enriches lexicon from a csv file + + Parameters + ---------- + path : str + the path to the csv file + case_sensitive : boolean + Defaults to false + """ + enrich_lexicon_from_csv(self, path, case_sensitive) \ No newline at end of file diff --git a/polyglotdb/corpus/spoken.py b/polyglotdb/corpus/spoken.py index 8b546f61..debfd991 100644 --- a/polyglotdb/corpus/spoken.py +++ b/polyglotdb/corpus/spoken.py @@ -1,9 +1,32 @@ from ..io.importer import (speaker_data_to_csvs, import_speaker_csvs, discourse_data_to_csvs, import_discourse_csvs) from .audio import AudioContext +from ..io.enrichment.spoken import enrich_speakers_from_csv, enrich_discourses_from_csv class SpokenContext(AudioContext): + def enrich_speakers_from_csv(self, path): + """ + Enriches speakers from a csv file + + Parameters + ---------- + path : str + the path to the csv file + """ + enrich_speakers_from_csv(self, path) + + def enrich_discourses_from_csv(self, path): + """ + Enriches discourses from a csv file + + Parameters + ---------- + path : str + the path to the csv file + """ + enrich_discourses_from_csv(self, path) + def get_speakers_in_discourse(self,discourse): query = '''MATCH (d:Discourse:{corpus_name})<-[:speaks_in]-(s:Speaker:{corpus_name}) WHERE d.name = {{discourse_name}} diff --git a/polyglotdb/corpus/syllabic.py b/polyglotdb/corpus/syllabic.py index 46d26bb7..36d78fb7 100644 --- a/polyglotdb/corpus/syllabic.py +++ b/polyglotdb/corpus/syllabic.py @@ -140,7 +140,7 @@ def has_syllabics(self): def has_syllables(self): return 'syllable' in self.hierarchy.annotation_types - def encode_syllables(self, algorithm='probabilistic', call_back=None, stop_check=None): + def encode_syllables(self, algorithm='maxonset', call_back=None, stop_check=None): """ Encodes syllables to a corpus @@ -326,7 +326,7 @@ def enrich_syllables(self, syllable_data, type_data=None): self.encode_hierarchy() - def encode_stress(self, pattern): + def _generate_stress_enrichment(self, pattern): """ encode stress based off of CMUDict cues @@ -357,9 +357,7 @@ def encode_stress(self, pattern): return enrich_dict - # self.enrich_syllables(enrich_dict) - - def encode_tone(self, pattern): + def _generate_tone_enrichment(self, pattern): """ encode tone based off of CMUDict cues """ @@ -385,18 +383,25 @@ def 
encode_tone(self, pattern): enrich_dict.update({syl: {'tone': end}}) return enrich_dict - # self.enrich_syllables(enrich_dict) - def encode_stresstone_to_syllables(self, encode_type, regex): + def encode_stress_to_syllables(self, regex=None, clean_phone_label=True): + if regex is None: + regex = '[0-9]' - if encode_type == 'stress': - if regex == "": - enrich_dict = self.encode_stress('[0-9]') - else: - enrich_dict = self.encode_stress(regex) - else: - enrich_dict = self.encode_tone(regex) + enrich_dict = self._generate_stress_enrichment(regex) + + if clean_phone_label: + self.remove_pattern(regex) + self.enrich_syllables(enrich_dict) + self.encode_hierarchy() + + def encode_tone_to_syllables(self, regex=None, clean_phone_label=True): + if regex is None: + regex = '[0-9]' + + enrich_dict = self._generate_tone_enrichment(regex) - self.remove_pattern(regex) + if clean_phone_label: + self.remove_pattern(regex) self.enrich_syllables(enrich_dict) self.encode_hierarchy() diff --git a/tests/test_client.py b/tests/test_client.py index ae33790b..d032412b 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -17,8 +17,9 @@ def test_client_create_database(graph_db, localhost): response = client.create_database('test_database') ports = client.get_ports('test_database') - assert ports == {'graph_http_port': 7404, 'graph_bolt_port': 7406, - 'acoustic_http_port': 8404} + assert 'graph_http_port' in ports + assert 'graph_bolt_port' in ports + assert 'acoustic_http_port' in ports def test_client_database_list(localhost): diff --git a/tests/test_enrich.py b/tests/test_enrich.py index 70da59cc..7393e63c 100644 --- a/tests/test_enrich.py +++ b/tests/test_enrich.py @@ -111,7 +111,7 @@ def test_stress_enrichment(stressed_config): with CorpusContext(stressed_config) as c: c.encode_syllabic_segments(syllabics) c.encode_syllables("maxonset") - c.encode_stresstone_to_syllables('stress', '[0-2]$') + c.encode_stress_to_syllables(regex='[0-2]$') assert (c.hierarchy.has_type_property("syllable", "stress")) diff --git a/tests/test_summarized.py b/tests/test_summarized.py index 0e1bb023..c4ae7d06 100644 --- a/tests/test_summarized.py +++ b/tests/test_summarized.py @@ -167,7 +167,7 @@ def test_syllable_mean_duration(summarized_config): print("syllable mean:") res = g.get_measure('duration', 'mean', 'syllable') print(res) - assert (len(res) == 55) + assert (len(res) == 57) for i, r in enumerate(res): if r[0] == 'w.er.d.z': break @@ -203,7 +203,7 @@ def test_syllable_median(summarized_config): res = g.get_measure('duration', 'median', 'syllable') print(res) - assert (len(res) == 55) + assert (len(res) == 57) def test_syllable_std_dev(summarized_config): @@ -215,7 +215,7 @@ def test_syllable_std_dev(summarized_config): print("syllable std dev:") res = g.get_measure('duration', 'stdev', 'syllable') - assert (len(res) == 55) + assert (len(res) == 57) g.reset_syllables()