"""Calculation of descriptive statistics."""
from typing import Callable, Dict, Union
import numpy as np
from spacy.language import Language
from spacy.tokens import Doc, Span
from .utils import filter_tokens, n_sentences, n_syllables, n_tokens
class DescriptiveStatistics:
"""spaCy v.3.0 component that adds attributes with desriptive statistics to
`Doc` and `Span` objects.
The attributes relate to token and sentence length, number of
syllables, and counts of tokens and sentences.
"""

    def __init__(self, nlp: Language):
        """Initialise components."""
        extensions: Dict[str, Callable] = {
            "_n_sentences": n_sentences,
            "_n_tokens": n_tokens,
            "_n_syllables": n_syllables,
            "token_length": self.token_length,
            "sentence_length": self.sentence_length,
            "syllables": self.syllables,
            "counts": self.counts,
            "descriptive_stats": self.descriptive_stats,
        }
        # Sentence- and syllable-level extensions only make sense on full Docs;
        # all other extensions are registered as getters on both Doc and Span.
        for extension_name, getter_fun in extensions.items():
            if extension_name not in [
                "_n_sentences",
                "sentence_length",
                "syllables",
            ] and not Span.has_extension(extension_name):
                Span.set_extension(extension_name, getter=getter_fun)
            if not Doc.has_extension(extension_name):
                Doc.set_extension(extension_name, getter=getter_fun)

    def token_length(self, doc: Union[Doc, Span]) -> dict:
        """Calculate mean, median and std of token length for a `Doc` or
        `Span`.

        Returns:
            dict: token_length_mean, token_length_median, token_length_std
        """
        token_lengths = [len(token) for token in filter_tokens(doc)]
        if not token_lengths:
            return {
                "token_length_mean": np.nan,
                "token_length_median": np.nan,
                "token_length_std": np.nan,
            }
        return {
            "token_length_mean": np.mean(token_lengths),
            "token_length_median": np.median(token_lengths),
            "token_length_std": np.std(token_lengths),
        }

    def sentence_length(self, doc: Doc) -> dict:
        """Calculate mean, median and std of sentence length for a `Doc`.

        Returns:
            dict: sentence_length_mean, sentence_length_median,
            sentence_length_std
        """
        # Length of each sentence, counting only non-punctuation tokens
        # that do not contain an apostrophe.
        tokenized_sentences = [
            [
                token.text
                for token in sent
                if not token.is_punct and "'" not in token.text
            ]
            for sent in doc.sents
        ]
        len_sentences = [len(sentence) for sentence in tokenized_sentences]
        if not len_sentences:
            return {
                "sentence_length_mean": np.nan,
                "sentence_length_median": np.nan,
                "sentence_length_std": np.nan,
            }
        return {
            "sentence_length_mean": np.mean(len_sentences),
            "sentence_length_median": np.median(len_sentences),
            "sentence_length_std": np.std(len_sentences),
        }

    def syllables(self, doc: Doc) -> dict:
        """Calculate mean, median and std of syllables per token for a `Doc`.
        Uses `Pyphen` for hyphenation.

        Returns:
            dict: syllables_per_token_mean, syllables_per_token_median,
            syllables_per_token_std
        """
        n_syllables = doc._._n_syllables
        if not n_syllables:
            return {
                "syllables_per_token_mean": np.nan,
                "syllables_per_token_median": np.nan,
                "syllables_per_token_std": np.nan,
            }
        return {
            "syllables_per_token_mean": np.mean(n_syllables),
            "syllables_per_token_median": np.median(n_syllables),
            "syllables_per_token_std": np.std(n_syllables),
        }

    def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True) -> dict:
        """Calculate counts of tokens, unique tokens, and characters for a
        `Doc` or `Span`. Adds number of sentences for `Doc` objects.

        Args:
            ignore_whitespace: if True, whitespace is not counted as a
                character when counting the number of characters.

        Returns:
            dict: n_tokens, n_unique_tokens, proportion_unique_tokens,
            n_characters, (n_sentences)
        """
        n_tokens = doc._._n_tokens
        n_types = len({tok.lower_ for tok in filter_tokens(doc)})
        if ignore_whitespace:
            n_chars = len(doc.text.replace(" ", ""))
        else:
            n_chars = len(doc.text)
        prop_unique_tokens = np.nan if n_tokens == 0 else n_types / n_tokens
        out = {
            "n_tokens": n_tokens,
            "n_unique_tokens": n_types,
            "proportion_unique_tokens": prop_unique_tokens,
            "n_characters": n_chars,
        }
        if isinstance(doc, Doc):
            out["n_sentences"] = doc._._n_sentences
        return out

    def descriptive_stats(self, doc: Union[Doc, Span]) -> dict:
        """Get all descriptive statistics in a single dict."""
        out = {**doc._.counts, **doc._.token_length}
        if isinstance(doc, Span):
            # Sentence- and syllable-level statistics are only defined for Docs.
            return out
        return {**out, **doc._.sentence_length, **doc._.syllables}

    def __call__(self, doc):
        """Run the pipeline component."""
        return doc


@Language.factory(
    "textdescriptives/descriptive_stats",
    assigns=[
        "doc._._n_sentences",
        "doc._._n_tokens",
        "doc._._n_syllables",
        "doc._.token_length",
        "doc._.sentence_length",
        "doc._.syllables",
        "doc._.counts",
        "doc._.descriptive_stats",
        "span._.token_length",
        "span._.counts",
        "span._.descriptive_stats",
    ],
)
def create_descriptive_stats_component(
    nlp: Language,
    name: str,
) -> Callable[[Doc], Doc]:
    """Allows DescriptiveStatistics to be added to a spaCy pipe using
    nlp.add_pipe("textdescriptives/descriptive_stats").

    Adding the component to the pipe will add the following attributes to
    `Doc` and `Span` objects:

    - `doc._._n_sentences`
    - `doc._._n_tokens`
    - `doc._.token_length`
    - `doc._.sentence_length`
    - `doc._.syllables`
    - `doc._.counts`
    - `doc._.descriptive_stats`
    - `span._.token_length`
    - `span._.counts`
    - `span._.descriptive_stats`

    Args:
        nlp (Language): spaCy language object, does not need to be specified
            in the nlp.add_pipe call.
        name (str): name of the component. Can be optionally specified in the
            nlp.add_pipe call, using the name argument.

    Returns:
        Callable[[Doc], Doc]: DescriptiveStatistics component

    Example:
        >>> import spacy
        >>> nlp = spacy.blank("en")
        >>> # add sentencizer
        >>> nlp.add_pipe("sentencizer")
        >>> # add descriptive stats
        >>> nlp.add_pipe("textdescriptives/descriptive_stats")
        >>> # apply to a document
        >>> doc = nlp("This is a sentence. This is another sentence.")
        >>> doc._.descriptive_stats
    """
    sentencizers = {"sentencizer", "parser"}
    if not sentencizers.intersection(set(nlp.pipe_names)):
        nlp.add_pipe("sentencizer")  # add a sentencizer if none is in the pipe
    return DescriptiveStatistics(nlp)
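

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library). Assumes this module lives
# inside its package so the relative `.utils` import resolves, and that
# Pyphen supports the pipeline language for syllable counts. Importing the
# module registers the "textdescriptives/descriptive_stats" factory, so the
# component can be added to any spaCy pipeline by name.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import spacy

    # A blank English pipeline; the factory adds a sentencizer automatically
    # because neither a sentencizer nor a parser is present.
    nlp = spacy.blank("en")
    nlp.add_pipe("textdescriptives/descriptive_stats")

    doc = nlp("This is a sentence. This is another, slightly longer sentence.")

    # Doc-level statistics: counts, token/sentence length, and syllables.
    print(doc._.descriptive_stats)

    # Span-level statistics are limited to counts and token length.
    span = doc[0:4]
    print(span._.descriptive_stats)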