/
test_extractors.py
156 lines (125 loc) · 4.14 KB
/
test_extractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import pytest
import spacy
import textdescriptives as td
from textdescriptives.extractors import download_spacy_model, load_spacy_model
# pylint: disable=missing-function-docstring
@pytest.fixture(scope="function")
def nlp():
    """Return an ``en_core_web_sm`` pipeline with ``textdescriptives/all`` added.

    The fixture is function-scoped, so the model is loaded anew for every
    test that requests it.
    """
    pipeline = spacy.load("en_core_web_sm")
    pipeline.add_pipe("textdescriptives/all")
    return pipeline
def test_extract_df_single_doc(nlp):
    """Smoke test: ``extract_df`` runs on a single Doc, with and without ``metrics``.

    Nothing is asserted; the test only fails if one of the calls raises.
    """
    processed = nlp("This is just a cute little text. Actually, it's two sentences.")

    # Default call first, then once for each named metric group.
    td.extract_df(processed)
    metric_names = (
        "descriptive_stats",
        "readability",
        "dependency_distance",
        "quality",
    )
    for name in metric_names:
        td.extract_df(processed, metrics=name)
def test_extract_df_pipe(nlp):
    """``extract_df`` on the output of ``nlp.pipe`` gives one row per input text.

    Also checks that a selection of expected metric columns is present.
    """
    sentences = [
        "I wonder how well the function works on multiple documents",
        "Very exciting to see, don't you think?",
    ]
    frame = td.extract_df(nlp.pipe(sentences))

    expected_columns = (
        "lix",
        "dependency_distance_mean",
        "n_stop_words",
        "pos_prop_VERB",
        "n_tokens",
        "first_order_coherence",
    )
    for column in expected_columns:
        assert column in frame.columns
    assert len(frame) == len(sentences)
def test_extract_dict_single_doc(nlp):
    """``extract_dict`` on a single Doc returns a one-element list of dicts.

    For each requested metric group the single dict must carry the original
    document text under the ``"text"`` key.
    """
    parsed = nlp("This is just a cute little text. Actually, it's two sentences.")

    # The default call is only checked for not raising.
    td.extract_dict(parsed)

    for metric_name in ("descriptive_stats", "readability", "dependency_distance"):
        records = td.extract_dict(parsed, metrics=metric_name)
        assert isinstance(records, list)
        assert len(records) == 1

        record = records[0]
        assert isinstance(record, dict)
        assert record.pop("text") == parsed.text
def test_extract_df_only_pos():
    """Smoke test: ``extract_df`` runs when only ``pos_proportions`` is in the pipeline.

    Nothing is asserted; the test only fails if a call raises.
    """
    pos_pipeline = spacy.load("en_core_web_sm")
    pos_pipeline.add_pipe("textdescriptives/pos_proportions")

    parsed = pos_pipeline(
        "This is just a cute little text. Actually, it's two sentences."
    )
    td.extract_df(parsed, metrics="pos_proportions")
@pytest.mark.parametrize("lang", ["en", "da"])
def test_download_spacy_model(lang):
    """Smoke test: ``download_spacy_model`` runs for each language at size ``"sm"``.

    Nothing is asserted; the test only fails if the call raises.
    """
    model_size = "sm"
    download_spacy_model(lang=lang, size=model_size)
def test_load_spacy_model():
    """A metric list that includes ``coherence`` gives a pipeline with ``tok2vec``."""
    requested_metrics = ["descriptive_stats", "readability", "coherence"]
    pipeline = load_spacy_model(
        spacy_model=None,
        lang="en",
        metrics=requested_metrics,
        spacy_model_size="sm",
    )
    assert "tok2vec" in pipeline.pipe_names
def test_load_spacy_model_blank():
    """Requesting only ``descriptive_stats`` gives a pipeline without ``tok2vec``."""
    requested_metrics = ["descriptive_stats"]
    pipeline = load_spacy_model(
        spacy_model=None,
        lang="en",
        metrics=requested_metrics,
        spacy_model_size="sm",
    )
    assert "tok2vec" not in pipeline.pipe_names
def test_extract_single_doc():
    """``extract_metrics`` on one string with an explicit model yields a ``lix`` column."""
    sample = "This is just a cute little text. Actually, it's two sentences."
    frame = td.extract_metrics(
        sample,
        spacy_model="en_core_web_sm",
        metrics="readability",
    )
    assert "lix" in frame.columns
def test_extract_with_lang():
    """``extract_metrics`` accepts ``lang`` plus ``spacy_model_size`` instead of a model name."""
    sample = "This is just a cute little text. Actually, it's two sentences."
    frame = td.extract_metrics(
        sample,
        metrics="dependency_distance",
        lang="en",
        spacy_model_size="sm",
    )
    assert "dependency_distance_mean" in frame.columns
@pytest.mark.parametrize(
    "text",
    [
        "This is just a cute little text. Actually, it's two sentences. No, it's three",
        [
            "This is just a cute little text. Actually, it's two sentences. No, it's three.",
            "Two documents in this bad boy. Let's see how it works.",
        ],
    ],
)
def test_extract_similar_extract_df(text):
    """``extract_metrics`` and a hand-built pipeline + ``extract_df`` must agree.

    Runs once with a single string and once with a list of strings; in both
    cases the two resulting DataFrames are compared with ``DataFrame.equals``.
    """
    model_name = "en_core_web_sm"

    # Route 1: the one-shot convenience function.
    via_extract_metrics = td.extract_metrics(
        text,
        spacy_model=model_name,
        metrics="coherence",
    )

    # Route 2: build the pipeline by hand and extract from the piped docs.
    coherence_pipeline = spacy.load(model_name)
    coherence_pipeline.add_pipe("textdescriptives/coherence")
    texts = [text] if isinstance(text, str) else text
    via_extract_df = td.extract_df(coherence_pipeline.pipe(texts))

    assert via_extract_metrics.equals(via_extract_df)
def test_extract_model_not_needed():
    """``extract_metrics`` works for ``descriptive_stats`` when given only ``lang``."""
    sample = "This is just a cute little text. Actually, it's two sentences."
    frame = td.extract_metrics(
        sample,
        metrics="descriptive_stats",
        lang="en",
    )
    assert "n_tokens" in frame.columns
def test_extract_metrics_twice():
    """Calling ``extract_metrics`` twice with different metrics works both times.

    The first call requests ``coherence`` and the second ``descriptive_stats``
    on the same text. Besides checking that neither call raises, each result
    is checked for a column belonging to the metric group that was requested,
    so a second call that silently returned nothing useful would be caught.
    """
    text = "Just a small test"

    coherence_df = td.extract_metrics(
        text,
        metrics="coherence",
        lang="en",
    )
    stats_df = td.extract_metrics(
        text,
        metrics="descriptive_stats",
        lang="en",
    )

    # Only column presence is checked: the text is a single sentence, so the
    # coherence value itself is not asserted.
    assert "first_order_coherence" in coherence_df.columns
    assert "n_tokens" in stats_df.columns