-
Notifications
You must be signed in to change notification settings - Fork 5
/
text_complexity.py
321 lines (266 loc) · 8.35 KB
/
text_complexity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
from autocorrect import Speller
import re
import spacy
#from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
import json
from app.ocr.wordcount import wordlist
# initializing object
# Load spaCy's small English pipeline once at import time; used below for
# its stop-word list (tokenize) and part-of-speech tags (descriptiveness).
nlp = spacy.load("en_core_web_sm")
# English autocorrect speller shared by all spellcheck-based metrics.
spell = Speller(lang='en')
def spellcheck(input_str: str) -> str:
    """
    Run the autocorrect speller over the entire input and return the
    corrected text as one string.
    """
    return spell(input_str)
def tokenize(input_str: str) -> list:
    """
    Split the input into lowercase word tokens, keeping only letters,
    digits, spaces and periods, and dropping spaCy's English stop words.

    Returns a list of token strings. (The original annotation said
    ``-> str``, but the function has always returned a list.)
    """
    # Strip every character except letters, digits, spaces and periods.
    cleaned = re.sub('[^a-zA-Z 0-9 \.]', '', input_str)
    tokens = cleaned.lower().split()
    stop_words = nlp.Defaults.stop_words
    return [token for token in tokens if token not in stop_words]
def descriptiveness(input_str: str) -> float:
    '''
    Spellcheck the input, part-of-speech tag it with spaCy, and return the
    ratio of verbs/adjectives to nouns/proper nouns, capped at 1.

    Returns 0 when the text contains no nouns at all. (Annotation
    corrected from ``-> str``: the function returns a number.)
    '''
    doc = nlp(spellcheck(input_str))
    noun_count = 0
    modifier_count = 0
    for token in doc:
        if token.pos_ in ("PROPN", "NOUN"):
            noun_count += 1
        elif token.pos_ in ("VERB", "ADJ"):
            modifier_count += 1
    # Avoid division by zero for noun-free text.
    if noun_count == 0:
        return 0
    return min(modifier_count / noun_count, 1)
def spellchecked_words(input_str: str) -> int:
    '''
    Count how many tokens change under spell correction: tokenize the raw
    text and the corrected text, then count raw tokens that no longer
    appear anywhere in the corrected token list.
    '''
    corrected_tokens = tokenize(spellcheck(input_str))
    original_tokens = tokenize(input_str)
    return sum(1 for word in original_tokens if word not in corrected_tokens)
def efficiency(input_str: str) -> float:
    """
    Fraction of tokens that did NOT need spell correction: (total tokens -
    corrected tokens) / total tokens, a value between 0 and 1.

    Returns 0 for empty input. (Annotation corrected from ``-> int``:
    the division yields a float.)
    """
    total = len(tokenize(input_str))
    if total == 0:
        return 0
    return (total - spellchecked_words(input_str)) / total
def unique_words(input_str: str) -> float:
    """
    Fraction of tokens that are distinct: unique tokens / total tokens,
    a value between 0 and 1. Returns 0 for empty input.

    (Annotation corrected from ``-> int``; the redundant parallel
    list/set building loop is replaced by a direct set conversion.)
    """
    words = tokenize(input_str)
    if not words:
        return 0
    return len(set(words)) / len(words)
def avg_sentence_length(input_str: str) -> float:
    """
    Score average sentence length on a 0..1 scale: tokens per sentence
    (a sentence boundary is any token containing '.') divided by 10,
    capped at 1. Returns 0 when no sentence terminator is found.

    (Annotation corrected from ``-> int``; the original's second loop,
    which merely copied the token list into another list to take its
    length, is removed.)
    """
    tokens = tokenize(input_str)
    sentence_count = sum(1 for word in tokens if '.' in word)
    if sentence_count == 0:
        return 0
    words_per_sentence = len(tokens) / sentence_count
    return min(words_per_sentence / 10, 1)
def avg_len_words(input_str: str) -> float:
    """
    Average token length divided by 10, capped at 1.

    Returns 0 for empty input. BUG FIX: the original computed
    ``sum(arr) / len(arr)`` BEFORE its ``len(arr) == 0`` guard, so empty
    input raised ZeroDivisionError; the guard now runs first.
    (Annotation also corrected from ``-> int``.)
    """
    words = tokenize(input_str)
    if not words:
        return 0
    average_length = sum(len(word) for word in words) / len(words)
    return min(average_length / 10, 1)
def vocab_length(input_str: str) -> float:
    '''
    Average word length over the set of UNIQUE tokens, divided by 10.

    Returns 0 for empty input. BUG FIX: the original divided before any
    guard (its ``y == 0`` check came after the division), so empty input
    raised ZeroDivisionError; the emptiness check now runs first.
    NOTE: unlike avg_len_words this value is not capped at 1 —
    preserved from the original behavior. (Annotation corrected from
    ``-> int``.)
    '''
    unique_tokens = set(tokenize(input_str))
    if not unique_tokens:
        return 0
    return (sum(len(word) for word in unique_tokens) / len(unique_tokens)) / 10
def good_vocab(input_str: str) -> float:
    '''
    Fraction of tokens that appear in the imported ``wordlist`` of
    advanced vocabulary, a value between 0 and 1.

    Returns 0 for empty input. BUG FIX: the original had no emptiness
    guard and raised ZeroDivisionError on empty/stop-word-only input.
    (Annotation corrected from ``-> int``.)
    '''
    words = tokenize(input_str)
    if not words:
        return 0
    matches = sum(1 for word in words if word in wordlist)
    return matches / len(words)
def evaluate(input_str: str) -> float:
    '''
    Weighted combination of the individual text metrics into one overall
    score: vocab length (.1), good vocab (.2), sentence length (.1),
    spelling efficiency (.1), descriptiveness (.1).

    (Annotation corrected from ``-> int``: the weighted sum is a float.
    NOTE(review): the weights sum to 0.6, not 1.0 — presumably intentional
    scaling, but worth confirming.)
    '''
    score = (
        (.1 * vocab_length(input_str)) +
        (.2 * good_vocab(input_str)) +
        (.1 * avg_sentence_length(input_str)) +
        (.1 * efficiency(input_str)) +
        (.1 * descriptiveness(input_str))
    )
    return score
def good_vocab_stars(input_str: str) -> float:
    '''
    Map the good_vocab score onto a 0-5 star scale (score scaled by
    1/0.12, rounded to the nearest half star, capped at 5).

    (Annotation corrected from ``-> int``: ``round(x*2)/2`` yields
    half-star float values.)
    '''
    scaled = good_vocab(input_str) / .12
    half_stars = round(scaled * 2) / 2
    return min(half_stars, 5)
def efficiency_stars(input_str: str) -> float:
    '''
    Map the spellcheck-efficiency score onto a 0-5 star scale (score
    scaled by 1/0.15, rounded to the nearest half star, capped at 5).

    (Annotation corrected from ``-> int``: half stars are floats.)
    '''
    scaled = efficiency(input_str) / .15
    half_stars = round(scaled * 2) / 2
    return min(half_stars, 5)
def vocab_length_stars(input_str: str) -> float:
    '''
    Map the average-word-length score onto a 0-5 star scale (score
    scaled by 1/0.15, rounded to the nearest half star, capped at 5).

    (Annotation corrected from ``-> int``: half stars are floats.)
    '''
    scaled = vocab_length(input_str) / .15
    half_stars = round(scaled * 2) / 2
    return min(half_stars, 5)
def avg_sentence_length_stars(input_str: str) -> float:
    '''
    Map the average-sentence-length score onto a 0-5 star scale (score
    scaled by 1/0.12, rounded to the nearest half star, capped at 5).

    (Annotation corrected from ``-> int``: half stars are floats.)
    '''
    scaled = avg_sentence_length(input_str) / .12
    half_stars = round(scaled * 2) / 2
    return min(half_stars, 5)
def descriptiveness_stars(input_str: str) -> float:
    '''
    Map the descriptiveness score onto a 0-5 star scale (score scaled by
    1/0.15, rounded to the nearest half star, capped at 5).

    (Annotation corrected from ``-> int``: half stars are floats.)
    '''
    scaled = descriptiveness(input_str) / .15
    half_stars = round(scaled * 2) / 2
    return min(half_stars, 5)
def evaluate_stars(input_str: str) -> float:
    '''
    Map the overall evaluate() score onto a 0-5 star scale (score scaled
    by 1/0.15, rounded to the nearest half star, capped at 5).

    (Fixed: the original docstring was copy-pasted from
    descriptiveness_stars and wrongly said "descriptiveness".
    Annotation corrected from ``-> int``: half stars are floats.)
    '''
    scaled = evaluate(input_str) / .15
    half_stars = round(scaled * 2) / 2
    return min(half_stars, 5)
def get_text_scores(input_str: str) -> dict:
    """
    Collect every raw metric for the input text into one dict keyed by
    metric name. (Annotation corrected from ``-> str``: returns a dict.)
    """
    return {
        "vocab_length": vocab_length(input_str),
        "avg_sentence_length": avg_sentence_length(input_str),
        "efficiency": efficiency(input_str),
        "descriptiveness": descriptiveness(input_str),
        "good_vocab": good_vocab(input_str),
        "evaluate": evaluate(input_str)
    }
def get_text_scores_stars(input_str: str) -> dict:
    """
    Collect every star-scale metric for the input text into one dict
    keyed by metric name. (Annotation corrected from ``-> str``:
    returns a dict.)
    """
    return {
        "vocab_length": vocab_length_stars(input_str),
        "avg_sentence_length": avg_sentence_length_stars(input_str),
        "efficiency": efficiency_stars(input_str),
        "descriptiveness": descriptiveness_stars(input_str),
        "good_vocab": good_vocab_stars(input_str),
        "evaluate": evaluate_stars(input_str)
    }
if __name__ == '__main__':
    # Manual smoke test of the scoring pipeline.
    string = "Great success. My name is Borat. I have come to America, to find Pamela Anderson, and \
make her my wife. Very nice!"
    x = string
    print(tokenize(x))
    print(vocab_length(x))
    print(avg_sentence_length(x))
    print(efficiency(x))
    print(descriptiveness(x))
    print(good_vocab(x))
    print(evaluate(x))
    # BUG FIX: the original called store(x), word_length_stars(x) and
    # sentence_length_stars(x) — none of those names exist anywhere in
    # this module, so the script crashed with NameError. store() was
    # dropped; the star calls now use the functions actually defined
    # above (vocab_length_stars / avg_sentence_length_stars).
    print(good_vocab_stars(x))
    print(efficiency_stars(x))
    print(vocab_length_stars(x))
    print(avg_sentence_length_stars(x))
    print(descriptiveness_stars(x))