-
Notifications
You must be signed in to change notification settings - Fork 8
/
main.py
121 lines (93 loc) · 3.19 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
'''
1) Associate words with their grammatical counterparts. (e.g. "city" and "cities")
2) Calculate the occurrence of each word in the text.
3) Assign each word points based on how often it occurs.
4) Detect which periods represent the end of a sentence (e.g. "Mr." does not).
5) Split up the text into individual sentences.
6) Rank sentences by the sum of their words' points.
7) Return X of the most highly ranked sentences in chronological order.
'''
import codecs
import sys
import spacy
def main():
    """Summarize the document named on the command line.

    Usage: ``main.py <document> [sentence_count]``

    Reads the document as UTF-8, runs it through the spaCy pipeline, counts
    how often each lemma occurs (tokens are visited through ``each_word``),
    ranks the sentences with ``get_ranked`` and prints the selected
    sentences joined by single spaces. ``sentence_count`` defaults to 3.

    Exits with status 1 when no document name is given. A non-integer
    ``sentence_count`` raises ``ValueError`` from ``int``.
    """
    args = sys.argv

    # Document name should be provided
    if len(args) < 2:
        # Single-argument parenthesized print behaves the same on
        # Python 2 and Python 3.
        print("Provide document")
        # sys.exit is always available; the bare ``exit`` name is only
        # injected by the ``site`` module.
        sys.exit(1)

    filename = args[1]
    sentence_count = 3

    # Read number of sentences to output if provided. Using >= keeps the
    # requested count even when extra trailing arguments are present.
    if len(args) >= 3:
        sentence_count = int(args[2])

    # Read file as utf-8; the context manager closes the handle even if
    # reading or decoding fails.
    with codecs.open(filename, encoding='utf-8') as document_file:
        contents = document_file.read()

    # Process file contents
    # NOTE(review): the 'en' shortcut name is presumably only resolvable on
    # older spaCy releases - confirm against the installed version.
    nlp = spacy.load('en')
    doc = nlp(contents)

    # 1, 2, 3
    occurrences = {}

    def fill_occurrences(word):
        # Count tokens by lemma so inflected forms share one entry.
        word_lemma = lemma(word)
        occurrences[word_lemma] = occurrences.get(word_lemma, 0) + 1

    each_word(doc, fill_occurrences)

    # 4, 5, 6
    ranked = get_ranked(doc.sents, sentence_count, occurrences)

    # 7
    print(" ".join([x['sentence'].text for x in ranked]))
def each_word(words, func):
    """Call ``func`` on every item of ``words`` that is not punctuation.

    Args:
        words: Iterable of token-like objects exposing a ``pos_`` attribute.
        func: Callable invoked with each non-punctuation token, in order.
            Its return value is ignored.
    """
    for word in words:
        # Compare by value. ``is`` tests object identity, which only matches
        # when both strings happen to be the same interned object, so
        # punctuation could slip through.
        if word.pos_ == "PUNCT":
            continue
        func(word)
def get_ranked(sentences, sentence_count, occurrences):
    """Pick the highest-scoring sentences, kept in the order they were seen.

    Sentences are consumed in a single pass. The first ``sentence_count``
    sentences fill the result; after that, a sentence replaces the current
    lowest-scoring entry only when its score is strictly greater. The
    evicted entry is removed and the newcomer appended, so the result stays
    in chronological order.

    Args:
        sentences: Iterable of sentences; each is passed to ``get_score``.
        sentence_count: Maximum number of sentences to keep.
        occurrences: Mapping passed through to ``get_score``.

    Returns:
        A list of ``{'sentence': ..., 'score': ...}`` dicts with at most
        ``sentence_count`` entries. Empty when ``sentence_count`` is not
        positive.
    """
    # Nothing can be kept; this also avoids indexing into an empty list below.
    if sentence_count <= 0:
        return []

    # Maintain ranked sentences for easy output
    ranked = []
    # Maintain the position and score of the lowest entry for easy removal.
    # None means "no entry seen yet".
    lowest = 0
    lowest_score = None

    for sent in sentences:
        score = get_score(occurrences, sent)

        # Fill ranked if not at capacity
        if len(ranked) < sentence_count:
            if lowest_score is None or score < lowest_score:
                # The entry about to be appended lands at index len(ranked).
                lowest = len(ranked)
                lowest_score = score
            ranked.append({'sentence': sent, 'score': score})
            continue

        # Insert if score is greater
        if score > lowest_score:
            # Dropping the lowest entry and appending the newcomer shifts
            # later entries left, which maintains chronological order.
            del ranked[lowest]
            ranked.append({'sentence': sent, 'score': score})

            # Re-locate the lowest entry; min() returns the earliest one
            # when several entries tie.
            lowest = min(range(len(ranked)),
                         key=lambda i: ranked[i]['score'])
            lowest_score = ranked[lowest]['score']

    return ranked
def lemma(word):
    """Return the ``lemma_`` attribute of ``word``."""
    base_form = word.lemma_
    return base_form
def get_score(occurrences, sentence):
    """Return the summed occurrence points of the words in ``sentence``.

    Each token that ``each_word`` visits contributes the value stored in
    ``occurrences`` under its lemma, or 0 when the lemma is absent.
    """
    points = []

    def collect(token):
        # Lemmas missing from the table contribute nothing.
        points.append(occurrences.get(lemma(token), 0))

    each_word(sentence, collect)

    # Should the score be divided by total words?
    return sum(points)
# Run the summarizer only when this file is executed as a script.
if __name__ == "__main__":
    main()