# markov_text.py
from notyourmotleycrew.timelinejs.models import Event
from notyourmotleycrew.timelinejs.models import Filterset
import nltk
import random
import codecs
# Filterset records looked up by title as soon as this module is imported.
# `filterset` selects the events that run() feeds into the model.
filterset = Filterset.objects.get(title="ful")
# NOTE(review): `thenteset` is not referenced anywhere in the visible part of
# this file -- confirm whether it is still needed.
thenteset = Filterset.objects.get(title="thentetest")
class MarkovModel:
    """Second-order (word-pair) Markov chain over tokenized text.

    ``data`` maps a pair of consecutive tokens to the list of every token
    observed directly after that pair. Repeats are kept, so choosing
    uniformly from a list samples followers in proportion to how often
    they occurred.
    """

    def __init__(self):
        # (token, next_token) -> [tokens seen right after that pair]
        self.data = {}

    def add_text(self, text):
        """Tokenize *text* with ``nltk.word_tokenize`` and record its trigrams.

        Every run of three consecutive tokens adds the third token to the
        follower list of the first two. A text with fewer than three tokens
        contributes nothing.
        """
        tokens = nltk.word_tokenize(text)
        for first, second, follower in zip(tokens, tokens[1:], tokens[2:]):
            self.data.setdefault((first, second), []).append(follower)

    def get_random_upper_case(self):
        """Return a random key whose first word begins with an uppercase letter.

        Raises:
            IndexError: if the model holds no such key (this includes an
                empty model).
        """
        # Filter first instead of retrying random picks: the choice stays
        # uniform over the eligible keys, and a model without any eligible
        # key fails fast instead of looping forever.
        # ``[:1]`` (rather than ``[0]``) tolerates an empty token.
        candidates = [pair for pair in self.data if pair[0][:1].isupper()]
        if not candidates:
            raise IndexError(
                "model has no key starting with an uppercase letter")
        return random.choice(candidates)

    def get_sentence(self, n):
        """Generate text by a random walk through the model.

        The walk starts at a random capitalised key and keeps appending
        follower tokens until more than *n* tokens have been appended AND
        the last appended token is one of ``.``, ``?`` or ``!``. Tokens
        (punctuation included) are joined with single spaces.

        When the current pair has no recorded follower, the walk jumps to a
        fresh random capitalised key; the words of that key are not added
        to the output, only the tokens drawn after it.

        NOTE: the loop ends only on a terminator token. If the walk can
        never reach one, this method does not return.

        Raises:
            IndexError: if the model has no key starting with an uppercase
                letter.
        """
        terminators = (".", "?", "!")
        pair = self.get_random_upper_case()
        words = [pair[0], pair[1]]
        appended = 0
        word = None
        while not (appended > n and word in terminators):
            if pair not in self.data:
                # Dead end: restart from a key that is known to exist.
                pair = self.get_random_upper_case()
            word = random.choice(self.data[pair])
            pair = (pair[1], word)
            appended += 1
            words.append(word)
        return u" ".join(words)
# Shared model instance that run() fills and then samples from.
model = MarkovModel()
# Destination file for the generated sentences.
fn = "/tmp/output.txt"
# Opened for writing (mode "w", UTF-8) as soon as the module is imported;
# nothing in the visible part of this file closes it.
fh = codecs.open(fn, "w", "utf-8")
def run():
    """Train the module-level model and write 1000 generated sentences.

    Feeds ``text_body`` of every ``Event`` matching the module-level
    ``filterset`` into ``model``, then writes 1000 sentences from
    ``model.get_sentence(25)`` to the module-level handle ``fh``, each
    followed by a blank line.

    Calling this again adds the same texts to the model a second time and
    appends another 1000 sentences to the same open handle.
    """
    # Build the model.
    for event in Event.objects.filter(filtersets=filterset):
        model.add_text(event.text_body)

    # Generate the samples.
    for _ in range(1000):
        fh.write(model.get_sentence(25))
        fh.write("\n\n")

    # fh is opened at import time and never closed in this file; flush so
    # the output on disk is complete once run() returns.
    fh.flush()