-
Notifications
You must be signed in to change notification settings - Fork 0
/
consvowOCP.py
106 lines (91 loc) · 3.84 KB
/
consvowOCP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
# Splits segments by OCP (see consvow-feat for more detailed splitting)
# MH20150502
import random, itertools, codecs, sys
class listset:
def __init__(self, text):
self.text = text
self.allsegments = {x for x in text if x != ' '}
bigrams = [x[0] + x[1] for x in zip(text, text[1:])]
self.bigramcounts = {}
for bi in bigrams:
if bi[0] != ' ' and bi[1] != ' ':
self.bigramcounts[bi] = self.bigramcounts.get(bi, 0) + 1
self.tree = {}
self.tree[''] = self.allsegments
self.tree['0'], self.tree['1'] = self._randsplitset(self.allsegments)
def split(self, set1):
self.tree[set1 + '0'] = self.tree[set1]
self.tree[set1 + '1'] = set()
#self.tree[set1 + '0'], self.tree[set1 + '1'] = self._randsplitset(self.tree[set1])
def _randomswap(self, n, set1, set2):
if len(set1) == 0:
one = set([])
else:
maxn = min(n, len(set1)-1)
numswap1 = random.randint(0, maxn)
one = set(random.sample(set1, numswap1))
if len(set2) == 0:
two = set([])
else:
maxn = min(n, len(set2)-1)
numswap2 = random.randint(0, maxn)
two = set(random.sample(set2, numswap2))
return (set1 - one) | two, (set2 - two) | one
def _score(self, set1, set2):
sc = 0
for bi in self.bigramcounts:
if (bi[0] in set1 and bi[1] in set2) or (bi[0] in set2 and bi[1] in set1):
sc += self.bigramcounts[bi]
return sc
def _randsplitset(self, s):
"""Splits a set randomly into two subsets."""
setlist = list(s)
random.shuffle(setlist)
R = random.randint(0,len(setlist))
return set(setlist[0:R]), set(setlist[R:])
def candv(words):
data = u' '.join(words)
ls = listset(data)
def genperms(x):
barelist = ["".join(seq) for n in xrange(1,x) for seq in itertools.product("01", repeat=n)]
return [(barelist[i], barelist[i+1]) for i in xrange(0, len(barelist),2)]
for sets in genperms(2):
if sets[0] in ls.tree and sets[1] in ls.tree:
zero = ls.tree[sets[0]]
one = ls.tree[sets[1]]
bestzero = zero
bestone = one
for iter in xrange(800):
bestscore = ls._score(zero, one)
for i in xrange(40): #ann was 150
rs = int(1+(1/float(iter + 1))*400) # Annealing schedule determines how many swaps to make
newzero, newone = ls._randomswap(rs, zero, one)
newscore = ls._score(newzero, newone)
if newscore > bestscore:
bestzero = newzero
bestone = newone
#print bestscore, newscore, newzero, newone, iter, rs
bestscore = newscore
zero = bestzero
one = bestone
if ls.tree[sets[0]] != zero and ls.tree[sets[0]] != one:
ls.tree[sets[0]] = zero
ls.tree[sets[1]] = one
if len(zero) > 1:
ls.split(sets[0])
#print "SPLITTING:", sets[0]
if len(one) > 1:
ls.split(sets[1])
#print "SPLITTING:", sets[1]
for par in genperms(8):
for p in par:
if p in ls.tree and p + '0' in ls.tree and p + '1' in ls.tree:
if ls.tree[p + '0'] == ls.tree[p] or ls.tree[p + '1'] == ls.tree[p]:
del ls.tree[p + '0']
del ls.tree[p + '1']
C = ls.tree['0']
V = ls.tree['1']
if len(C) < len(V):
C, V = V, C
return C, V