In [5]:
from __future__ import division


import math
import os
import sys
import time

from config import CONFIG

from tests import tests1, tests2

from normalizer import Normalizer

from lib.CharacterIndex import CharacterIndex
from lib.NaiveTokenizer import NaiveTokenizer
from lib.TextStreamer import TextStreamer
from lib.CONLL14ErrorCorrection import CONLL14ErrorCorrection
from lib.Parser import PatternParser
from lib.Report import Report
from lib.DistributionalModel import NgramModel

from lib.Tools import (
    FreqDist,
    splitter,
    strip_punct,
    tokenizer
)

from collections import (
    Counter,
    defaultdict as deft
)


def timestamp():
    """Return the current local wall-clock time as ``'H.M.S'``.

    The hour, minute and second fields of ``time.localtime()`` are rendered
    as plain integers (no zero padding) and joined with dots, e.g. ``'9.5.42'``.
    """
    hour_min_sec = time.localtime()[3:6]
    return '.'.join(map(str, hour_min_sec))


def get_name(template):
    """Return the first path built from `template` that does not exist yet.

    `template` must contain two ``%``-placeholders: the first receives the
    current `timestamp()` string, the second an integer counter starting at 1.
    The counter is increased (and the timestamp re-read) until the resulting
    path is not present on disk.
    """
    counter = 1
    path = template % (timestamp(), counter)
    while os.path.exists(path):
        counter += 1
        path = template % (timestamp(), counter)
    return path

# Part-of-speech tags (the Penn Treebank word-level tag set), in a fixed order.
PoS_l = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT',
         'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP',
         'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

# Map every tag to its 1-based position in PoS_l.
# Built with a comprehension so no helper counter/loop variable is left behind
# in the notebook namespace (`i` is reused as a loop index further down).
PoS = {tag: rank for rank, tag in enumerate(PoS_l, 1)}


# N-gram specifications handed to NgramModel for the word-level model.
# NOTE(review): each entry looks like (n-gram order, skip-gram flag) -- confirm
# against lib.DistributionalModel.NgramModel; the commented-out entries are the
# variants with the second field set to True.
WORD_GRAMS = [
    (1, False),
    (2, False),
    (3, False),
#     (3, True),
#     (4, True)
]

# Same specification format, used for the model trained on POS-tag sequences.
POS_GRAMS = [
    (1, False),
    (2, False),
    (3, False),
#     (3, True),
#     (4, True)
]



# Training corpora (paths relative to the notebook's working directory).
# Only corpus1 is streamed below; the others appear only in commented-out code.
corpus1 = 'data/delorme.com_shu.pages_89.txt'
corpus2 = 'data/delorme.com_shu.pages_102.txt'
corpus3 = 'data/delorme.com_shu.pages_120.txt'
corpus4 = 'data/utexas_iit.pages_12.txt'

# Collects the tp/tn/fp/fn outcomes recorded by the evaluation cell.
report = Report()

# Called on raw sentences; the result's .split()[0] yields per-token items
# whose element [1] is used as the POS tag.
parser = PatternParser()

# One model over lower-cased word tokens, one over POS-tag sequences.
model = NgramModel(WORD_GRAMS)
model_pos = NgramModel(POS_GRAMS)

# Active configuration; later cells read C['top_n'] and pass C to report.lap().
C = CONFIG[1]



In [9]:
 #for C in CONFIG:
    
#     tests = tests1.items() + tests2.items()

conll = CONLL14ErrorCorrection()

# Convert every item yielded by the CoNLL-14 reader into a 6-tuple
#     (left, candidate, right, gold, category, is_error)
# * annotated error  : candidate = erroneous form, gold = its correction,
#                      both stripped of punctuation and lower-cased;
# * no error present : candidate = the (punctuation-stripped) original form,
#                      gold = the empty/falsy `err` value.
tests = []
for (left, err, right, corr, category), human in conll:
    is_error = bool(err)
    if is_error:
        surface = strip_punct(err).lower()
        gold = strip_punct(corr).lower()
    else:
        # NOTE(review): unlike the error branch this form is not lower-cased,
        # while the corpus tokens below are -- confirm this is intended.
        surface = strip_punct(corr)
        gold = err
    tests.append((left, surface, right, gold, category, is_error))
# Keep at most the first 30000 test items.
del tests[30000:]

# The surface forms under test (second field of every test tuple).
targets = [entry[1] for entry in tests]


# Stream the training corpus sentence by sentence, feeding the word model
# and the POS model, and remember every token for the frequency table.
dump = []
streamers = [
    TextStreamer(corpus1, nb_sent=200000),
    # Further corpora (corpus2, corpus3, corpus4) can be appended here with
    # the same TextStreamer(<path>, nb_sent=200000) call.
]
for stream in streamers:
    for document in stream:
        for sentence in splitter(document):
            parsed = parser(sentence)
            words = [token.lower() for token in tokenizer(sentence)]
            # Element [1] of each parsed item is taken as the POS tag.
            tags = [item[1] for item in parsed.split()[0]]
            dump.extend(words)

            # '#' marks the sentence boundary on both sides.
            model.update(['#'] + words + ['#'])
            model_pos.update(['#'] + tags + ['#'])

# Token frequencies over corpus tokens plus test targets, so that every
# target has a count of at least 1.
freq_dist = Counter(dump)
freq_dist.update(targets)

200000it [16:40, 199.80it/s]


In [10]:
    #	Map all character n-grams to words, and all words to their
    #	character n-grams
    #	index = CharacterIndex(dump + targets, top_n=C['top_n'], min_r=C['sim_thres'])
    index = CharacterIndex(dump + targets, top_n=C['top_n'], min_r=0.9)
    index.build()

    tests = [t for t in tests]
    for i, (left, candidate, right, correct, category, is_candidate) in enumerate(tests):
        
        if candidate == correct:
            continue
#         elif is_candidate and (category != 'Mec'):
#             continue
#        if candidate in ['diagonosed','firghtenning','concurently']:
        report.add()
        if is_candidate and ((not correct or len(correct.split()) > 1)
        or category not in ['Mec']):
#         or category not in ['Mec', 'Nn', 'Wform']):
            report.fn(left, candidate, right, correct, category)
            continue

#         similars = index(candidate, n=5)
        similars = [(w, sim) for w, sim in index(candidate)
                    if freq_dist[w] >= 7 and
                    freq_dist[w] / freq_dist[candidate] >= 10]

        if not similars and not is_candidate:
            report.tn(left, candidate, right, correct, category)
            continue
        elif not similars:
            report.fn(left, candidate, right, correct, category)
            continue
        elif similars and not is_candidate:
            report.fp(left, candidate, right, correct, category)
            continue

#         similars.sort(
#             key=lambda x: freq_dist[x[0]],
#             reverse=True
#         )
#         top = [w for w, sim in similars[:1]]

        corrections = []
        for sim, _ in similars + [(candidate, None)]:
            left = [e for _, e, _, _, _, _ in tests[i - 3:i]] + [sim]
            right = [sim] + [e for _, e, _, _, _, _ in tests[i + 1:i + 4]]

            pos_context_left = ' '.join([e for _, e, _, _, _, _ in tests[i - 3:i]] 
                                   + [sim])
            pos_context_right = ' '.join([sim]
                                   + [e for _, e, _, _, _, _ in tests[i + 1:i + 4]])

            parse_pos_left = parser(pos_context_left)
            parse_pos_right = parser(pos_context_right)

#            print 'pos_context_left', pos_context_left, '\n'
#            print 'parse_pos_left', parse_pos_left, '\n'
#            print 'pos_context_left.split()', pos_context_left.split()
#            raw_input()

            left_pos = [e_pos[1] for e_pos in parse_pos_left.split()[0]]
            right_pos = [e_pos[1] for e_pos in parse_pos_right.split()[0]]

            pleft = model(left)
            pright = model(right)

            pleft_pos = model_pos(left_pos)
            pright_pos = model_pos(right_pos)

            score = abs(pleft - pright)
            score_pos = abs(pleft_pos - pright_pos)

            print left, pleft
            print left_pos, pleft_pos
            print right, pright
            print right_pos, pright_pos

            corrections.append((score * max([pleft, pright]), score_pos * max([pleft_pos, pright_pos]), sim))


        #           corrections.append((score, sim))

        baseline = [[sim_w, sim_pos, w] for sim_w, sim_pos, w in corrections if w == candidate][0]
        #top = [w for sim_w, sim_pos, w in corrections if w == candidate][0]
        #         print [(freq_dist[w] / freq_dist[candidate], w) for sim, w in corrections[:1]
        #                    if freq_dist[w] / freq_dist[candidate] >= 2]
        #         print [w for sim, w in corrections[:1]
        #                    if (baseline and sim / baseline >= 2)
        #                    or not baseline]
        #         print

        #print corrections

        #hypothesis = [sim_w, sim_pos, w in corrections[:1] if w != candidate]
        #        top_w = [w for sim_w, sim_pos, w in corrections[:1] if w != candidate]
        #        top_sim_w = [sim_w for sim_w, sim_pos, w in corrections[:1] if w != candidate]
        #        top_sim_pos = [sim_pos for sim_w, sim_pos, w in corrections[:1] if w != candidate]

        is_corrected = False

#        print corrections
        for hyp in corrections:
            if hyp[0] >= baseline[0] and hyp[1] >= baseline[1]:
#                print hyp
                baseline = hyp
                is_corrected = True

        top = baseline[2]
#        print 'TOP: %s' % top
#        print

        if not is_corrected and is_candidate:
            report.fn(left, candidate, right, correct, category)
        elif not is_corrected and not is_candidate:
            report.tn(left, candidate, right, correct, category)
        elif is_candidate and correct == top:
            report.tp(left, candidate, right, correct, category)
        elif is_corrected and not is_candidate:
            report.fp(left, candidate, right, correct, category)

    report.lap(C)
    
    template = 'logs/test-%s-%d'
    report(get_name(template))


[u'when', u'we', u'are', u'diagnosed'] 1.66219279359e-19
[u'WRB', u'PRP', u'VBP', u'VBN'] 1.09969106154e-10
[u'diagnosed', u'with', u'certain', u'genetic'] 8.01018970643e-24
[u'VBN', u'IN', u'JJ', u'JJ'] 6.34535389069e-10
[u'when', u'we', u'are', u'diagonosed'] 0.0
[u'WRB', u'PRP', u'VBP', u'VBN'] 1.09969106154e-10
[u'diagonosed', u'with', u'certain', u'genetic'] 0.0
[u'VBN', u'IN', u'JJ', u'JJ'] 6.34535389069e-10
[u'can', u'going', u'through', u'theses'] 2.04623324675e-18
[u'MD', u'VBG', u'IN', u'NNS'] 1.09923589609e-17
[u'theses', u'process', u'from', u'the'] 1.49069506003e-19
[u'NNS', u'NN', u'IN', u'DT'] 5.28550311927e-10
[u'can', u'going', u'through', u'thses'] 0.0
[u'MD', u'VBG', u'IN', u'NNS'] 1.09923589609e-17
[u'thses', u'process', u'from', u'the'] 0.0
[u'NNS', u'NN', u'IN', u'DT'] 5.28550311927e-10
500.0 <tp=1  tn=468  fp=1  fn=28  total=500  prec=50  rec=3>
[u'he', u'or', u'she', u'concurrently'] 5.32895099345e-17
[u'PRP', u'CC', u'PRP', u'RB'] 3.73809624526e-13
[u'concurren