In [2]:
from __future__ import division


import math
import os
import sys
import time

from config import CONFIG

from tests import tests1, tests2

from normalizer import Normalizer

from lib.CharacterIndex import CharacterIndex
from lib.NaiveTokenizer import NaiveTokenizer
from lib.TextStreamer import TextStreamer
from lib.CONLL14ErrorCorrection import CONLL14ErrorCorrection
from lib.Parser import PatternParser
from lib.Report import Report

from lib.Tools import (
    FreqDist,
    splitter,
    strip_punct,
    tokenizer
)

from collections import (
    Counter,
    defaultdict as deft
)


def timestamp():
    """Return the current local time as a dot-separated 'H.M.S' string.

    The hour, minute and second of ``time.localtime()`` are converted with
    ``str`` and joined with dots, so the fields are not zero-padded
    (e.g. ``'9.5.3'``).
    """
    now = time.localtime()
    fields = (now.tm_hour, now.tm_min, now.tm_sec)
    return '.'.join(map(str, fields))


def get_name(template):
    """Return the first path built from *template* that does not exist yet.

    *template* is a %-format string taking a string and an integer. It is
    filled with the current ``timestamp()`` and a counter that starts at 1
    and grows by one until ``os.path.exists`` reports the path as unused.
    """
    counter = 0
    while True:
        counter += 1
        candidate = template % (timestamp(), counter)
        if os.path.exists(candidate):
            # Taken already: try the next counter value.
            continue
        return candidate

# Part-of-speech tag labels (Penn Treebank-style); their order in this list
# determines the integer id assigned to each tag below.
PoS_l = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
    'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
    'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WP$', 'WRB',
]

# Tag -> 1-based position of that tag in PoS_l.
PoS = {tag: idx for idx, tag in enumerate(PoS_l, 1)}

# Block until the user presses Enter; the value read is discarded.
# NOTE(review): looks like a leftover debugging pause -- confirm before removing.
raw_input()

# Path of the text file that is streamed in the configuration loop below.
corpus = 'data/delorme.com_shu.pages_89.txt'

# One Report instance, shared by every configuration in CONFIG.
report = Report()

# Parser applied to each sentence taken from the corpus.
parser = PatternParser()

for C in CONFIG:
    # One evaluation pass per configuration: build the test cases, gather
    # vocabulary statistics from the corpus, index the vocabulary by
    # character n-grams, then score the best candidate for every test case.

    conll = CONLL14ErrorCorrection()

    # Every test is (left, error, right, correct, category, human). The last
    # field is True when the item carries an erroneous form (`err` is truthy)
    # and False otherwise, in which case the stripped correct form is used
    # as the probe word.
    tests = []
    for (left, err, right, corr, category), human in conll:
        if err:
            test = (left, strip_punct(err).lower(), right,
                    strip_punct(corr).lower(), category, True)
        else:
            test = (left, strip_punct(corr), right, err, category, False)
        tests.append(test)
    tests = tests[:10000]

    # Probe words; added to the vocabulary so each one has a frequency >= 1.
    targets = [test[1] for test in tests]

    # Collect input from large text file:
    dump = []
    train_conll = []
    for doc in TextStreamer(corpus, nb_sent=100):
        for sent in splitter(doc):
            parse = parser(sent)
            for unit in parse.split():
                # One (word, tag, word) triple per token of the parsed unit.
                input_sent = [(word_gr[0], word_gr[1], word_gr[0])
                              for word_gr in unit]
                # Appended inside the unit loop so that every parsed unit is
                # kept; appending after the loop would keep only the last
                # unit and fail when the parse yields no units at all.
                train_conll.append(input_sent)
            dump += [w.lower() for w in tokenizer(sent)]
    freq_dist = Counter(dump + targets)
    # Debug output of the first parsed unit; skipped when nothing was parsed.
    if train_conll:
        print(train_conll[0])

    # Map all character n-grams to words, and all words to their
    # character n-grams
    index = CharacterIndex(dump + targets, top_n=C['top_n'], min_r=C['sim_thres'])
    index.build()

    for left, error, right, correct, category, human in tests:

        if error == correct:
            continue

        report.add()

        # Keep at most ten words of context on each side for items with an
        # erroneous form; the other items are reported without context.
        if human:
            left = ' '.join(left.split()[-10:])
            right = ' '.join(right.split()[:10])
        else:
            left = ''
            right = ''

        # A candidate must occur at least 10 times and be at least 30 times
        # as frequent as the probe word (true division is enabled by the
        # __future__ import at the top of the file).
        error_freq = freq_dist[error]
        similars = [(w, sim) for w, sim in index[error]
                    if freq_dist[w] >= 10 and
                    freq_dist[w] / error_freq >= 30]

        if not similars:
            # Nothing proposed: a miss when there was an error to fix,
            # otherwise a correct rejection.
            if human:
                report.fn(left, error, right, correct, category)
            else:
                report.tn(left, error, right, correct, category)
            continue

        if not human:
            # Something was proposed although there was no erroneous form.
            report.fp(left, error, right, correct, category)
            continue

        # Rank the candidates by corpus frequency, most frequent first.
        similars.sort(
            key=lambda x: freq_dist[x[0]],
            reverse=True
        )

        # Only the single best-ranked candidate is compared with the answer.
        top = [w for w, _ in similars[:1]]
        if correct in top:
            report.tp(left, error, right, correct, category)
        else:
            report.fp(left, error, right, correct, category)

    # NOTE(review): presumably closes the results for this configuration --
    # confirm against lib.Report.
    report.lap(C)


# Path pattern for the log file; get_name fills in a timestamp and a counter.
template = 'logs/test-%s-%d'
# Call the Report instance with a path that does not exist yet.
# NOTE(review): presumably writes the collected results there -- confirm in lib.Report.
report(get_name(template))





87it [00:00, 151.64it/s]


[(u'A', u'DT', u'A'), (u'bus', u'NN', u'bus'), (u'looms', u'VBZ', u'looms'), (u'out', u'IN', u'out'), (u'of', u'IN', u'of'), (u'the', u'DT', u'the'), (u'grey', u'JJ', u'grey'), (u'and', u'CC', u'and'), (u'blizzardy', u'JJ', u'blizzardy'), (u'conditions', u'NNS', u'conditions'), (u'and', u'CC', u'and'), (u'we', u'PRP', u'we'), (u'get', u'VB', u'get'), (u'on', u'IN', u'on'), (u'gratefully', u'RB', u'gratefully'), (u'.', u'.', u'.')]
500.0 <tp=0  tn=469  fp=0  fn=30  total=500  prec=1.00  rec=0.00>
1000.0 <tp=0  tn=941  fp=1  fn=57  total=1000  prec=0.00  rec=0.00>
1500.0 <tp=0  tn=1396  fp=1  fn=102  total=1500  prec=0.00  rec=0.00>
2000.0 <tp=0  tn=1860  fp=1  fn=138  total=2000  prec=0.00  rec=0.00>
2500.0 <tp=0  tn=2324  fp=1  fn=174  total=2500  prec=0.00  rec=0.00>


KeyboardInterrupt: 

0it [00:00, ?it/s]

[[u'A', u'DT', u'B-NP', u'O', u'NP-SBJ-1', u'a'], [u'bus', u'NN', u'I-NP', u'O', u'NP-SBJ-1', u'bus'], [u'looms', u'VBZ', u'B-VP', u'O', u'VP-1', u'loom'], [u'out', u'IN', u'B-PP', u'B-PNP', u'O', u'out'], [u'of', u'IN', u'I-PP', u'I-PNP', u'O', u'of'], [u'the', u'DT', u'B-NP', u'I-PNP', u'O', u'the'], [u'grey', u'JJ', u'I-NP', u'I-PNP', u'O', u'grey'], [u'and', u'CC', u'I-NP', u'I-PNP', u'O', u'and'], [u'blizzardy', u'JJ', u'I-NP', u'I-PNP', u'O', u'blizzardy'], [u'conditions', u'NNS', u'I-NP', u'I-PNP', u'O', u'condition'], [u'and', u'CC', u'O', u'O', u'O', u'and'], [u'we', u'PRP', u'B-NP', u'O', u'NP-SBJ-2', u'we'], [u'get', u'VB', u'B-VP', u'O', u'VP-2', u'get'], [u'on', u'IN', u'B-PP', u'O', u'O', u'on'], [u'gratefully', u'RB', u'B-ADVP', u'O', u'O', u'gratefully'], [u'.', u'.', u'O', u'O', u'O', u'.']] $ 

(u'A', u'DT', u'A')
(u'bus', u'NN', u'bus')
(u'looms', u'VBZ', u'looms')
(u'out', u'IN', u'out')
(u'of', u'IN', u'of')
(u'the', u'DT', u'the')
(u'grey', u'JJ', u'grey')
(u'and'