## Contents

* Preprocessing
* CRF
* Comparing Performance on Clean & Augmented Data

## I. Preprocessing

In [1]:
import re, json

### A. Load

In [6]:
# NOTE(review): absolute, machine-specific location. The trailing slash is
# required because load_homecity concatenates path + filename directly.
data_path = "/Users/jacobsw/Desktop/WORK/OJO_CODE/HOMECITY_CRF/"
# Training file; load_homecity parses each of its lines with json.loads.
filename = "20160812-tagged-conversations-training.jsons"

In [46]:
def load_homecity(path, filename):
    """Load a JSON-lines file into a list of parsed records.

    Args:
        path: Directory prefix. It is concatenated with `filename` as-is, so it
            must already end with a path separator.
        filename: Name of a file holding one JSON document per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError: if a non-blank line is not valid JSON.
    """
    data = []
    # `with` guarantees the handle is closed, even if a line fails to parse.
    with open(path + filename, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                data.append(json.loads(line))

    return data

def filter_by_received(data):
    """Flatten all conversations and keep the entries flagged as received.

    Args:
        data: Iterable of records, each with a 'conversation' list of entries.

    Returns:
        A list of every entry whose 'received' value is truthy, in the order
        the entries appear across the records.
    """
    return [
        message
        for record in data
        for message in record['conversation']
        if message['received']
    ]

# Matches one whitespace-delimited word; shared by find_words.
r = re.compile(r'\S+')
def find_words(s, start, end):
    """Return the indices of the words of `s` that overlap a character span.

    Words are maximal runs of non-whitespace characters, numbered from 0 in
    order of appearance. The span is treated as half-open, [start, end).

    Args:
        s: The sentence to scan.
        start: Character offset where the span begins.
        end: Character offset where the span ends (treated as exclusive).

    Returns:
        A list of word indices, in increasing order; empty if no word overlaps.
    """
    results = []
    for j, m in enumerate(r.finditer(s)):
        # m.end() is exclusive, so a word ending exactly at `start` lies wholly
        # before the span and must be skipped (hence <=, not <).
        if m.end() <= start:
            continue
        # Words are visited left to right; the first one starting at or after
        # `end` means no later word can overlap either.
        if m.start() >= end:
            break
        results.append(j)
    return results

def process_untagged(entry):
    """Tokenise an entry that carries no entities.

    Returns:
        (words, labels): the whitespace-split words of entry['body'] and a
        same-length list in which every label is 'O'.
    """
    tokens = entry['body'].split()
    return tokens, ['O' for _ in tokens]

def process_tagged(entry):
    """Tokenise a tagged entry and derive B-/I- labels from its entities.

    Only the first element of entry['entity_sets'] is consulted. For each of
    its entities, the words found by find_words for the entity's
    starting_char/ending_char are labelled: the first word gets
    'B-<entity>', any further words get 'I-<entity>'.

    Returns:
        (words, labels): the whitespace-split words of entry['body'] and one
        label per word ('O' where no entity applies).

    Raises:
        IndexError: if find_words returns no word for an entity.
    """
    body = entry['body']
    tokens = body.split()
    tags = ['O' for _ in tokens]

    for span in entry['entity_sets'][0]['entities']:
        hit = find_words(body, span['starting_char'], span['ending_char'])
        # First covered word opens the entity ...
        tags[hit[0]] = 'B-' + span['entity']
        # ... and every later covered word continues it.
        for idx in hit[1:]:
            tags[idx] = 'I-' + span['entity']

    return tokens, tags

In [47]:
# Parse every line of the training file into one record.
data = load_homecity(data_path, filename)

In [48]:
# Flatten all conversations, keeping only entries whose 'received' field is truthy.
entries = filter_by_received(data)

### B. Conversion

In [49]:
def to_crf_format(entries):
    """Convert entries into parallel lists of word sequences and label sequences.

    An entry with an empty 'entity_sets' list goes through process_untagged;
    any other entry goes through process_tagged. Entries that raise during
    conversion are reported on stdout and left out of the result.

    Args:
        entries: Iterable of entry dicts.

    Returns:
        (words_list, labels_list): two lists of equal length, one item per
        successfully converted entry, in input order.
    """
    words_list, labels_list = [], []
    for i, entry in enumerate(entries):
        try:
            if len(entry['entity_sets']) == 0:
                words, labels = process_untagged(entry)
            else:
                words, labels = process_tagged(entry)
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop the run instead of being silently skipped.
        except Exception as err:
            # Say which entry was dropped and why, not just its index.
            print("skipping entry %d: %r" % (i, err))
            continue
        words_list.append(words)
        labels_list.append(labels)

    return words_list, labels_list

def all_o(labels):
    """Return True when no label differs from 'O' (True for an empty sequence)."""
    for tag in labels:
        if tag != 'O':
            return False
    return True

def non_all_o_subset(X, Y):
    """Keep only the sequences that carry at least one non-'O' label.

    Args:
        X: List of word sequences.
        Y: List of label sequences, index-aligned with X.

    Returns:
        (X_sub, Y_sub): the items of X and Y at every position where the
        label sequence is not entirely 'O', in the original order.
    """
    # Positions are taken from X, so Y must be at least as long as X.
    keep = [idx for idx in range(len(X)) if not all_o(Y[idx])]
    return [X[idx] for idx in keep], [Y[idx] for idx in keep]

In [50]:
# Convert the received entries to (words, labels) sequences and keep only the
# sentences that contain at least one non-'O' label.
X, Y = non_all_o_subset(*to_crf_format(entries))

### C. Augmentation

In [51]:
from spacy.en import English

In [52]:
# Shared spaCy English pipeline; extract_info calls it once per sentence.
nlp = English()

def merge(x, y):
    """Map each item of `x` to the index of the first item of `y` it starts at.

    Alignment is done purely on cumulative character counts: for every word in
    `x`, tokens of `y` are consumed until their total length reaches the total
    length of the preceding words, and the next token index is recorded.

    Args:
        x: Sequence of coarse words.
        y: Sequence of finer-grained tokens covering the same characters.

    Returns:
        A list with one index into `y` per item of `x`.

    Raises:
        IndexError: if `y` runs out before the character counts line up.
    """
    first_token = []
    tok_idx = 0
    word_chars = 0   # characters covered by the words seen so far
    tok_chars = 0    # characters covered by the tokens consumed so far
    for word in x:
        # Consume tokens until they catch up with the preceding words.
        while tok_chars < word_chars:
            tok_chars += len(y[tok_idx])
            tok_idx += 1
        first_token.append(tok_idx)
        word_chars += len(word)

    return first_token

def filter_extra(x2y, fts):
    """Pick the entries of `fts` at the positions listed in `x2y`, in that order."""
    picked = []
    for token_idx in x2y:
        picked.append(fts[token_idx])
    return picked

def extract_info(words):
    """Annotate a sentence with spaCy and return one value per input word.

    The words are joined with single spaces, parsed with the module-level
    `nlp` pipeline, and each word is mapped (via merge) to the first spaCy
    token it starts at; that token's annotations are reported for the word.

    Args:
        words: A list or tuple of words, or a plain sentence string (which is
            split on whitespace first).

    Returns:
        A 4-tuple of lists, each with one item per word:
        (pos, ner, dep_rel, dep_head). `ner` holds 'none' where spaCy assigned
        no entity type; `dep_head` holds the text of the token's head.
    """
    # assuming nlp = spacy.English()

    # Previously `sent` was only bound for list input, so a tuple or a raw
    # string raised NameError. Normalise to a word sequence first.
    if not isinstance(words, (list, tuple)):
        words = words.split()
    sent = ' '.join(words)

    parsed = nlp(unicode(sent))
    tokens = [token.orth_ for token in parsed]
    # spaCy may split one word into several tokens; x2y[i] is the index of the
    # first token of word i.
    x2y = merge(words, tokens)

    pos = [token.pos_ for token in parsed]
    ner = ['none' if token.ent_type_=='' else token.ent_type_ for token in parsed]
    dep_rel = [token.dep_ for token in parsed]
    dep_head = [token.head.orth_ for token in parsed]

    return filter_extra(x2y,pos), \
           filter_extra(x2y,ner), \
           filter_extra(x2y,dep_rel), \
           filter_extra(x2y,dep_head)


In [53]:
# One (pos, ner, dep_rel, dep_head) tuple per sentence, index-aligned with X.
X_aug = [extract_info(X_i) for X_i in X]

## II. CRF

### A. Featurization

In [54]:
from sklearn.cross_validation import train_test_split

In [55]:
def featurize(words, labels=None):
    """Build window features (current word plus up to two neighbours each side).

    For every position the feature list holds the word itself, then for the
    offsets -1, -2, +1, +2 (in that order) either the neighbour's word and
    label features, or a boundary marker when the neighbour is out of range.

    Args:
        words: The words of one sentence.
        labels: Optional labels, index-aligned with `words`. When given, each
            in-range neighbour also contributes a '<offset>-L-<label>' feature.
            When None, label features are omitted, which allows featurizing
            text that has no gold labels.

    Returns:
        A list with one feature list (of strings) per word.

    NOTE(review): passing gold labels for evaluation data puts the answers of
    the neighbouring tokens into the features; pass labels=None to avoid that.
    """
    len_sent = len(words)
    # (offset, feature prefix, marker emitted when the neighbour is missing),
    # listed in the order the features are emitted.
    windows = [(-1, '-1-', 'BOS2'),
               (-2, '-2-', 'BOS1'),
               (+1, '+1-', 'EOS2'),
               (+2, '+2-', 'EOS1')]
    all_fts = []

    for i in range(len_sent):
        fts = [words[i]]
        for offset, prefix, boundary in windows:
            j = i + offset
            if 0 <= j < len_sent:
                fts.append(prefix + words[j])
                if labels is not None:
                    fts.append(prefix + 'L-' + labels[j])
            else:
                fts.append(boundary)
        all_fts.append(fts)

    return all_fts

def featurize_aug(words, augs, labels=None):
    """Build window features from words plus POS/NER/dependency annotations.

    For every position the feature list holds the word and its four
    annotations, then for the offsets -1, -2, +1, +2 (in that order) either
    the neighbour's word, label and annotation features, or a boundary marker
    when the neighbour is out of range.

    Args:
        words: The words of one sentence.
        augs: A 4-tuple (pos, ner, dep_rel, dep_head) of lists, each
            index-aligned with `words`.
        labels: Optional labels, index-aligned with `words`. When given, each
            in-range neighbour also contributes a '<offset>-L-<label>' feature.
            When None, label features are omitted, which allows featurizing
            text that has no gold labels.

    Returns:
        A list with one feature list (of strings) per word.

    NOTE(review): passing gold labels for evaluation data puts the answers of
    the neighbouring tokens into the features; pass labels=None to avoid that.
    """
    len_sent = len(words)
    pos, ner, dep_rel, dep_head = augs
    # (offset, feature prefix, marker emitted when the neighbour is missing),
    # listed in the order the features are emitted.
    windows = [(-1, '-1-', 'BOS2'),
               (-2, '-2-', 'BOS1'),
               (+1, '+1-', 'EOS2'),
               (+2, '+2-', 'EOS1')]
    all_fts = []

    for i in range(len_sent):
        fts = [words[i], pos[i], ner[i], dep_rel[i], dep_head[i]]
        for offset, prefix, boundary in windows:
            j = i + offset
            if 0 <= j < len_sent:
                fts.append(prefix + words[j])
                if labels is not None:
                    fts.append(prefix + 'L-' + labels[j])
                fts += [prefix + 'POS' + pos[j],
                        prefix + 'NER' + ner[j],
                        prefix + 'DR' + dep_rel[j],
                        prefix + 'DH' + dep_head[j]]
            else:
                fts.append(boundary)
        all_fts.append(fts)

    return all_fts

In [56]:
# Split words, spaCy annotations and labels in a single call so the three
# stay index-aligned; 15% is held out and the seed is fixed.
X_train, X_test, X_train_aug, X_test_aug, Y_train, Y_test = train_test_split(X, X_aug, Y, test_size=.15, random_state=0)

In [57]:
# Word-window features for the baseline ("clean") run.
# NOTE(review): featurize() emits the neighbouring tokens' labels as features,
# so the test features below are built from the gold Y_test labels.
X_train_fts = [featurize(words,labels) for words,labels in zip(X_train,Y_train)]
X_test_fts = [featurize(words,labels) for words,labels in zip(X_test,Y_test)]

In [58]:
# Same windows, extended with the spaCy POS / NER / dependency annotations.
# NOTE(review): featurize_aug() emits the neighbouring tokens' labels as
# features, so the test features below are built from the gold Y_test labels.
X_train_aug_fts = [featurize_aug(words,augs,labels) for words,augs,labels in zip(X_train,X_train_aug,Y_train)]
X_test_aug_fts = [featurize_aug(words,augs,labels) for words,augs,labels in zip(X_test,X_test_aug,Y_test)]

### B. CRF

In [59]:
from itertools import chain
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelBinarizer
import pycrfsuite

In [60]:
class CRF(object):
    """Trains a python-crfsuite model and prints an evaluation report."""

    def run(self, X_train_fts, X_test_fts, Y_train, Y_test,
            model_path='crf_homecity.crfsuite'):
        """Train on the training features, tag the test features, print metrics.

        Args:
            X_train_fts: Per-sentence lists of per-token feature lists.
            X_test_fts: Same structure, for the held-out sentences.
            Y_train: Per-sentence label sequences aligned with X_train_fts.
            Y_test: Per-sentence label sequences aligned with X_test_fts.
            model_path: File the trained model is written to and read back
                from. Defaults to the previously hard-coded name; pass a
                different path to keep the models of separate runs.

        Side effects:
            Writes the model file, prints progress, a per-class report and the
            overall accuracy, and stores the binarized gold and predicted tags
            and the fitted LabelBinarizer on self.y_t, self.y_p and self.lb.
        """
        print("... Training")
        crf = pycrfsuite.Trainer(verbose=0)
        for x, y in zip(X_train_fts, Y_train):
            crf.append(x, y)
        crf.set_params({'c1':1.,'c2':1e-3,'max_iterations':100,'feature.possible_transitions':True})
        crf.train(model_path)

        tagger = pycrfsuite.Tagger()
        tagger.open(model_path)
        try:
            print("... Evaluating")
            y_pred = [tagger.tag(fts) for fts in X_test_fts]
        finally:
            # Release the model handle once the predictions are in memory.
            tagger.close()

        # Flatten the sentences to token level and one-hot encode the tags.
        lb = LabelBinarizer()
        y_true_in_tags = lb.fit_transform(list(chain.from_iterable(Y_test)))
        y_pred_in_tags = lb.transform(list(chain.from_iterable(y_pred)))

        O_idx = lb.transform(['O']).argmax()
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
        # The report covers only tokens whose gold tag is not 'O'.
        not_o = y_true_in_tags.argmax(1) != O_idx
        print(classification_report(
            y_true_in_tags[not_o],
            y_pred_in_tags[not_o],
            labels = [class_indices[cls] for cls in lb.classes_],
            target_names = lb.classes_
        ))
        print("")
        # Accuracy, unlike the report, is computed over all tokens ('O' included).
        print("Accuracy: %.6f%%" % (accuracy_score(y_true_in_tags, y_pred_in_tags)*100))
        self.y_t = y_true_in_tags
        self.y_p = y_pred_in_tags
        self.lb = lb
    

## III. Comparing Performance on Clean & Augmented Data

In [61]:
# One instance is reused for both runs; each run() call overwrites
# crf.y_t, crf.y_p and crf.lb with that run's results.
crf = CRF()

### A. Clean Data

In [62]:
%%time
# Baseline: train and evaluate on the word/label window features only.
crf.run(X_train_fts, X_test_fts, Y_train, Y_test)

... Training
... Evaluating
                                 precision    recall  f1-score   support

                   B-ojo.status       1.00      0.33      0.50        75
      B-property_search.amenity       1.00      0.63      0.77        43
    B-property_search.bathrooms       1.00      0.50      0.67        16
     B-property_search.bedrooms       1.00      0.74      0.85        19
        B-property_search.floor       0.00      0.00      0.00         1
     B-property_search.location       0.91      0.52      0.66       149
     B-property_search.lot_size       1.00      0.57      0.73         7
        B-property_search.price       1.00      0.16      0.28        37
B-property_search.property_type       1.00      0.52      0.68        27
         B-property_search.sqft       1.00      0.20      0.33        10
  B-property_search.unit_floors       0.00      0.00      0.00         1
   B-property_search.year_built       0.00      0.00      0.00         1
                   I-o

### B. Augmented Data

In [63]:
%%time
# Same split and labels, with the spaCy-augmented features.
crf.run(X_train_aug_fts, X_test_aug_fts, Y_train, Y_test)

... Training
... Evaluating
                                 precision    recall  f1-score   support

                   B-ojo.status       1.00      0.33      0.50        75
      B-property_search.amenity       0.93      0.63      0.75        43
    B-property_search.bathrooms       1.00      0.69      0.81        16
     B-property_search.bedrooms       1.00      0.79      0.88        19
        B-property_search.floor       0.00      0.00      0.00         1
     B-property_search.location       0.92      0.60      0.72       149
     B-property_search.lot_size       1.00      0.57      0.73         7
        B-property_search.price       1.00      0.65      0.79        37
B-property_search.property_type       1.00      0.52      0.68        27
         B-property_search.sqft       1.00      0.20      0.33        10
  B-property_search.unit_floors       0.00      0.00      0.00         1
   B-property_search.year_built       0.00      0.00      0.00         1
                   I-o