In [None]:
# Install the notebook's third-party dependencies.
# `%pip` (instead of `!pip`) runs pip inside the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install spacy==3.0.0
# NOTE(review): sklearn_crfsuite and pytorch-crf are unpinned — consider pinning for reproducibility.
%pip install sklearn_crfsuite
# NOTE(review): scikit-learn is pinned, presumably for sklearn_crfsuite compatibility — confirm before upgrading.
%pip install scikit-learn==0.23.2
%pip install pytorch-crf

In [None]:
# Download the MIT restaurant BIO training file into the working directory.
# -nc (no-clobber) skips the download when the file already exists, so re-running
# this cell does not leave a stray `restauranttrain.bio.1` copy behind.
!wget -nc https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttrain.bio

In [None]:
# import main libraries
from IPython.display import Image
from IPython.core.display import display, HTML
import pandas as pd
from collections import Counter
import random
from spacy import displacy
import json
import os
from spacy.training import offsets_to_biluo_tags
import spacy
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import f1_score

In [None]:
# Preview the first 30 lines of the raw corpus.
# The file is streamed (no readlines()) so only the previewed lines are read, and
# end='' avoids doubling the newline that every line already carries.
with open('restauranttrain.bio', 'r') as f:
    for line_no, line in enumerate(f):
        if line_no >= 30:
            break
        print(line, end='')

In [None]:
# Reverse the order of the tab-separated fields on every non-blank line and write the
# result to restauranttrain_updated.bio. Blank lines (sentence separators) are copied as-is.
# The parsing cell further down reads column 0 as the token and column 1 as the label.
with open('restauranttrain.bio', 'r') as src, open('restauranttrain_updated.bio', 'w') as dst:
    for raw_line in src:
        if raw_line == '\n':
            dst.write(raw_line)
            continue
        columns = raw_line.strip().split('\t')
        columns.reverse()
        dst.write('\t'.join(columns) + '\n')

In [None]:
# Split the raw corpus into train / validation files, swapping the columns on the way.
# `count` tracks how many blank separator lines (i.e. finished sentences) have been written
# to the train file; once it reaches 6500 every remaining line goes to the validation file.
count = 0
with open('restauranttrain.bio', 'r') as f, \
     open('restauranttrain_updated_train.bio', 'w') as w1, \
     open('restauranttrain_updated_valid.bio', 'w') as w2:
    for line in f:
        # Pick the destination before `count` is updated for this line.
        target = w1 if count < 6500 else w2
        is_separator = line == '\n'
        if is_separator:
            target.write(line)
        else:
            target.write('\t'.join(reversed(line.strip().split('\t'))) + '\n')
        if is_separator and target is w1:
            count += 1

In [None]:
# Parse the column-swapped corpus into two parallel lists of lists:
#   data[k]   -> tokens of sentence k (column 0 of each line)
#   labels[k] -> tags of sentence k   (column 1 of each line)
# A blank line closes the current sentence.
data = []
labels = []
with open('restauranttrain_updated.bio', 'r') as f:
    cur_data = []
    cur_label = []
    for line in f:
        if line == '\n':
            data.append(cur_data)
            labels.append(cur_label)
            cur_data = []
            cur_label = []
        else:
            # Split once and reuse both columns.
            fields = line.strip().split('\t')
            cur_data.append(fields[0])
            cur_label.append(fields[1])
    # Flush the trailing sentence: without this, the last sentence is silently
    # dropped whenever the file does not end with a blank line.
    if cur_data:
        data.append(cur_data)
        labels.append(cur_label)

In [None]:
# Show the first sentence as tab-separated token / label pairs.
first_tokens, first_tags = data[0], labels[0]
for token, tag in zip(first_tokens, first_tags):
    print(token, tag, sep='\t')


In [None]:
# Rule-based baseline for the first sentence: all-digit tokens are tagged 'B-Rating',
# everything else is tagged 'O'.
predictions = ['B-Rating' if token.isdigit() else 'O' for token in data[0]]

In [None]:
# Token-level weighted F1 of the digit-rule baseline, computed on the first sentence only.
f1_score(y_true=labels[0], y_pred=predictions, average='weighted')

In [None]:
# Flatten the corpus into one row per token:
#   sent_id  -> index of the sentence the token belongs to
#   data     -> the token text
#   entities -> the token's label
df = pd.DataFrame({
    'sent_id': [sent_idx for sent_idx, sent in enumerate(data) for _ in sent],
    'data': [token for sent in data for token in sent],
    'entities': [tag for sent_tags in labels for tag in sent_tags],
})
df.head(10)

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of ``(token, label)`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token, with the columns ``sent_id`` (sentence key),
        ``data`` (token text) and ``entities`` (token label).

    Attributes
    ----------
    n_sent : int
        1-based number of the sentence that ``get_next`` will return next.
    data : pandas.DataFrame
        The frame passed to the constructor.
    empty : bool
        Always set to ``False``; not read anywhere in this class.
    grouped : pandas.Series
        One list of ``(token, label)`` tuples per ``sent_id``.
    sentences : list
        The values of ``grouped`` as a plain list, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['data'].values.tolist(),
                                                           s['entities'].values.tolist())]
        self.grouped = self.data.groupby('sent_id').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` once every sentence has been returned.

        The previous implementation looked up the key ``'Sentence: N'``, which never
        exists in a frame grouped by ``sent_id``, and hid the resulting error behind a
        bare ``except`` -- so it always returned ``None``. Sentences are now served by
        position from ``self.sentences``.
        """
        position = self.n_sent - 1
        if not 0 <= position < len(self.sentences):
            return None
        self.n_sent += 1
        return self.sentences[position]

# Group the token table by sentence; each entry of `sentences` is a list of
# (token, label) tuples for one sentence.
getter = SentenceGetter(data=df)
sentences = getter.sentences

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        Sentence as a sequence of items whose element 0 is the token string
        (e.g. ``(token, label)`` tuples). Only element 0 is read, so items
        without a label are accepted too.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Features of the token itself (lower-cased form, 3- and 2-character
        suffixes, case and digit flags, a constant bias), the lower-cased form and
        case flags of the previous / next token when they exist, and the markers
        ``'BOS'`` / ``'EOS'`` when the token is first / last in the sentence.
    """
    # The label stored in sent[i][1] is deliberately not read: it is the prediction
    # target, and the previous unused lookup made unlabeled input fail.
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper()
        })
    else:
        # First token: mark beginning of sentence.
        features['BOS'] = True
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper()
        })
    else:
        # Last token: mark end of sentence.
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the labels (second element) of the ``(token, label)`` pairs in ``sent``."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the tokens (first element) of the ``(token, label)`` pairs in ``sent``."""
    words = []
    for word, _tag in sent:
        words.append(word)
    return words

In [None]:
# Per-sentence feature dicts (X) and label sequences (y), aligned by sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [None]:
# Sequential hold-out (no shuffling): the first 7000 sentences train the CRF,
# the remainder is the test set.
# NOTE(review): the spaCy train/validation files are cut at 6500 sentences instead,
# so the two models are not scored on the same hold-out — confirm this is intended.
TRAIN_SIZE = 7000
X_train, X_test = X[:TRAIN_SIZE], X[TRAIN_SIZE:]
y_train, y_test = y[:TRAIN_SIZE], y[TRAIN_SIZE:]

In [None]:
%%time
# Fit the CRF tagger on the training sentences; %%time reports how long the cell takes.
# Per the sklearn-crfsuite docs, c1 / c2 are the L1 / L2 regularization coefficients.
crf_settings = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
    'verbose': True,
}
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

In [None]:
# Alphabetically sorted list of every distinct label in the corpus (including 'O').
all_entities = sorted(set(df['entities']))

In [None]:
# Weighted token-level F1 on the test set over every label, 'O' included.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    average='weighted',
    labels=all_entities,
)

In [None]:
# Weighted token-level F1 on the test set over the entity labels only: 'O' is excluded
# from the averaged labels.
y_pred = crf.predict(X_test)
entity_labels = [label for label in all_entities if label != 'O']
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=entity_labels)

In [None]:
# Per-label precision / recall / F1 table for the CRF on the test set.
report = metrics.flat_classification_report(y_test, y_pred, labels=all_entities)
print(report)

In [None]:
# Generate a starter spaCy training config (base_config.cfg) for a pipeline with the `ner` component.
!python -m spacy init config base_config.cfg -p ner

In [None]:
# Fill the settings missing from base_config.cfg with spaCy's defaults and write the result to config.cfg,
# which is the file the training command below consumes.
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# Convert the full column-swapped corpus with spaCy's `ner` converter, writing JSON into the current directory.
# NOTE(review): no later cell visible in this notebook reads the resulting JSON file — confirm it is still needed.
!python -m spacy convert restauranttrain_updated.bio . -t json -c ner

In [None]:
# Convert the training split with the `ner` converter into the current directory.
# The training command below reads the output as restauranttrain_updated_train.spacy.
!python -m spacy convert restauranttrain_updated_train.bio . -c ner

In [None]:
# Convert the validation split with the `ner` converter into the current directory.
# The training and evaluation commands below read the output as restauranttrain_updated_valid.spacy.
!python -m spacy convert restauranttrain_updated_valid.bio . -c ner

In [None]:
# Train the spaCy pipeline described by config.cfg, using the converted train split for training and the
# converted validation split as the dev set; models are written under ./output.
!python -m spacy train config.cfg --output ./output --paths.train restauranttrain_updated_train.spacy --paths.dev restauranttrain_updated_valid.spacy

In [None]:
# Score the trained pipeline on the validation split.
# NOTE(review): this evaluates output/model-last, while the visualization cell loads output/model-best —
# confirm which checkpoint is meant to be reported.
!python -m spacy evaluate output/model-last restauranttrain_updated_valid.spacy

In [None]:
# Load the best checkpoint written by training and render its predicted entities
# for sentences 10..19 of the corpus. Each sentence is rebuilt by joining its tokens
# with single spaces before being passed through the pipeline.
nlp = spacy.load("output/model-best")
for sent_idx in range(10, 20):
    sentence_text = ' '.join(data[sent_idx])
    doc = nlp(sentence_text)
    displacy.render(doc, style="ent", jupyter=True)