In [None]:
import spacy
import json
import random
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import model_selection
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns
import luima_sbd.sbd_utils as sbd
import os
import fasttext
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import joblib

import seaborn as sns
%matplotlib inline

# Show all outputs: have IPython display the value of every top-level
# expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set_style("whitegrid")
def fig_prop():
    """Open a new 10x7 figure and request 'plain' tick-label formatting on both axes."""
    plt.figure(figsize=(10, 7))
    # Same order as before: y axis first, then x axis.
    for axis_name in ('y', 'x'):
        plt.ticklabel_format(style='plain', axis=axis_name)
    
# Set the default matplotlib font size to 13 for every figure in this notebook.
import matplotlib
matplotlib.rcParams.update({'font.size': 13})

# SETUP

In [None]:

# Pretrained artifacts, loaded from paths relative to the notebook's working directory.
embedding_model = fasttext.load_model("result_model.bin")
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load classifier files that come from a trusted source.
best_model = joblib.load('RSVM_best_model.joblib')

# Mean and standard deviation used to z-score per-sentence token counts in
# make_feature_vectors_and_labels (presumably measured on the training set --
# TODO confirm where these numbers came from).
train_tokens_mean = 20.592041800643088
train_tokens_std = 15.221984070624842

nlp = spacy.load("en_core_web_sm")

# Citation abbreviations registered as tokenizer special cases. The same list is
# consulted by spacy_tokenize, so the registrations are driven from it to keep
# the two in sync.
special_cases = ['Vet. App.','Fed. Cir.']
for special_case in special_cases:
    nlp.tokenizer.add_special_case(special_case, [{ORTH: special_case}])

def spacy_tokenize(txt):
    """Turn raw text into a list of normalized token strings.

    The text is run through the module-level ``nlp`` pipeline and every token
    is handled by the first matching rule:

    1. If the *next* token is tagged ``PART`` and contains an apostrophe, the
       current token's text and the next token's text are glued together,
       stripped of non-word characters and lower-cased.
    2. ``PUNCT`` tokens are dropped.
    3. Tokens whose text is listed in ``special_cases`` keep their lower-cased
       lemma (punctuation included).
    4. Tokens that do not start with a letter and are neither all digits nor
       all upper-case are dropped.
    5. ``NUM`` tokens become ``<NUMk>`` where ``k`` is ``len(token)``.
    6. Everything else becomes its lemma with non-word characters removed,
       lower-cased.

    Returns the list of resulting strings in document order.
    """
    doc_tokens = list(nlp(txt))
    final_index = len(doc_tokens) - 1
    cleaned = []
    for position, t in enumerate(doc_tokens):
        upcoming = doc_tokens[position + 1] if position != final_index else None

        if (upcoming is not None and upcoming.pos_ == 'PART'
                and re.search(r'\'', upcoming.text)):
            # NOTE(review): the clitic token is not skipped afterwards; it is
            # visited again on the next pass and may emit a token of its own
            # through the rules below. This looks unintended, but changing it
            # would alter the tokens and counts fed to the persisted models --
            # confirm against the training code before touching it.
            glued = re.sub(r'\W', '', t.text + upcoming.text)
            cleaned.append(glued.lower())
            continue

        if t.pos_ == 'PUNCT':
            continue

        if t.text in special_cases:
            cleaned.append(t.lemma_.lower())
            continue

        if not (t.text[0].isalpha() or t.is_digit or t.is_upper):
            continue

        if t.pos_ == 'NUM':
            cleaned.append(f'<NUM{len(t)}>')
            continue

        cleaned.append(re.sub(r'\W', '', t.lemma_).lower())
    return cleaned


def spans_add_spacy_tokens(spans):
    """Annotate each span dict in place with its tokens and token count.

    For every span, ``span['txt']`` is passed to ``spacy_tokenize``; the result
    is stored under ``'tokens_spacy'`` and its length under ``'tokens_count'``.
    Nothing is returned.
    """
    for span in spans:
        cleaned_tokens = spacy_tokenize(span['txt'])
        span['tokens_spacy'] = cleaned_tokens
        span['tokens_count'] = len(cleaned_tokens)
        
def word_vector_spans(spans):
    """Attach an averaged word embedding to each span dict, in place.

    For every span the vectors returned by ``embedding_model.get_word_vector``
    for the entries of ``span['tokens_spacy']`` are summed and divided by
    ``span['tokens_count']``; the result is stored under ``'average_vec'``.
    Spans whose ``tokens_count`` is 0 receive a zero vector of length 100.
    Nothing is returned.

    NOTE(review): the length 100 is hard-coded and assumes the embedding
    model produces 100-dimensional vectors -- TODO confirm.
    """
    for span in spans:
        token_total = span['tokens_count']
        if token_total == 0:
            span['average_vec'] = np.zeros(100,)
            continue

        running_sum = np.zeros(100,)
        for token in span['tokens_spacy']:
            running_sum = running_sum + embedding_model.get_word_vector(token)
        span['average_vec'] = running_sum / token_total

            


# ANALYZE

In [None]:
#vectorizer = joblib.load('spacy_vectorizer.joblib')

def make_spans_data(sentences, offsets, counts):
    """Build one span dict per sentence.

    Each dict holds the sentence under ``'txt'`` and, under
    ``'start_normalized'``, the first element of the matching ``offsets``
    entry divided by ``counts``. ``offsets`` is indexed by sentence position,
    so it must have at least as many entries as ``sentences``.

    Returns the list of span dicts in sentence order.
    """
    return [
        {
            'txt': sentence,
            'start_normalized': offsets[position][0] / counts,
        }
        for position, sentence in enumerate(sentences)
    ]

def make_feature_vectors_and_labels(spans):
    """Assemble the feature matrix for a list of span dicts.

    Each row is the span's ``'average_vec'`` followed by two extra columns:
    ``'start_normalized'`` and the span's ``'tokens_count'`` standardized with
    the module-level ``train_tokens_mean`` / ``train_tokens_std``.

    Despite the name, only the feature matrix is returned; no labels are built.
    """
    start_column = np.array([span['start_normalized'] for span in spans])[:, np.newaxis]

    length_column = np.array(
        [(span['tokens_count'] - train_tokens_mean) / train_tokens_std for span in spans]
    )[:, np.newaxis]

    embedding_block = np.array([span['average_vec'] for span in spans])
    return np.concatenate((embedding_block, start_column, length_column), axis=1)


def analyze(text):
    """Split ``text`` into sentences and classify each one with ``best_model``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        One ``[sentence, prediction]`` pair per sentence returned by
        ``sbd.text2sentences``, in order. An empty list is returned when
        ``text`` is empty or no sentences are found.
    """
    if not text:
        return []

    # The segmenter is queried twice: once for the sentence strings and once
    # for their offsets.
    sentences = sbd.text2sentences(text, offsets=False)
    offsets = sbd.text2sentences(text, offsets=True)
    if len(sentences) == 0:
        # make_feature_vectors_and_labels cannot build a matrix from zero
        # spans, so stop here instead of raising further down.
        return []

    # Sentence start positions are normalized by the full text length.
    spans_data = make_spans_data(sentences, offsets, len(text))
    spans_add_spacy_tokens(spans_data)
    word_vector_spans(spans_data)
    features = make_feature_vectors_and_labels(spans_data)

    predictions = best_model.predict(features)
    return [[sentence, label] for sentence, label in zip(sentences, predictions)]

In [None]:
# analyze() requires the document text as its only argument; calling it with
# no argument raises TypeError. Placeholder input -- replace with the text
# that should be classified.
sample_text = (
    "The Veteran served on active duty from June 1968 to May 1970. "
    "Entitlement to service connection for tinnitus is granted."
)
analyze(sample_text)