In [None]:
import spacy
import json
import random
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import model_selection
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns
import luima_sbd.sbd_utils as sbd
import os
import fasttext
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import joblib

In [None]:
import seaborn as sns
%matplotlib inline

# Show the value of every top-level expression statement in a cell,
# not just the last one (IPython's ast_node_interactivity = "all").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# seaborn theme used by every plot in this notebook
sns.set_style("whitegrid")
def fig_prop():
    """Open a new 10x7 inch figure and use plain (non-scientific) tick labels on both axes."""
    plt.figure(figsize=(10, 7))
    # Same two calls as before: y axis first, then x axis.
    for axis_name in ('y', 'x'):
        plt.ticklabel_format(style='plain', axis=axis_name)
    
# Default font size applied to all matplotlib figures created afterwards.
import matplotlib
matplotlib.rcParams.update({'font.size': 13})

In [None]:
# Seed every global RNG the notebook draws from:
#  - `random` drives the random.sample() calls that pick the dev/test documents,
#  - NumPy's global generator is what pandas' DataFrame.sample() falls back to
#    when no random_state is passed.
random.seed(42)
np.random.seed(42)

In [None]:
#Code from LDSI_Classifier_Workshop
def plot_confusion_matrix(y_true, y_pred, classes,
                          title=None,
                          cmap=plt.cm.Blues):
    """Draw an annotated confusion-matrix heat map and return its Axes.

    Parameters
    ----------
    y_true, y_pred : sequences of labels of equal length.
    classes : sequence of labels; fixes both the row/column order of the
        matrix and the tick labels, so the two can never disagree.
    title : optional plot title.
    cmap : matplotlib colormap used for the cells.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    labels = list(classes)
    # Pass the labels explicitly: without them confusion_matrix only uses the
    # labels that occur in y_true/y_pred (sorted), which yields a matrix whose
    # size or order differs from `classes` whenever a class is absent.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(8, 8))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=labels, yticklabels=labels,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Cells darker than half the maximum count get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
#Code from LDSI_Classifier_Workshop
def top_tfidf_features(row, features, top_n=15):
    """Return the `top_n` largest values of `row` with their feature names.

    The result is a DataFrame with columns 'feature' and 'tfidf', ordered
    from the largest value downwards.
    """
    # argsort is ascending, so reverse it before truncating to the best top_n.
    ranked_ids = np.argsort(row)[::-1][:top_n]
    names = [features[idx] for idx in ranked_ids]
    scores = [row[idx] for idx in ranked_ids]
    table = pd.DataFrame(list(zip(names, scores)))
    table.columns = ['feature', 'tfidf']
    return table


def top_features_in_doc(Xtr, features, row_id, top_n=15):
    """Top tfidf features of a single document, i.e. of row `row_id` of `Xtr`."""
    doc_row = Xtr[row_id]
    # Anything that is not exactly an ndarray is densified through .toarray().
    dense_row = doc_row if type(doc_row) is np.ndarray else doc_row.toarray()
    return top_tfidf_features(np.squeeze(dense_row), features, top_n)


def top_mean_features(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the top_n features with the highest mean tfidf value.

    Parameters
    ----------
    Xtr : document-by-feature matrix (ndarray, or any object offering
        `.toarray()`).
    features : sequence of feature names, indexed like the columns of `Xtr`.
    grp_ids : optional row selector identifying the documents to average
        over; None or an empty selector means "all rows".
    min_tfidf : values below this threshold count as 0 in the mean.
    top_n : number of features to return.

    Returns
    -------
    DataFrame produced by `top_tfidf_features` for the column means.

    `Xtr` is never modified.
    """
    # `if grp_ids:` raises for multi-element NumPy arrays, so test emptiness
    # explicitly; selectors without a length (e.g. a slice) keep the old
    # truthiness rule.
    if grp_ids is None:
        use_group = False
    else:
        try:
            use_group = len(grp_ids) > 0
        except TypeError:
            use_group = bool(grp_ids)

    D = Xtr[grp_ids] if use_group else Xtr
    if type(D) is not np.ndarray:
        D = D.toarray()
    if D is Xtr:
        # Thresholding below writes into D; copy so the caller's matrix
        # is not silently zeroed.
        D = D.copy()
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_features(tfidf_means, features, top_n)


def top_features_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    """Map every distinct label in `y` to its top-feature DataFrame.

    For each label the rows of `Xtr` carrying that label are passed to
    `top_mean_features`; the resulting frame also gets the label stored
    on its `label` attribute.
    """
    per_class = {}
    for cls in np.unique(y):
        row_ids = np.where(y == cls)
        cls_df = top_mean_features(Xtr, features, row_ids, min_tfidf=min_tfidf, top_n=top_n)
        cls_df.label = cls
        per_class[cls] = cls_df
    return per_class


def span_top_tfidf(spans_txt, spans_tfidf, features, index):
    """Print the text of span `index`, then its top tfidf features."""
    span_text = spans_txt[index]
    print('span text:\n' + span_text + '\n')
    top_feats = top_features_in_doc(spans_tfidf, features, index)
    print(top_feats)
    
corpus_fpath = './ldsi_bva_sentence_corpus_v1.json'
# Use a context manager so the file handle is closed once the JSON is parsed.
with open(corpus_fpath) as corpus_file:
    data = json.load(corpus_file)

In [None]:
# Read the document-name lists for each decision outcome.
def _read_id_list(path):
    """Return the content of `path` split on newlines; the file is closed afterwards."""
    with open(path, "r") as id_file:
        # NOTE(review): if a file ends with a newline this yields a trailing ''
        # entry that random.sample() could later pick as a "document" - verify
        # the files; the split is kept as-is so the seeded split is unchanged.
        return id_file.read().split('\n')

affirmed = _read_id_list("./affirmed_ids.txt")
denied = _read_id_list("./denied_ids.txt")
remanded = _read_id_list("./remanded_ids.txt")

In [None]:
#Code from LDSI_Classifier_Workshop
annotations = data['annotations']

# Document lookups: record by id, and name <-> id in both directions.
documents_by_id, doc_id_by_name, doc_name_by_id = {}, {}, {}
for doc_rec in data['documents']:
    documents_by_id[doc_rec['_id']] = doc_rec
    doc_id_by_name[doc_rec['name']] = doc_rec['_id']
    doc_name_by_id[doc_rec['_id']] = doc_rec['name']

# Annotation-type lookups: record by id, and name <-> id in both directions.
types_by_id, type_ids_by_name, type_names_by_id = {}, {}, {}
for type_rec in data['types']:
    types_by_id[type_rec['_id']] = type_rec
    type_ids_by_name[type_rec['name']] = type_rec['_id']
    type_names_by_id[type_rec['_id']] = type_rec['name']

In [None]:
# get all sentences assuming every annotation is a sentence
def make_span_data(documents_by_id, types_by_id, annotations, decision_ids=None):
    """Build one record per annotation, treating every annotation as a sentence.

    Parameters
    ----------
    documents_by_id : dict mapping document id -> document record with
        'plainText' and 'name' keys.
    types_by_id : dict mapping type id -> type record with a 'name' key.
    annotations : iterable of records with 'document', 'type', 'start'
        and 'end' keys.
    decision_ids : optional dict mapping a decision label to the collection
        of document names carrying it. Defaults to the notebook-level
        `affirmed`, `denied` and `remanded` lists. When a name occurs under
        several labels the first label wins.

    Returns
    -------
    list of dicts with keys 'txt', 'document', 'name', 'decision', 'type',
    'start', 'start_normalized' and 'end'. 'decision' is None for a document
    listed under no label (previously this raised UnboundLocalError or
    silently reused the label of the preceding annotation).
    """
    if decision_ids is None:
        decision_ids = {"affirmed": affirmed, "denied": denied, "remanded": remanded}

    # name -> label; setdefault keeps the first label, matching the priority
    # of the original affirmed / denied / remanded elif chain.
    decision_by_name = {}
    for label, names in decision_ids.items():
        for name in names:
            decision_by_name.setdefault(name, label)

    span_data = []
    for a in annotations:
        document = documents_by_id[a['document']]
        document_txt = document['plainText']
        doc_name = document['name']
        start = a['start']
        end = a['end']
        span_data.append({'txt': document_txt[start:end],
                          'document': a['document'],
                          'name': doc_name,
                          'decision': decision_by_name.get(doc_name),
                          'type': types_by_id[a['type']]['name'],
                          'start': start,
                          # relative position of the span within its document
                          'start_normalized': start / len(document_txt),
                          'end': end})
    return span_data

In [None]:
# One record per annotation in the corpus, plus the parallel list of type names.
spans = make_span_data(documents_by_id, types_by_id, annotations)
span_labels = [s['type'] for s in spans]

In [None]:
# Decision label of the document each span belongs to (parallel to `spans`).
span_decisions = [s['decision'] for s in spans]

In [None]:
# Draw six affirmed documents: the first three go to test, the other three to dev.
affirmed_ids = random.sample(affirmed, 6)
affirmed_ids_test, affirmed_ids_dev = affirmed_ids[:3], affirmed_ids[3:6]

In [None]:
# Draw six denied documents: the first three go to test, the other three to dev.
denied_ids = random.sample(denied, 6)
denied_ids_test, denied_ids_dev = denied_ids[:3], denied_ids[3:6]

In [None]:
# Draw six remanded documents: the first three go to test, the other three to dev.
remanded_ids = random.sample(remanded, 6)
remanded_ids_test, remanded_ids_dev = remanded_ids[:3], remanded_ids[3:6]

In [None]:
# Held-out document names across all three decision outcomes.
test_ids_all = [*affirmed_ids_test, *denied_ids_test, *remanded_ids_test]
dev_ids_all = [*affirmed_ids_dev, *denied_ids_dev, *remanded_ids_dev]

In [None]:
# Route every span to the split of its document; anything that is neither
# a test nor a dev document is training data.
train_spans_new, dev_spans_new, test_spans_new = [], [], []
for span in spans:
    doc_name = span['name']
    if doc_name in test_ids_all:
        bucket = test_spans_new
    elif doc_name in dev_ids_all:
        bucket = dev_spans_new
    else:
        bucket = train_spans_new
    bucket.append(span)

df_train_spans = pd.DataFrame(train_spans_new)
df_dev_spans = pd.DataFrame(dev_spans_new)
df_test_spans = pd.DataFrame(test_spans_new)

# ids of the documents that ended up in the training split
train_doc_list = df_train_spans['document'].unique()

In [None]:
# Create the corpus: raw span texts per split, in the same order as the span lists.
train_spans_txt = [s['txt'] for s in train_spans_new]
dev_spans_txt = [s['txt'] for s in dev_spans_new]
test_spans_txt = [s['txt'] for s in test_spans_new]

In [None]:
# Stock spaCy English pipeline; used below as the baseline sentence segmenter.
nlp = spacy.load("en_core_web_sm")

### Sentence Segmenter on Training Data

In [None]:
# Score the spaCy sentence segmentation against the gold sentence annotations
# of every training document.
tp_total = 0
fp_total = 0
fn_total = 0

score_list = []

for doc_id in train_doc_list:
    # Gold annotations of this document, ordered by start offset.
    test_doc = sorted((s for s in train_spans_new if s['document'] == doc_id),
                      key=lambda k: k['start'])

    document_txt = documents_by_id[doc_id]['plainText']
    doc = nlp(document_txt)

    assert doc.has_annotation("SENT_START")

    # Collect the predicted sentence offsets once instead of re-walking
    # doc.sents for every annotation.
    pred_offsets = [(sent.start_char, sent.end_char) for sent in doc.sents]

    tp = 0
    fp = 0

    for i, gold in enumerate(test_doc):
        t_start = gold['start']
        t_end = gold['end']
        next_gold = test_doc[i + 1] if i != len(test_doc) - 1 else None
        for s_start, s_end in pred_offsets:
            if s_start >= t_start - 3 and s_end <= t_end + 3:
                # Prediction lies within the gold span (3-character tolerance):
                # a hit only if both boundaries line up, otherwise an over-split.
                if s_start <= t_start + 3 and s_end >= t_end - 3:
                    tp += 1
                else:
                    fp += 1
            elif t_start <= s_start <= t_end and s_end >= t_end:
                # Starts inside the gold span but runs past its end.
                fp += 1
            elif (next_gold is not None
                  and t_end <= s_start < next_gold['start']
                  and s_end <= next_gold['end']):
                # Starts in the gap between this gold span and the next one.
                fp += 1

    fn = len(test_doc) - tp
    tp_total += tp
    fp_total += fp
    fn_total += fn
    # Guard the ratios: a document without any counted prediction would
    # otherwise raise ZeroDivisionError and abort the whole evaluation.
    doc_precision = tp / (tp + fp) if (tp + fp) else 0.0
    doc_recall = tp / (tp + fn) if (tp + fn) else 0.0
    score_dict = {"File": doc_id,
                  "Precision": doc_precision,
                  "Recall": doc_recall}
    score_list.append(score_dict)

print("Total", tp_total, fp_total, fn_total)

precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Precision: {precision}\n Recall: {recall}\n F1 Score: {f1_score}")

In [None]:
# The three documents with the lowest precision and the three with the lowest recall.
precision_scores = sorted(score_list, key=lambda k: k['Precision'])[0:3]
recall_scores = sorted(score_list, key=lambda k: k['Recall'])[0:3]

In [None]:
# Show the lowest-scoring documents for manual inspection.
print(precision_scores)
print(recall_scores)

In [None]:
# Hard-coded document ids used for the "worst performing" error analysis below.
# NOTE(review): presumably copied from the lowest-scoring output above; they are only
# meaningful while these documents stay in the training split - confirm after re-splitting.
new_doc_list = ['60b606d7f8611168dd279d16', '60b606d9f8611168dd279d44', '60b606d8f8611168dd279d2f', '60b606cbf8611168dd279cd1']

### Worst Performing Docs

In [None]:
# Score the spaCy sentence segmentation on the hand-picked documents only.
tp_total = 0
fp_total = 0
fn_total = 0

worst_score_list = []

for doc_id in new_doc_list:
    test_doc = [s for s in train_spans_new if s['document'] == doc_id]
    len_ann = len(test_doc)
    document_txt = documents_by_id[doc_id]['plainText']
    doc = nlp(document_txt)
    # Gold annotations ordered by start offset.
    test_doc = sorted(test_doc, key=lambda k: k['start'])

    # Collect the predicted sentence offsets once instead of re-walking
    # doc.sents for every annotation.
    pred_offsets = [(sent.start_char, sent.end_char) for sent in doc.sents]

    tp = 0
    fp = 0

    for i, gold in enumerate(test_doc):
        t_start = gold['start']
        t_end = gold['end']
        next_gold = test_doc[i + 1] if i != len(test_doc) - 1 else None
        for s_start, s_end in pred_offsets:
            if s_start >= t_start - 3 and s_end <= t_end + 3:
                # Prediction lies within the gold span (3-character tolerance):
                # a hit only if both boundaries line up, otherwise an
                # internal sentence (over-split).
                if s_start <= t_start + 3 and s_end >= t_end - 3:
                    tp += 1
                else:
                    fp += 1
            elif t_start <= s_start <= t_end and s_end >= t_end:
                # Starts inside the gold span, ends outside of it.
                fp += 1
            elif (next_gold is not None
                  and t_end <= s_start < next_gold['start']
                  and s_end <= next_gold['end']):
                # Starts in the gap between this gold span and the next one.
                fp += 1

    fn = len_ann - tp
    tp_total += tp
    fp_total += fp
    fn_total += fn
    # Guard the ratios: a listed document that has no spans in the training
    # split (or no counted prediction) would otherwise raise ZeroDivisionError.
    doc_precision = tp / (tp + fp) if (tp + fp) else 0.0
    doc_recall = tp / (tp + fn) if (tp + fn) else 0.0
    worst_score_dict = {"File": doc_id,
                        "Precision": doc_precision,
                        "Recall": doc_recall}
    worst_score_list.append(worst_score_dict)

precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Worst Precision: {precision}\n Recall: {recall}\n F1 Score: {f1_score}")


### Spacy extended

In [None]:
#https://blog.ceshine.net/post/spacy-sentencizer/
from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Pipeline component that forbids sentence starts at selected tokens.

    Tokens whose text is listed in `no_start_here` are marked as not starting
    a sentence; for opening quotes and colons the *following* token is marked
    instead. Returns the same `doc`.
    """
    # Possessive clitics, closing quotes, whitespace tokens and header words:
    # the token itself must not open a sentence.
    no_start_here = {"’s", "'s",
                     "”", "’",
                     "\t", " ", "  ", "   ", "    ", "\n", "\r",
                     "DC.",
                     "Archive", "DOCKET", "NO.", "DATE", "(", ")"}
    # Opening quotes and colons: the token after them must not open a sentence.
    # (The old test `token.text in (": ")` was a substring check against the
    # string ": " rather than a tuple membership test.)
    no_start_after = {"“", "‘", ":", ": "}

    last_index = len(doc) - 1
    for i, token in enumerate(doc):
        text = token.text
        if text in no_start_here:
            doc[i].is_sent_start = False
        elif text in no_start_after and i < last_index:
            # Bounds check: a colon or opening quote as the final token has no
            # successor; indexing doc[i+1] there raised IndexError.
            doc[i + 1].is_sent_start = False

    return doc

In [None]:
# add_pipe raises when a component with this name is already in the pipeline,
# so guard the call to keep the cell safe to re-run on a live kernel.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")

In [None]:
#New Worst ones

# Re-score the hand-picked documents with whatever pipeline `nlp` holds now.
tp_total = 0
fp_total = 0
fn_total = 0

worst_score_list = []

for doc_id in new_doc_list:
    test_doc = [s for s in train_spans_new if s['document'] == doc_id]
    len_ann = len(test_doc)
    document_txt = documents_by_id[doc_id]['plainText']
    doc = nlp(document_txt)
    # Gold annotations ordered by start offset.
    test_doc = sorted(test_doc, key=lambda k: k['start'])

    # Collect the predicted sentence offsets once instead of re-walking
    # doc.sents for every annotation.
    pred_offsets = [(sent.start_char, sent.end_char) for sent in doc.sents]

    tp = 0
    fp = 0

    for i, gold in enumerate(test_doc):
        t_start = gold['start']
        t_end = gold['end']
        next_gold = test_doc[i + 1] if i != len(test_doc) - 1 else None
        for s_start, s_end in pred_offsets:
            if s_start >= t_start - 3 and s_end <= t_end + 3:
                # Prediction lies within the gold span (3-character tolerance):
                # a hit only if both boundaries line up, otherwise an
                # internal sentence (over-split).
                if s_start <= t_start + 3 and s_end >= t_end - 3:
                    tp += 1
                else:
                    fp += 1
            elif t_start <= s_start <= t_end and s_end >= t_end:
                # Starts inside the gold span, ends outside of it.
                fp += 1
            elif (next_gold is not None
                  and t_end <= s_start < next_gold['start']
                  and s_end <= next_gold['end']):
                # Starts in the gap between this gold span and the next one.
                fp += 1

    fn = len_ann - tp
    tp_total += tp
    fp_total += fp
    fn_total += fn
    # Guard the ratios: a listed document that has no spans in the training
    # split (or no counted prediction) would otherwise raise ZeroDivisionError.
    doc_precision = tp / (tp + fp) if (tp + fp) else 0.0
    doc_recall = tp / (tp + fn) if (tp + fn) else 0.0
    worst_score_dict = {"File": doc_id,
                        "Precision": doc_precision,
                        "Recall": doc_recall}
    worst_score_list.append(worst_score_dict)

precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Worst Precision: {precision}\n Recall: {recall}\n F1 Score: {f1_score}")

In [None]:
# Display the per-document precision/recall records collected above.
worst_score_list

In [None]:
#Training Data

# Score the current `nlp` pipeline on every training document.
tp_total = 0
fp_total = 0
fn_total = 0

score_list = []

for doc_id in train_doc_list:
    # Gold annotations of this document, ordered by start offset.
    test_doc = sorted((s for s in train_spans_new if s['document'] == doc_id),
                      key=lambda k: k['start'])

    document_txt = documents_by_id[doc_id]['plainText']
    doc = nlp(document_txt)

    # Collect the predicted sentence offsets once instead of re-walking
    # doc.sents for every annotation.
    pred_offsets = [(sent.start_char, sent.end_char) for sent in doc.sents]

    tp = 0
    fp = 0

    for i, gold in enumerate(test_doc):
        t_start = gold['start']
        t_end = gold['end']
        next_gold = test_doc[i + 1] if i != len(test_doc) - 1 else None
        for s_start, s_end in pred_offsets:
            if s_start >= t_start - 3 and s_end <= t_end + 3:
                # Prediction lies within the gold span (3-character tolerance):
                # a hit only if both boundaries line up, otherwise an over-split.
                if s_start <= t_start + 3 and s_end >= t_end - 3:
                    tp += 1
                else:
                    fp += 1
            elif t_start <= s_start <= t_end and s_end >= t_end:
                # Starts inside the gold span but runs past its end.
                fp += 1
            elif (next_gold is not None
                  and t_end <= s_start < next_gold['start']
                  and s_end <= next_gold['end']):
                # Starts in the gap between this gold span and the next one.
                fp += 1

    fn = len(test_doc) - tp
    tp_total += tp
    fp_total += fp
    fn_total += fn
    # Guard the ratios: a document without any counted prediction would
    # otherwise raise ZeroDivisionError and abort the whole evaluation.
    doc_precision = tp / (tp + fp) if (tp + fp) else 0.0
    doc_recall = tp / (tp + fn) if (tp + fn) else 0.0
    score_dict = {"File": doc_id,
                  "Precision": doc_precision,
                  "Recall": doc_recall}
    score_list.append(score_dict)

print("Total", tp_total, fp_total, fn_total)

In [None]:
# Micro-averaged scores over the totals; each ratio falls back to 0.0 when its
# denominator is empty instead of raising ZeroDivisionError.
precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Precision: {precision}\n Recall: {recall}\n F1 Score: {f1_score}")

### Luima

In [None]:
#Worst files with sbd
# Score the luima_sbd segmenter on the hand-picked documents only.
tp_total = 0
fp_total = 0
fn_total = 0

worst_score_list = []

for doc_id in new_doc_list:
    test_doc = [s for s in train_spans_new if s['document'] == doc_id]
    len_ann = len(test_doc)
    document_txt = documents_by_id[doc_id]['plainText']
    sentences = sbd.text2sentences(document_txt, offsets=False)
    offsets = sbd.text2sentences(document_txt, offsets=True)
    # Gold annotations ordered by start offset.
    test_doc = sorted(test_doc, key=lambda k: k['start'])

    # NOTE(review): assumes offsets[j] holds the (start, end) pair of
    # sentences[j], as the original index loop did - confirm against luima_sbd.
    pred_offsets = [(offsets[j][0], offsets[j][1]) for j in range(len(sentences))]

    tp = 0
    fp = 0

    for i, gold in enumerate(test_doc):
        t_start = gold['start']
        t_end = gold['end']
        next_gold = test_doc[i + 1] if i != len(test_doc) - 1 else None
        for s_start, s_end in pred_offsets:
            if s_start >= t_start - 3 and s_end <= t_end + 3:
                # Prediction lies within the gold span (3-character tolerance):
                # a hit only if both boundaries line up, otherwise an
                # internal sentence (over-split).
                if s_start <= t_start + 3 and s_end >= t_end - 3:
                    tp += 1
                else:
                    fp += 1
            elif t_start <= s_start <= t_end and s_end >= t_end:
                # Starts inside the gold span, ends outside of it.
                fp += 1
            elif (next_gold is not None
                  and t_end <= s_start < next_gold['start']
                  and s_end <= next_gold['end']):
                # Starts in the gap between this gold span and the next one.
                fp += 1

    fn = len_ann - tp
    tp_total += tp
    fp_total += fp
    fn_total += fn
    # Guard the ratios: a listed document that has no spans in the training
    # split (or no counted prediction) would otherwise raise ZeroDivisionError.
    doc_precision = tp / (tp + fp) if (tp + fp) else 0.0
    doc_recall = tp / (tp + fn) if (tp + fn) else 0.0
    worst_score_dict = {"File": doc_id,
                        "Precision": doc_precision,
                        "Recall": doc_recall}
    worst_score_list.append(worst_score_dict)


In [None]:
# Micro-averaged scores over the totals; each ratio falls back to 0.0 when its
# denominator is empty instead of raising ZeroDivisionError.
precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Precision: {precision}\n Recall: {recall}\n F1 Score: {f1_score}")

In [None]:
# Display the per-document precision/recall records collected above.
worst_score_list

In [None]:
#Training Data
# Score the luima_sbd segmenter on every training document.
tp_total = 0
fp_total = 0
fn_total = 0

score_list = []

for doc_id in train_doc_list:
    # Gold annotations of this document, ordered by start offset.
    test_doc = sorted((s for s in train_spans_new if s['document'] == doc_id),
                      key=lambda k: k['start'])

    document_txt = documents_by_id[doc_id]['plainText']
    sentences = sbd.text2sentences(document_txt, offsets=False)
    offsets = sbd.text2sentences(document_txt, offsets=True)

    # NOTE(review): assumes offsets[j] holds the (start, end) pair of
    # sentences[j], as the original index loop did - confirm against luima_sbd.
    pred_offsets = [(offsets[j][0], offsets[j][1]) for j in range(len(sentences))]

    tp = 0
    fp = 0

    for i, gold in enumerate(test_doc):
        t_start = gold['start']
        t_end = gold['end']
        next_gold = test_doc[i + 1] if i != len(test_doc) - 1 else None
        for s_start, s_end in pred_offsets:
            if s_start >= t_start - 3 and s_end <= t_end + 3:
                # Prediction lies within the gold span (3-character tolerance):
                # a hit only if both boundaries line up, otherwise an over-split.
                if s_start <= t_start + 3 and s_end >= t_end - 3:
                    tp += 1
                else:
                    fp += 1
            elif t_start <= s_start <= t_end and s_end >= t_end:
                # Starts inside the gold span but runs past its end.
                fp += 1
            elif (next_gold is not None
                  and t_end <= s_start < next_gold['start']
                  and s_end <= next_gold['end']):
                # Starts in the gap between this gold span and the next one.
                fp += 1

    fn = len(test_doc) - tp
    tp_total += tp
    fp_total += fp
    fn_total += fn
    # Guard the ratios: a document without any counted prediction would
    # otherwise raise ZeroDivisionError and abort the whole evaluation.
    doc_precision = tp / (tp + fp) if (tp + fp) else 0.0
    doc_recall = tp / (tp + fn) if (tp + fn) else 0.0
    score_dict = {"File": doc_id,
                  "Precision": doc_precision,
                  "Recall": doc_recall}
    score_list.append(score_dict)

In [None]:
# Micro-averaged scores over the totals; each ratio falls back to 0.0 when its
# denominator is empty instead of raising ZeroDivisionError.
precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Precision: {precision}\n Recall: {recall}\n F1 Score: {f1_score}")

In [None]:
# os.listdir returns entries in arbitrary order and includes sub-directories
# (e.g. checkpoint folders); keep regular files only and sort them so the
# corpus order is the same on every run.
unlabeled_files = sorted(
    name for name in os.listdir("./unlabeled/")
    if os.path.isfile(os.path.join("./unlabeled/", name))
)

In [None]:
# Segment every unlabeled file and record its sentences and sentence count.
hist_list = []
for file in unlabeled_files:
    # Read inside a context manager so each handle is closed right away
    # instead of staying open for the whole loop.
    with open(f"./unlabeled/{file}", "r") as unlabeled_file:
        unlabeled_txt = unlabeled_file.read()
    unlabeled_sentences = sbd.text2sentences(unlabeled_txt, offsets=False)
    count = len(unlabeled_sentences)
    hist_dict = {"file_name": file,
                 "sent_count": count,
                 "sentences": unlabeled_sentences}

    hist_list.append(hist_dict)

In [None]:
# One row per unlabeled file; print the total number of sentences found.
hist_df = pd.DataFrame(hist_list)
print(hist_df['sent_count'].sum())

In [None]:
# Persist the per-file sentence table.
# NOTE(review): the 'sentences' column holds Python lists, so it is presumably
# written as text and will not come back as lists via read_csv - confirm.
hist_df.to_csv('hist_df.csv', index=False)

In [None]:
# Sentence histogram: distribution of sentence counts per unlabeled file (bin width 1).
fig_prop()
sns.histplot(data = hist_df, x = "sent_count", binwidth=1)
plt.xlabel('Number of sentences')

### Tokenizer

In [None]:
# Reload the stock pipeline for tokenization. This replaces the `nlp` object
# used in the segmentation cells above.
nlp = spacy.load("en_core_web_sm")

special_cases = ['Vet. App.','Fed. Cir.']
# Register every listed string as a tokenizer special case made of one piece.
for case_text in special_cases:
    nlp.tokenizer.add_special_case(case_text, [{ORTH: case_text}])

def spacy_tokenize(txt):
    """Tokenize `txt` with the notebook-level `nlp` pipeline and normalise the tokens.

    Rules, applied in this order to every token:
      * a token followed by an apostrophe-bearing particle (pos 'PART', e.g.
        "'s" or "n't") is merged with it, stripped of non-word characters and
        lower-cased; the particle is consumed and produces no token of its own;
      * punctuation (pos 'PUNCT') is dropped;
      * tokens listed in `special_cases` are kept as their lower-cased lemma;
      * tokens that do not start with a letter and are neither digits nor
        upper-case are dropped;
      * numbers (pos 'NUM') become '<NUMk>' with k the token length;
      * everything else becomes its lemma, stripped of non-word characters
        and lower-cased.

    Returns the list of normalised token strings.
    """
    tokens = list(nlp(txt))
    clean_tokens = []
    i = 0
    # A while loop is required here: the original `i += 1` inside
    # `for i in range(...)` had no effect, so a merged particle was visited
    # again on the next iteration (emitting e.g. a stray "not" after "dont").
    while i < len(tokens):
        t = tokens[i]
        t_next = tokens[i + 1] if i + 1 < len(tokens) else None
        if t_next is not None and t_next.pos_ == 'PART' and "'" in t_next.text:
            t_combined = re.sub(r'\W', '', t.text + t_next.text).lower()
            clean_tokens.append(t_combined)
            i += 2  # skip the particle that was just merged
            continue
        if t.pos_ == 'PUNCT':
            pass
        elif t.text in special_cases:
            clean_tokens.append(t.lemma_.lower())
        elif not t.text[0].isalpha() and not t.is_digit and not t.is_upper:
            pass
        elif t.pos_ == 'NUM':
            clean_tokens.append(f'<NUM{len(t)}>')
        else:
            lemma = re.sub(r'\W', '', t.lemma_).lower()
            clean_tokens.append(lemma)
        i += 1
    return clean_tokens

In [None]:
# Hand-written example sentences for eyeballing the tokenizer output.
# NOTE(review): the replacement characters in example_mixed_1 look like a
# mis-decoded symbol from the source text - confirm before relying on it.
example_basic_1 = 'In sum, as the preponderance of the evidence is against the Veteran\'s claim, his appeal must be denied.'
example_cit_1 = 'Smith v. Gober, 14 Vet. App. 227 (2000), aff\'d 281 F.3d 1384 (Fed. Cir. 2002); \tDela Cruz v. Principi, 15 Vet. App. 143 (2001); see also Quartuccio v. Principi, 16 Vet. App. 183 (2002).'
example_rule_1 = '"To establish a right to compensation for a present disability, a Veteran must show: "(1) the existence of a present disability; (2) in-service incurrence or aggravation of a disease or injury; and (3) a causal relationship between the present disability and the disease or injury incurred or aggravated during service"-the so-called "nexus" requirement."'
example_mixed_1 = 'In Dingess v. Nicholson, 19 Vet. App. 473 (2006), the U.S. Court of Appeals for Veterans Claims held that, upon receipt of an application for a service-connection claim, 38 U.S.C.A. � 5103(a) and 38 C.F.R. � 3.159(b) require VA to provide the claimant with notice that a disability rating and an effective date for the award of benefits will be assigned if service connection is awarded. '

In [None]:
# Show the normalised tokens for the mixed citation/rule example.
spacy_tokenize(example_mixed_1)

In [None]:
# Tokenize every segmented sentence of every unlabeled file: one record per sentence.
token_list = []
for file_name, file_sentences in zip(hist_df['file_name'], hist_df['sentences']):
    for sent in file_sentences:
        tokenized = spacy_tokenize(sent)
        token_list.append({"File": file_name,
                           "Sentence": sent,
                           "Tokens": tokenized,
                           "Token_number": len(tokenized)})

In [None]:
# One row per sentence; persist the table for later sessions.
# NOTE(review): the 'Tokens' column holds Python lists, so it is presumably
# written as text and will not come back as lists via read_csv - confirm.
token_list_df = pd.DataFrame(token_list)
token_list_df.to_csv('token_df.csv', index=False)

In [None]:
# Total number of tokenized sentences.
len(token_list_df)

In [None]:
# Token histogram: distribution of tokens per sentence (bin width 1).
fig_prop()
sns.histplot(data = token_list_df, x = 'Token_number', binwidth=1)

In [None]:
# Shuffle the sentence table. A fixed random_state makes the order (and thus
# the corpus file written from it) identical on every run; without it
# DataFrame.sample draws a different permutation each time.
random_token_list_df = token_list_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
def make_file(Tokens):
    """Write one sentence (its tokens joined by spaces) as a line to `new_file`."""
    new_file.write(' '.join(Tokens) + '\n')

# Open in 'w' mode: the previous 'a' (append) mode duplicated the whole corpus
# every time this cell was re-run. The context manager also closes the file if
# writing fails, and the encoding is pinned so the output does not depend on
# the platform default.
with open('random_sentences.txt', 'w', encoding='utf-8') as new_file:
    # Only sentences with more than five tokens go into the corpus.
    for sentence_tokens in random_token_list_df.loc[random_token_list_df.Token_number > 5, 'Tokens']:
        make_file(sentence_tokens)

In [None]:
# Reload the artefacts written above.
# NOTE(review): `random_sentences` is an open file handle that is never read or
# closed in the cells visible here - drop it or wrap it in `with` if nothing
# later needs it. Also, after read_csv the 'Tokens' column is presumably text
# rather than lists - confirm before reusing it.
random_sentences = open('random_sentences.txt', 'r')
token_list_df = pd.read_csv('token_df.csv')

### Train FastText

In [None]:
# Train unsupervised fastText vectors on the shuffled sentence corpus:
# 100 dimensions, words seen fewer than 20 times ignored, 10 epochs.
model = fasttext.train_unsupervised(input = 'random_sentences.txt', dim = 100, minCount = 20, epoch = 10)

In [None]:
# Persist the trained embedding model.
model.save_model("result_model.bin")

In [None]:
# Load the saved embedding model back from disk.
result_model = fasttext.load_model("result_model.bin")

In [None]:
# Vocabulary of the loaded model; undecodable bytes are replaced rather than raising.
words = result_model.get_words(on_unicode_error='replace')

In [None]:
# Vocabulary size of the embedding model.
vocab = len(words)
print(vocab)

In [None]:
# Probe words whose nearest neighbours are inspected below.
word_list = ["veteran","vet","service","cause","caused","remanded","vietnam","see","denied","decision","board","physician","evidence","claim","pain","under","appeal"]

In [None]:
# Nearest embedding neighbours for every probe word, one record per word.
neighbors = [
    {"word": probe, "neighbours": model.get_nearest_neighbors(probe)}
    for probe in word_list
]

neighbors

In [None]:
# Save the nearest-neighbour table for the report.
df_neigh = pd.DataFrame(neighbors)
df_neigh.to_csv('nearest_neighbors.csv') 

## TFIDF Featurization

In [None]:
# suboptimal: tokenizer gets called twice
# Unigram TF-IDF over the training spans; terms must occur in at least 3 spans.
spacy_tfidf_vectorizer = TfidfVectorizer(tokenizer=spacy_tokenize,
                                         min_df=3,
                                         ngram_range=(1,1))
spacy_tfidf_vectorizer = spacy_tfidf_vectorizer.fit(train_spans_txt)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on versions that predate it.
if hasattr(spacy_tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features_spacy = list(spacy_tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_features_spacy = spacy_tfidf_vectorizer.get_feature_names()

In [None]:
# Dense TF-IDF matrices for each split, all produced by the vectorizer fitted on train.
train_tfidf_spacy = spacy_tfidf_vectorizer.transform(train_spans_txt).toarray()
dev_tfidf_spacy = spacy_tfidf_vectorizer.transform(dev_spans_txt).toarray()
test_tfidf_spacy = spacy_tfidf_vectorizer.transform(test_spans_txt).toarray()

# Sentence-type labels per split, in the same order as the matrices above.
train_spans_labels = np.array([s['type'] for s in train_spans_new])
dev_spans_labels = np.array([s['type'] for s in dev_spans_new])
test_spans_labels = np.array([s['type'] for s in test_spans_new])

In [None]:
# Save the fitted vectorizer.
# NOTE(review): the vectorizer holds a reference to spacy_tokenize, so loading
# this file presumably requires that function to be defined first - confirm.
joblib.dump(spacy_tfidf_vectorizer, 'spacy_vectorizer.joblib')

### MODEL TRAINING

In [None]:
# MODEL WITHOUT EMBEDDING
def no_make_feature_vectors_and_labels(spans, vectorizer, tokens_mean=None, tokens_std=None):
    """Build the feature matrix and label vector for the model without embeddings.

    Parameters
    ----------
    spans : list of span dicts with keys 'txt', 'start_normalized',
        'tokens_count' and 'type'.
    vectorizer : fitted vectorizer; its `transform` output is densified with
        `.toarray()`. (Previously this argument was ignored and the global
        `spacy_tfidf_vectorizer` was used instead.)
    tokens_mean, tokens_std : statistics used to standardise 'tokens_count';
        default to the notebook-level `train_tokens_mean` / `train_tokens_std`.

    Returns
    -------
    (X, y) where X holds the tfidf columns followed by the normalised start
    position and the standardised token count, and y holds the span types.

    NOTE(review): no cell visible in this notebook sets the 'tokens_count' key
    or defines `train_tokens_mean` / `train_tokens_std`; make sure a cell
    defines them before this one, otherwise a fresh-kernel run fails here.
    """
    if tokens_mean is None:
        tokens_mean = train_tokens_mean
    if tokens_std is None:
        tokens_std = train_tokens_std

    # function takes long to execute
    # note: we un-sparse the matrix here to be able to manipulate it
    tfidf = vectorizer.transform([s['txt'] for s in spans]).toarray()
    starts_normalized = np.array([s['start_normalized'] for s in spans])
    num_tokens_normalized = np.array([(s['tokens_count'] - tokens_mean) / tokens_std for s in spans])

    y = np.array([s['type'] for s in spans])
    # Append the two scalar features as extra columns after the tfidf block.
    X = np.concatenate((tfidf,
                        np.expand_dims(starts_normalized, axis=1),
                        np.expand_dims(num_tokens_normalized, axis=1)), axis=1)

    return X, y

# Feature matrices and labels for the three splits, all using the same fitted vectorizer.
train_X, train_y = no_make_feature_vectors_and_labels(train_spans_new, spacy_tfidf_vectorizer)
dev_X, dev_y = no_make_feature_vectors_and_labels(dev_spans_new, spacy_tfidf_vectorizer)
test_X, test_y = no_make_feature_vectors_and_labels(test_spans_new, spacy_tfidf_vectorizer)

# Sanity check: matrix and label shapes per split.
print(f'{train_X.shape} {train_y.shape}')
print(f'{dev_X.shape} {dev_y.shape}')
print(f'{test_X.shape} {test_y.shape}')

### Linear Support Vector Machine Classifier

In [None]:
clf = LinearSVC(random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split and reuse the result for the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The matrix shows dev-set predictions, so the title must say so
# (it previously read "training data").
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for dev data')
plt.show()

### Logistic Regression

In [None]:
clf = LogisticRegression(random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split and reuse the result for the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The matrix shows dev-set predictions, so the title must say so
# (it previously read "training data").
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for dev data')
plt.show()

### Radial kernel SVM

In [None]:
# Hyperparameters: defaults (gamma='scale')
clf = SVC(kernel = 'rbf', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: gamma = 'auto'
clf = SVC(kernel = 'rbf', gamma = 'auto', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Polynomial kernel SVM

In [None]:
# Hyperparameters: defaults (degree = 3)
clf = SVC(kernel = 'poly', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: degree = 2
clf = SVC(kernel = 'poly', degree = 2, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Random Forest

In [None]:
# Hyperparameter: max_depth = 20

clf = RandomForestClassifier(max_depth = 20, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: max_depth = None
# (RandomForestClassifier is already imported in the notebook's import cell.)
clf = RandomForestClassifier(max_depth = None, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameters: n_estimators = 200, max_depth = 12

clf = RandomForestClassifier(n_estimators = 200, max_depth = 12, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Decision Tree Classifier

In [None]:
# Hyperparameter: max_depth = 12
# random_state is pinned (as for the other models) so reruns build the same
# tree; ties between equally good splits are otherwise broken randomly.
clf = tree.DecisionTreeClassifier(max_depth=12, random_state=0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: max_depth = 22

# random_state is pinned (as for the other models) so reruns build the same
# tree; ties between equally good splits are otherwise broken randomly.
clf = tree.DecisionTreeClassifier(max_depth=22, random_state=0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: min_samples_split = 10

# random_state is pinned (as for the other models) so reruns build the same
# tree; ties between equally good splits are otherwise broken randomly.
clf = tree.DecisionTreeClassifier(min_samples_split = 10, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

# train_y / dev_y are returned together with train_X / dev_X, so they are
# row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Display the (rows, columns) shape of the current training feature matrix.
np.shape(train_X)

## Best Model

In [None]:
# Refit the linear SVM and evaluate it on all three splits, including the
# held-out TEST split.
clf = LinearSVC(random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the reports and the plots.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)
test_pred = clf.predict(test_X)

# train_y / dev_y / test_y are returned together with the feature matrices, so
# they are row-aligned with the predictions by construction.
print('TRAIN:\n'+classification_report(train_y, train_pred))
print('DEV:\n'+classification_report(dev_y, dev_pred))
plot_confusion_matrix(dev_y, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')

print('TEST:\n'+classification_report(test_y, test_pred))

plot_confusion_matrix(test_y, test_pred, classes=list(clf.classes_),
                      title='Confusion matrix for TEST data')
plt.show()

## Word Embedding Featurization

In [None]:
def spans_add_spacy_tokens(spans):
    """Tokenize each span's text and record the tokens and their count in place.

    For every span, ``span['txt']`` is passed to ``spacy_tokenize``; the result
    is stored under ``'tokens_spacy'`` and its length under ``'tokens_count'``.
    Nothing is returned.
    """
    for span in spans:
        tokens = spacy_tokenize(span['txt'])
        span['tokens_spacy'] = tokens
        span['tokens_count'] = len(tokens)

In [None]:
# Tokenize every split in place (train, test, dev, in that order).
for _split_spans in (train_spans_new, test_spans_new, dev_spans_new):
    spans_add_spacy_tokens(_split_spans)

In [None]:
# One DataFrame per split, built from the list of span dicts.
train_spans_df, test_spans_df, dev_spans_df = (
    pd.DataFrame(split_spans)
    for split_spans in (train_spans_new, test_spans_new, dev_spans_new)
)

In [None]:
# Token-count statistics of the TRAIN split only; the feature builders use
# them to standardize token counts of every split.
train_tokens_mean, train_tokens_std = (
    train_spans_df['tokens_count'].mean(),
    train_spans_df['tokens_count'].std(),
)

In [None]:
# Show the mean and the standard deviation, one per line.
print(train_tokens_mean, train_tokens_std, sep='\n')

In [None]:
def word_vector_spans(spans, dim=100):
    """Attach the mean word embedding of each span under ``s['average_vec']``.

    Every token in ``s['tokens_spacy']`` is looked up with the module-level
    ``model.get_word_vector``; the vectors are summed and divided by
    ``s['tokens_count']``. A span whose token count is 0 gets an all-zero
    vector. Spans are modified in place and nothing is returned.

    Args:
        spans: iterable of span dicts carrying ``tokens_spacy`` and
            ``tokens_count``.
        dim: length of the vectors returned by ``model.get_word_vector``.
            Defaults to 100, the value that used to be hard-coded.
    """
    for s in spans:
        total_tokens = s['tokens_count']
        if total_tokens == 0:
            # No tokens to average: fall back to the zero vector.
            s['average_vec'] = np.zeros(dim)
            continue
        total_vec = np.zeros(dim)
        for t in s['tokens_spacy']:
            total_vec = total_vec + model.get_word_vector(t)
        s['average_vec'] = total_vec / total_tokens
            
# Add 'average_vec' to every span of every split (train, test, dev, in that order).
for _split_spans in (train_spans_new, test_spans_new, dev_spans_new):
    word_vector_spans(_split_spans)

In [None]:
def make_feature_vectors_and_labels(spans):
    """Build the embedding-based feature matrix and the label vector for ``spans``.

    Each row of the returned matrix is the span's ``average_vec`` followed by
    two extra columns: its ``start_normalized`` value and its token count
    standardized with the module-level ``train_tokens_mean`` /
    ``train_tokens_std``.

    Returns:
        Tuple ``(X, y)``: the 2-D feature array and the array of ``type`` labels.
    """
    position_col = np.array([span['start_normalized'] for span in spans])[:, np.newaxis]
    length_col = np.array(
        [(span['tokens_count'] - train_tokens_mean) / train_tokens_std for span in spans]
    )[:, np.newaxis]

    embedding_block = np.array([span['average_vec'] for span in spans])
    labels = np.array([span['type'] for span in spans])

    # Column layout: embedding dimensions, then position, then length.
    features = np.concatenate((embedding_block, position_col, length_col), axis=1)
    return features, labels

In [None]:
# Rebuild the feature matrices and labels from the embedding features
# (train, dev, test, in that order); this replaces the earlier train_X / dev_X / test_X.
(train_X, train_y), (dev_X, dev_y), (test_X, test_y) = (
    make_feature_vectors_and_labels(split_spans)
    for split_spans in (train_spans_new, dev_spans_new, test_spans_new)
)

In [None]:
# One "<X shape> <y shape>" line per split.
print(
    f'{train_X.shape} {train_y.shape}',
    f'{dev_X.shape} {dev_y.shape}',
    f'{test_X.shape} {test_y.shape}',
    sep='\n',
)

### MODEL TRAINING

In [None]:
# Gold 'type' label of every span, one array per split.
train_spans_labels, dev_spans_labels, test_spans_labels = (
    np.array([span['type'] for span in split_spans])
    for split_spans in (train_spans_new, dev_spans_new, test_spans_new)
)

### Linear Support Vector Machine Classifier

In [None]:
clf = LinearSVC(random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Logistic Regression

In [None]:
clf = LogisticRegression(random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Radial kernel SVM

In [None]:
# Hyperparameters: defaults (gamma='scale')
clf = SVC(kernel = 'rbf', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: gamma = 'auto'
clf = SVC(kernel = 'rbf', gamma = 'auto', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Polynomial kernel SVM

In [None]:
# Hyperparameters: defaults (degree = 3)
clf = SVC(kernel = 'poly', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: degree = 2
clf = SVC(kernel = 'poly', degree = 2, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Decision Tree Classifier

In [None]:
# Hyperparameter: max_depth = 12
# random_state is pinned (as for the other models) so reruns build the same
# tree; ties between equally good splits are otherwise broken randomly.
clf = tree.DecisionTreeClassifier(max_depth=12, random_state=0)
clf = clf.fit(train_X, train_y)

print('TRAIN:\n'+classification_report(train_spans_labels, clf.predict(train_X)))
print('DEV:\n'+classification_report(dev_spans_labels, clf.predict(dev_X)))

In [None]:
# Confusion matrix of the current clf on the DEV split; the title now names
# the split that is actually plotted.
plot_confusion_matrix(dev_spans_labels, clf.predict(dev_X), classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: max_depth = 22

# random_state is pinned (as for the other models) so reruns build the same
# tree; ties between equally good splits are otherwise broken randomly.
clf = tree.DecisionTreeClassifier(max_depth=22, random_state=0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: min_samples_split = 10

# random_state is pinned (as for the other models) so reruns build the same
# tree; ties between equally good splits are otherwise broken randomly.
clf = tree.DecisionTreeClassifier(min_samples_split = 10, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

### Random Forest

In [None]:
# Hyperparameter: max_depth = 20

clf = RandomForestClassifier(max_depth = 20, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: max_depth = None
# (RandomForestClassifier is already imported in the notebook's import cell.)
clf = RandomForestClassifier(max_depth = None, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameters: n_estimators = 200, max_depth = 12

clf = RandomForestClassifier(n_estimators = 200, max_depth = 12, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

In [None]:
# Hyperparameter: max_depth = 12

clf = RandomForestClassifier(max_depth = 12, random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split; the same predictions feed the report and the plot.
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

# The plot shows DEV predictions, so it is titled accordingly.
plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data')
plt.show()

## Best Model Word Embedding

In [None]:
# Hyperparameters: defaults (gamma='scale')
# Refit the RBF SVM and evaluate it on all three splits, including the
# held-out TEST split.
clf = SVC(kernel = 'rbf', random_state = 0)
clf = clf.fit(train_X, train_y)

# Predict once per split and reuse the result: each split used to be
# predicted twice (once for the report, once for the plot).
train_pred = clf.predict(train_X)
dev_pred = clf.predict(dev_X)
test_pred = clf.predict(test_X)

print('TRAIN:\n'+classification_report(train_spans_labels, train_pred))
print('DEV:\n'+classification_report(dev_spans_labels, dev_pred))

plot_confusion_matrix(dev_spans_labels, dev_pred, classes=list(clf.classes_),
                      title='Confusion matrix for DEV data for \n Radial SVM')
plt.show()

print('TEST:\n'+classification_report(test_spans_labels, test_pred))
plot_confusion_matrix(test_spans_labels, test_pred, classes=list(clf.classes_),
                      title='Confusion matrix for TEST data for \n Radial SVM')
plt.show()

In [None]:
# Persist the current clf to disk, relative to the working directory.
joblib.dump(value=clf, filename='RSVM_best_model.joblib')

In [None]:
# Load the persisted model back into clf.
# The notebook imports the joblib module only, so the loader has to be called
# as joblib.load; a bare load(...) raises NameError.
# NOTE: joblib.load unpickles the file, so only load files you created yourself.
clf = joblib.load('RSVM_best_model.joblib')

# Error Analysis

In [None]:
def prediction_errors(clf, eval_spans,
                      select_true_label=None,
                      select_pred_label=None):
    """Print every span in ``eval_spans`` that ``clf`` misclassifies.

    Features are built with ``make_feature_vectors_and_labels`` and the
    predictions are compared with each span's ``type``. For each mismatch the
    span index, the document name (looked up in the module-level
    ``documents_by_id``), the span start, both labels and the span text are
    printed, followed by a blank line.

    Args:
        clf: fitted classifier exposing ``predict``.
        eval_spans: sequence of span dicts with the keys ``txt``, ``type``,
            ``document`` and ``start`` plus whatever the feature builder reads.
        select_true_label: if truthy, only report errors whose true label
            equals this value.
        select_pred_label: if truthy, only report errors whose predicted label
            equals this value.
    """
    # Only the feature matrix is needed; true labels are read from the spans.
    eval_X, _ = make_feature_vectors_and_labels(eval_spans)
    pred_y = clf.predict(eval_X)
    for i, (span, pred_label) in enumerate(zip(eval_spans, pred_y)):
        true_label = span['type']
        if true_label == pred_label:
            continue
        if select_true_label and true_label != select_true_label:
            continue
        if select_pred_label and pred_label != select_pred_label:
            continue
        doc_name = documents_by_id[span['document']]['name']
        print('sentence # '+str(i)+' / case '+doc_name+' / @'+str(span['start']))
        print('pred: '+pred_label+' / true: '+true_label)
        print(span['txt'])
        print()

In [None]:
# Errors among 500 randomly drawn TRAIN spans where the model predicted
# 'EvidenceBased/Intermediate Finding'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_pred_label='EvidenceBased/Intermediate Finding',
)

In [None]:
# Errors among 500 randomly drawn TRAIN spans where the model predicted
# 'EvidenceBasedReasoning'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_pred_label='EvidenceBasedReasoning',
)

In [None]:
# Errors among 500 randomly drawn TRAIN spans where the model predicted
# 'ConclusionOfLaw'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_pred_label='ConclusionOfLaw',
)

In [None]:
# Errors among 500 randomly drawn TRAIN spans whose true label is
# 'ConclusionOfLaw'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_true_label='ConclusionOfLaw',
)

In [None]:
# Errors among 500 randomly drawn TRAIN spans where the model predicted
# 'RemandInstructions'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_pred_label='RemandInstructions',
)

In [None]:
# Errors among 500 randomly drawn TRAIN spans whose true label is
# 'RemandInstructions'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_true_label='RemandInstructions',
)

In [None]:
# Errors among 500 randomly drawn TRAIN spans where the model predicted
# 'LegalPolicy'.
prediction_errors(
    clf,
    eval_spans=random.sample(train_spans_new, k=500),
    select_pred_label='LegalPolicy',
)