# Legal Data Science and Informatics

Author: Isaac Misael Olguín Nolasco

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import json
import random
import copy
import re
import os
import chardet
import codecs
import importlib
import numpy as np
import itertools
import gc
from datetime import datetime

import multiprocessing as mp
import matplotlib.pyplot as plt

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from spacy.language import Language

from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import NuSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
from sklearn.metrics import plot_confusion_matrix as plot_confusion_matrix_sklearn

import luima_sbd.sbd_utils as sbd_utils
import config.project_config as project_config
import config.sentences_custom as sentences_custom

import fasttext

gc.collect()

0

## Reading of the dataset and general info


In [2]:
# Path of the dataset
dataset_path = './ldsi_w21_curated_annotations_v2.json'

# Read the file; the context manager closes the handle once parsing is done
with open(dataset_path) as dataset_file:
    dataset_json = json.load(dataset_file)

In [3]:
# Print keys
dataset_json.keys() # dict_keys(['documents', 'annotations', 'types'])

dict_keys(['documents', 'annotations', 'types'])

In [4]:
# Print length of the dataset
print(f'Length of the curated dataset {len(dataset_json)}')

#Print length of 'documents'
print(f'Total of documents in the dataset is {len(dataset_json["documents"])}')

#Print length of 'annotations'
print(f'Total of annotations in the dataset is {len(dataset_json["annotations"])}')

#Print length of 'types'
print(f'Total of types in the dataset is {len(dataset_json["types"])}')
#print(f'Types in the dataset are {dataset_json["types"]}')

Length of the curated dataset 3
Total of documents in the dataset is 540
Total of annotations in the dataset is 15349
Total of types in the dataset is 14


In [5]:
# Shorthands dictionaries (taken from the LDSI_WS21_Classifier_Workshop)

annotations = dataset_json['annotations']
documents_by_id = {d['_id']: d for d in dataset_json['documents']}
types_by_id = {t['_id']: t for t in dataset_json['types']}
type_ids_by_name = {t['name']: t['_id'] for t in dataset_json['types']}
type_names_by_id = {t['_id']: t['name'] for t in dataset_json['types']}
doc_id_by_name = {d['name']: d['_id'] for d in dataset_json['documents']}
doc_name_by_id = {d['_id']: d['name'] for d in dataset_json['documents']}

# Phase 1 - Dataset splitting

## Getting the granted, denied and remanded cases
"""
print(len(annotations))
print('-'*60)
print(len(documents_by_id))
print('-'*60)
print(len(types_by_id))
print(type_ids_by_name.keys())
print('-'*60)
"""

In [6]:
print(dataset_json["annotations"][0])

{'_id': '61bb066d97ad59b4cfc4699a', 'start': 15922, 'end': 16078, 'document': '61aea57397ad59b4cfc41399', 'type': '61aeaf8097ad59b4cfc416d7'}


In [7]:
# Collect the IDs of the documents referenced by at least one annotation.
# A set of annotated IDs is built once, so each document costs one lookup
# instead of a scan over all annotations. Iterating `documents_by_id` keeps
# the resulting list in the same order as the dictionary.
annotated_doc_ids = {annotation['document'] for annotation in annotations}
docs_with_annotations = [docId for docId in documents_by_id if docId in annotated_doc_ids]

print(f'Total number of documents with annotations {len(docs_with_annotations)}')

Total number of documents with annotations 141


In [8]:
docs_decision = {"granted": [], "denied":[], "remanded": []}

In [9]:
docs_decision["granted"] = [doc for doc in docs_with_annotations if documents_by_id.get(doc)["outcome"] == "granted"]
docs_decision["denied"] = [doc for doc in docs_with_annotations if documents_by_id.get(doc)["outcome"] == "denied"]
docs_decision["remanded"] = [doc for doc in docs_with_annotations if documents_by_id.get(doc)["outcome"] == "remanded"]
print(f'There are {len(docs_decision["granted"])} granted, {len(docs_decision["denied"])} denied and, {len(docs_decision["remanded"])} remanded cases')

There are 70 granted, 71 denied and, 0 remanded cases


## Randomly sampling the test, dev and training sets

In [10]:
# Draw a stratified random split: the same number of granted and denied
# documents goes to the test set and to the dev set, the rest is training data.
SPLIT_SEED = 42            # fixed so the sampled split is identical on every run
HELD_OUT_PER_OUTCOME = 7   # documents per outcome in each of the test and dev sets

# A dedicated generator keeps the split reproducible without touching the
# global `random` state used elsewhere.
split_rng = random.Random(SPLIT_SEED)

test_set = []
training_set = []
dev_set = []

test_set.extend(split_rng.sample(docs_decision["granted"], HELD_OUT_PER_OUTCOME))
test_set.extend(split_rng.sample(docs_decision["denied"], HELD_OUT_PER_OUTCOME))

# Remove the test documents before sampling the dev set so the sets stay disjoint
test_ids = set(test_set)
docsGrantedAux = [x for x in docs_decision["granted"] if x not in test_ids]
docsDeniedAux = [x for x in docs_decision["denied"] if x not in test_ids]

dev_set.extend(split_rng.sample(docsGrantedAux, HELD_OUT_PER_OUTCOME))
dev_set.extend(split_rng.sample(docsDeniedAux, HELD_OUT_PER_OUTCOME))

# Whatever was not held out becomes the training set
dev_ids = set(dev_set)
docsGrantedAux = [x for x in docsGrantedAux if x not in dev_ids]
docsDeniedAux = [x for x in docsDeniedAux if x not in dev_ids]

training_set.extend(docsGrantedAux)
training_set.extend(docsDeniedAux)

print(f'Length of the test_set {len(test_set)}')
print(f'Length of the dev_set {len(dev_set)}')
print(f'Length of the training_set {len(training_set)}')

Length of the test_set 14
Length of the dev_set 14
Length of the training_set 113


In [11]:
# List the document IDs of the two held-out sets. The count in the heading is
# taken from the set itself so it cannot drift from the actual split size.
for set_name, doc_ids in (("test", test_set), ("dev", dev_set)):
    print(f"ID's of the {len(doc_ids)} documents for the {set_name} set")
    for index, idDoc in enumerate(doc_ids):
        print(f'\tIndex {index} with ID {idDoc}')

# The training set is long, so its listing is disabled by default. It is kept
# as comments (not as a bare string literal, which Jupyter would echo as the
# cell's output):
# print(f'IDs of the {len(training_set)} documents for the training set')
# for index, idDoc in enumerate(training_set):
#     print(f'\tIndex {index} with ID {idDoc}')

ID's of the 14 documents for the test set
	Index 0 with ID 61aea55c97ad59b4cfc4129e
	Index 1 with ID 61aea55d97ad59b4cfc412b5
	Index 2 with ID 61aea55f97ad59b4cfc41319
	Index 3 with ID 61aea55c97ad59b4cfc41299
	Index 4 with ID 61aea55e97ad59b4cfc412de
	Index 5 with ID 61aea55d97ad59b4cfc412d3
	Index 6 with ID 61aea55c97ad59b4cfc412af
	Index 7 with ID 61aea57397ad59b4cfc4138f
	Index 8 with ID 61aea57497ad59b4cfc413c9
	Index 9 with ID 61aea57097ad59b4cfc4135b
	Index 10 with ID 61aea57097ad59b4cfc4135a
	Index 11 with ID 61aea57497ad59b4cfc413e8
	Index 12 with ID 61aea57497ad59b4cfc413e7
	Index 13 with ID 61aea57197ad59b4cfc41375
ID's of the 14 documents for the dev set
	Index 0 with ID 61aea55f97ad59b4cfc4130e
	Index 1 with ID 61aea55c97ad59b4cfc412ac
	Index 2 with ID 61aea55f97ad59b4cfc41304
	Index 3 with ID 61aea55d97ad59b4cfc412bd
	Index 4 with ID 61aea55e97ad59b4cfc412df
	Index 5 with ID 61aea55e97ad59b4cfc412d4
	Index 6 with ID 61aea55c97ad59b4cfc412a3
	Index 7 with ID 61aea57497ad59

"   \nprint(f'IDs of the {len(training_set)} documents for the training set')\nfor index, idDoc in enumerate(training_set):\n    print(f'\tIndex {index} with ID {idDoc}')\n"

In [12]:
training_set = ['61aea55c97ad59b4cfc41290', '61aea55c97ad59b4cfc41297', '61aea55c97ad59b4cfc41299', '61aea55c97ad59b4cfc4129b', '61aea55c97ad59b4cfc4129d', '61aea55c97ad59b4cfc4129e', '61aea55c97ad59b4cfc4129f', '61aea55c97ad59b4cfc412a0', '61aea55c97ad59b4cfc412a1', '61aea55c97ad59b4cfc412a3', '61aea55c97ad59b4cfc412a4', '61aea55c97ad59b4cfc412a6', '61aea55c97ad59b4cfc412aa', '61aea55c97ad59b4cfc412af', '61aea55d97ad59b4cfc412b5', '61aea55d97ad59b4cfc412b7', '61aea55d97ad59b4cfc412bc', '61aea55d97ad59b4cfc412bd', '61aea55d97ad59b4cfc412bf', '61aea55d97ad59b4cfc412c1', '61aea55d97ad59b4cfc412c7', '61aea55d97ad59b4cfc412cb', '61aea55d97ad59b4cfc412cd', '61aea55d97ad59b4cfc412d2', '61aea55d97ad59b4cfc412d3', '61aea55e97ad59b4cfc412d4', '61aea55e97ad59b4cfc412d5', '61aea55e97ad59b4cfc412d7', '61aea55e97ad59b4cfc412d8', '61aea55e97ad59b4cfc412da', '61aea55e97ad59b4cfc412df', '61aea55e97ad59b4cfc412e6', '61aea55e97ad59b4cfc412ea', '61aea55e97ad59b4cfc412ec', '61aea55e97ad59b4cfc412ee', '61aea55e97ad59b4cfc412f0', '61aea55e97ad59b4cfc412f3', '61aea55e97ad59b4cfc412fb', '61aea55e97ad59b4cfc412ff', '61aea55f97ad59b4cfc41301', '61aea55f97ad59b4cfc41304', '61aea55f97ad59b4cfc41306', '61aea55f97ad59b4cfc41307', '61aea55f97ad59b4cfc4130b', '61aea55f97ad59b4cfc4130c', '61aea55f97ad59b4cfc41318', '61aea55f97ad59b4cfc41319', '61aea55f97ad59b4cfc4131a', '61aea55f97ad59b4cfc4131d', '61aea55f97ad59b4cfc41320', '61aea55f97ad59b4cfc41330', '61aea55f97ad59b4cfc41331', '61aea55f97ad59b4cfc41332', '61aea55f97ad59b4cfc41334', '61aea55f97ad59b4cfc4133b', '61aea55f97ad59b4cfc4133c', '61aea56f97ad59b4cfc41342', '61aea56f97ad59b4cfc41344', '61aea56f97ad59b4cfc41347', '61aea56f97ad59b4cfc41349', '61aea56f97ad59b4cfc4134b', '61aea56f97ad59b4cfc4134c', '61aea56f97ad59b4cfc4134d', '61aea57097ad59b4cfc41351', '61aea57097ad59b4cfc41352', '61aea57097ad59b4cfc4135a', '61aea57097ad59b4cfc4135b', '61aea57097ad59b4cfc4135e', '61aea57097ad59b4cfc41361', '61aea57097ad59b4cfc41364', 
'61aea57097ad59b4cfc41365', '61aea57097ad59b4cfc41366', '61aea57097ad59b4cfc41367', '61aea57097ad59b4cfc41369', '61aea57197ad59b4cfc4136b', '61aea57197ad59b4cfc4136e', '61aea57197ad59b4cfc41372', '61aea57197ad59b4cfc41375', '61aea57197ad59b4cfc41376', '61aea57197ad59b4cfc41377', '61aea57197ad59b4cfc4137a', '61aea57297ad59b4cfc4137f', '61aea57297ad59b4cfc41380', '61aea57297ad59b4cfc41381', '61aea57297ad59b4cfc41382', '61aea57397ad59b4cfc41383', '61aea57397ad59b4cfc4138e', '61aea57397ad59b4cfc4138f', '61aea57397ad59b4cfc41391', '61aea57397ad59b4cfc41395', '61aea57397ad59b4cfc41399', '61aea57397ad59b4cfc4139a', '61aea57397ad59b4cfc4139c', '61aea57397ad59b4cfc4139e', '61aea57397ad59b4cfc413a7', '61aea57497ad59b4cfc413ad', '61aea57497ad59b4cfc413af', '61aea57497ad59b4cfc413b2', '61aea57497ad59b4cfc413b6', '61aea57497ad59b4cfc413be', '61aea57497ad59b4cfc413bd', '61aea57497ad59b4cfc413c0', '61aea57497ad59b4cfc413c4', '61aea57497ad59b4cfc413c9', '61aea57497ad59b4cfc413cc', '61aea57497ad59b4cfc413e0', '61aea57497ad59b4cfc413e1', '61aea57497ad59b4cfc413e3', '61aea57497ad59b4cfc413ea', '61aea57497ad59b4cfc413e7', '61aea57497ad59b4cfc413e8', '61aea57497ad59b4cfc413e9', '61aea57497ad59b4cfc413f1']
dev_set = ['61aea55f97ad59b4cfc41308', '61aea55d97ad59b4cfc412be', '61aea55c97ad59b4cfc412ac', '61aea55e97ad59b4cfc412fd', '61aea55f97ad59b4cfc41335', '61aea55c97ad59b4cfc412ae', '61aea55f97ad59b4cfc41336', '61aea57497ad59b4cfc413d1', '61aea57097ad59b4cfc41355', '61aea57397ad59b4cfc41393', '61aea57497ad59b4cfc413d7', '61aea57097ad59b4cfc41360', '61aea57097ad59b4cfc41368', '61aea56f97ad59b4cfc41343']
test_set = ['61aea55e97ad59b4cfc412de', '61aea55e97ad59b4cfc412eb', '61aea55f97ad59b4cfc41328', '61aea55f97ad59b4cfc4130e', '61aea55f97ad59b4cfc41323', '61aea55f97ad59b4cfc41337', '61aea55f97ad59b4cfc41322', '61aea57497ad59b4cfc413b3', '61aea57497ad59b4cfc413da', '61aea57497ad59b4cfc413d8', '61aea57497ad59b4cfc413d2', '61aea57497ad59b4cfc413ba', '61aea57097ad59b4cfc41358', '61aea57397ad59b4cfc413a5']

print(f'IDs of the training set are : {training_set}')
print(f'IDs of the dev set are : {dev_set}')
print(f'IDs of the test set are : {test_set}')

IDs of the training set are : ['61aea55c97ad59b4cfc41290', '61aea55c97ad59b4cfc41297', '61aea55c97ad59b4cfc41299', '61aea55c97ad59b4cfc4129b', '61aea55c97ad59b4cfc4129d', '61aea55c97ad59b4cfc4129e', '61aea55c97ad59b4cfc4129f', '61aea55c97ad59b4cfc412a0', '61aea55c97ad59b4cfc412a1', '61aea55c97ad59b4cfc412a3', '61aea55c97ad59b4cfc412a4', '61aea55c97ad59b4cfc412a6', '61aea55c97ad59b4cfc412aa', '61aea55c97ad59b4cfc412af', '61aea55d97ad59b4cfc412b5', '61aea55d97ad59b4cfc412b7', '61aea55d97ad59b4cfc412bc', '61aea55d97ad59b4cfc412bd', '61aea55d97ad59b4cfc412bf', '61aea55d97ad59b4cfc412c1', '61aea55d97ad59b4cfc412c7', '61aea55d97ad59b4cfc412cb', '61aea55d97ad59b4cfc412cd', '61aea55d97ad59b4cfc412d2', '61aea55d97ad59b4cfc412d3', '61aea55e97ad59b4cfc412d4', '61aea55e97ad59b4cfc412d5', '61aea55e97ad59b4cfc412d7', '61aea55e97ad59b4cfc412d8', '61aea55e97ad59b4cfc412da', '61aea55e97ad59b4cfc412df', '61aea55e97ad59b4cfc412e6', '61aea55e97ad59b4cfc412ea', '61aea55e97ad59b4cfc412ec', '61aea55e97ad59b4

# Phase 2 - Deciding on a Sentence Segmenter

#### Creation of the corpus

Credit for code goes to M. Grabmair and his LDSI_W21_Classifier_Workshop

In [13]:
# get all sentences assuming every annotation is a sentence
def make_span_data(documents_by_id, types_by_id, annotations):
    """Turn raw annotations into span records that carry their text.

    Args:
        documents_by_id: mapping from document ID to a document dict that has
            a 'plainText' entry.
        types_by_id: mapping from type ID to a type dict that has a 'name' entry.
        annotations: iterable of dicts with 'start', 'end', 'document' and
            'type' entries.

    Returns:
        A list with one dict per annotation, holding the annotated text
        ('txt'), the document ID, the type name, the character offsets and
        the start offset divided by the document length ('start_normalized').
    """
    def _to_span(annotation):
        begin = annotation['start']
        finish = annotation['end']
        doc_id = annotation['document']
        full_text = documents_by_id[doc_id]['plainText']
        type_name = types_by_id[annotation['type']]['name']
        return {
            'txt': full_text[begin:finish],
            'document': doc_id,
            'type': type_name,
            'start': begin,
            'start_normalized': begin / len(full_text),  # relative position in the document
            'end': finish,
        }

    return [_to_span(annotation) for annotation in annotations]

In [14]:
spans = make_span_data(documents_by_id, types_by_id, annotations) # dictionary
span_labels = [s['type'] for s in spans] #e.g. "Citation", "CaseFooter", "Header", etc.
print(len(spans))

15349


### 2.1 Standard segmentation analysis

#### Individual analysis

In [15]:
#Print the first span to know its structure
print(spans[0])

{'txt': "____________________________________________\r\nDEBORAH W. SINGLETON\r\nVeterans Law Judge, Board of Veterans' Appeals\r\n\r\n\r\n\r\n\r\n Department of Veterans Affairs\r", 'document': '61aea57397ad59b4cfc41399', 'type': 'CaseFooter', 'start': 15922, 'start_normalized': 0.9899894298327426, 'end': 16078}


In [16]:
# Print length of the training set
print(f'Length training set {len(training_set)}')

# Pick one document of the training set to inspect the standard segmenter on
index_doc_stdSeg = 0
id_doc_stdSeg = training_set[index_doc_stdSeg]
doc_std_seg = documents_by_id.get(id_doc_stdSeg)
print(f'ID of the [{index_doc_stdSeg}](st.|rd.|th.)element in the document. Doc in the training_set with id [{id_doc_stdSeg}] with keys [{doc_std_seg.keys()}]')
print("*"*100)

# The printed label is derived from the slice length so the two cannot disagree
# (the label used to say [0:50] while 100 characters were printed).
preview_len = 100
plain_text_preview = doc_std_seg.get("plainText")[0:preview_len]
print(f'The document has the following information:\n\t===>ID {doc_std_seg.get("_id")}\n\t===>Name {doc_std_seg.get("name")}\n\t===>PlainText[0:{preview_len}] {plain_text_preview}\n\t===>Outcome {doc_std_seg.get("outcome")}')

Length training set 113
ID of the [0](st.|rd.|th.)element in the document. Doc in the training_set with id [61aea55c97ad59b4cfc41290] with keys [dict_keys(['_id', 'name', 'plainText', 'outcome'])]
****************************************************************************************************
The document has the following information:
	===>ID 61aea55c97ad59b4cfc41290
	===>Name 0619915.txt
	===>PlainText[0:50] Citation Nr: 0619915	
Decision Date: 07/10/06    Archive Date: 07/21/06

DOCKET NO.  03-37 139	)	
	===>Outcome granted


In [17]:
#spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
docSpacy_stdSegIndiv = nlp(doc_std_seg.get("plainText"))
print(docSpacy_stdSegIndiv)

In [None]:
# Entities
#docSpacy_stdSegIndiv.ents

In [None]:
# Getting the sentences and printing how many this document has
sents_std_seg_ind = [str(obj) for obj in docSpacy_stdSegIndiv.sents]
print(len(sents_std_seg_ind))

In [None]:
for sentence in docSpacy_stdSegIndiv.sents:
    print(sentence.text)
    print("%"*50)

In [None]:
spans_per_document = { "_id": id_doc_stdSeg, "spans": [] }

spans_per_document["spans"] = [doc for doc in spans if doc.get("document") == id_doc_stdSeg]

In [None]:
for sentence in spans_per_document["spans"]: 
    print(sentence["txt"])
    print("*"*50)

In [None]:
def computeMetricsNewApproach(annotations, predictions, document, interval=3):
    """Score predicted sentence starts against annotated span starts.

    Each predicted sentence is located in `document`; the prediction counts as
    a true positive when its start offset lies within `interval` characters of
    the 'start' of some annotation, otherwise as a false positive. Annotations
    that no prediction matched are false negatives.

    Args:
        annotations: sequence of dicts with an integer 'start' entry.
        predictions: sequence of sentence strings, expected in document order.
        document: the full text the sentences were taken from.
        interval: tolerance, in characters, around an annotated start.

    Returns:
        Tuple (accuracy, precision, recall, f1Score). A metric whose
        denominator is zero is reported as 0.0 instead of raising.

    Note:
        True positives are counted per prediction, false negatives per
        annotation, so two predictions that hit the same annotation both count
        as true positives. No true negatives are ever counted.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios (nothing predicted / nothing annotated) become 0.0
        return numerator / denominator if denominator else 0.0

    truePositives = 0
    trueNegatives = 0
    falsePositives = 0
    matchedAnnotations = set()
    searchFrom = 0  # offset just past the previously located sentence

    for p in predictions:
        # Search forward from the previous sentence so that a repeated sentence
        # is not mapped back onto its first occurrence in the document.
        indexPred = document.find(p, searchFrom)
        if indexPred == -1:
            # Out-of-order prediction: fall back to a whole-document search
            indexPred = document.find(p)
        if indexPred == -1:
            # Text that does not occur in the document cannot match a boundary
            falsePositives += 1
            continue
        searchFrom = max(searchFrom, indexPred + len(p))

        for i, a in enumerate(annotations):
            if a["start"] - interval <= indexPred <= a["start"] + interval:
                truePositives += 1
                matchedAnnotations.add(i)
                break
        else:
            falsePositives += 1

    falseNegatives = len(annotations) - len(matchedAnnotations)

    accuracy = _ratio(truePositives + trueNegatives,
                      truePositives + falsePositives + falseNegatives + trueNegatives)
    precision = _ratio(truePositives, truePositives + falsePositives)
    recall = _ratio(truePositives, truePositives + falseNegatives)
    f1Score = _ratio(2 * (recall * precision), recall + precision)

    return accuracy, precision, recall, f1Score

In [None]:
sentences_std_seg_ind = [str(obj) for obj in docSpacy_stdSegIndiv.sents]

computeMetricsNewApproach(copy.copy(spans_per_document["spans"]), copy.copy(sentences_std_seg_ind), copy.copy(doc_std_seg.get("plainText")))

#### Collective analysis

In [None]:
# Average the per-document segmentation metrics of the stock spaCy pipeline
# over the whole training set.
accuracy = 0
precision = 0
recall = 0
f1Score = 0

# Loading the model is expensive and does not depend on the document,
# so it is done once instead of once per iteration.
nlp = spacy.load("en_core_web_sm")

for id_doc in training_set:
    doc_by_id = documents_by_id.get(id_doc)

    #DocSpacy is obtained
    docSpacy = nlp(doc_by_id.get("plainText"))

    #Sentences
    sentences = [str(obj) for obj in docSpacy.sents]

    #Spans of the document are obtained
    spans_per_document = { "_id": id_doc, "spans": [] }
    spans_per_document["spans"] = [doc for doc in spans if doc.get("document") == id_doc]

    # The arguments are passed without defensive copies: strings are immutable
    # and the metric function in this notebook only reads the two lists.
    result = computeMetricsNewApproach(spans_per_document["spans"], sentences, doc_by_id.get("plainText"))

    accuracy += result[0]
    precision += result[1]
    recall += result[2]
    f1Score += result[3]

accuracy /= len(training_set)
precision /= len(training_set)
recall /= len(training_set)
f1Score /= len(training_set)

print(f'accuracy {accuracy}, precision {precision}, recall {recall}, f1Score {f1Score}')

#accuracy 0.4220689794749151, precision 0.5876022051703683, recall 0.5954441815125752, f1Score 0.5900457535737149

### 2.2 Improved segmentation analysis

In [None]:
# Customise the spaCy pipeline: list markers and section headers are registered
# as tokenizer special cases, and a component placed before the parser marks
# every header token as the start of a sentence.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

# Numbered / lettered list markers that must stay a single token
LIST_MARKERS = ['1.', '2.', '3.', 'a.', 'b.', 'c.', 'I.', 'II.', 'III.']

# Single source of truth for the section headers. It is the union of the
# spellings that were previously registered with the tokenizer and the
# (slightly different) spellings the component compared against, so both
# steps now agree on every variant.
SECTION_HEADERS = [
    'REPRESENTATION',
    'THE ISSUE',
    'ATTORNEY FOR THE BOARD',
    'ORDER',
    'FINDINGS OF FACT',
    'FINDING OF FACT',
    'CONCLUSION OF LAW',
    'INTRODUCTION',
    'WITNESS AT HEARINGS ON APPEAL',
    'WITNESS AT HEARING ON APPEAL',
    'REASONS AND BASES FOR FINDINGS AND CONCLUSION',
    'REASONS AND BASES FOR FINDING AND CONCLUSION',
]

for special_case in LIST_MARKERS + SECTION_HEADERS:
    nlp.tokenizer.add_special_case(special_case, [{ORTH: special_case}])
regexBoundaryDigits = re.compile('^[0-9]$')

@Language.component("my_component")
def custom_headers(doc):
    """Mark every token whose text is a known section header as a sentence start.

    Args:
        doc: the spaCy Doc being processed by the pipeline.

    Returns:
        The same Doc, with `sent_start` set on header tokens.
    """
    headerTexts = frozenset(SECTION_HEADERS)
    flagNext = False
    for token in doc:
        # NOTE(review): `not token.text` looks like it can never be true (a
        # token with empty text), which would make this branch dead code. It
        # presumably was meant to start a new sentence right after a header;
        # left unchanged because the intended condition is unclear - confirm.
        if flagNext and not token.text:
            token.sent_start = True
            flagNext = False

        if token.text in headerTexts:
            token.sent_start = True
            flagNext = True
    return doc

# The function is registered under "custom_headers" as well; the decorator name
# above is kept so that any existing reference to "my_component" still resolves.
Language.component("custom_headers", func=custom_headers)

nlp.add_pipe("custom_headers", before='parser')

In [None]:
# Mean segmentation metrics of the customised `nlp` pipeline over the training
# documents. The four running totals live in one list, in the order
# (accuracy, precision, recall, f1Score) returned by the metric function.
metric_totals = [0, 0, 0, 0]

for id_doc in training_set:
    doc_by_id = documents_by_id.get(id_doc)
    plain_text = doc_by_id.get("plainText")

    # Segment the document with spaCy and keep the sentence texts
    docSpacy = nlp(plain_text)
    sentences = [str(obj) for obj in docSpacy.sents]

    # Gold spans that belong to this document
    spans_per_document = {
        "_id": id_doc,
        "spans": [doc for doc in spans if doc.get("document") == id_doc],
    }

    result = computeMetricsNewApproach(
        copy.deepcopy(spans_per_document["spans"]),
        copy.copy(sentences),
        copy.deepcopy(plain_text),
    )

    for position, value in enumerate(result):
        metric_totals[position] += value

accuracy, precision, recall, f1Score = (
    total / len(training_set) for total in metric_totals
)

print(f'accuracy {accuracy}, precision {precision}, recall {recall}, f1Score {f1Score}')
#

In [None]:
index_doc_stdSeg = 0
id_doc_stdSeg = training_set[index_doc_stdSeg]
doc_std_seg = documents_by_id.get(training_set[index_doc_stdSeg])

docSpacy_stdSegIndiv = nlp(doc_std_seg.get("plainText"))

for sentence in docSpacy_stdSegIndiv.sents:
    print(sentence.text)
    print("%"*50)

In [None]:
# Re-run of the training-set evaluation with the current `nlp` pipeline:
# per-document metrics are summed by name and then divided by the number of
# training documents.
metric_sums = {"accuracy": 0, "precision": 0, "recall": 0, "f1Score": 0}

for id_doc in training_set:
    doc_by_id = documents_by_id.get(id_doc)
    document_text = doc_by_id.get("plainText")

    # Sentences predicted by spaCy
    docSpacy = nlp(document_text)
    sentences = [str(obj) for obj in docSpacy.sents]

    # Annotated spans of this document
    spans_per_document = { "_id": id_doc, "spans": [] }
    spans_per_document["spans"] = [doc for doc in spans if doc.get("document") == id_doc]

    result = computeMetricsNewApproach(copy.deepcopy(spans_per_document["spans"]), copy.copy(sentences), copy.deepcopy(document_text))

    # Dict keys iterate in insertion order, which matches the result tuple
    for metric_name, value in zip(metric_sums, result):
        metric_sums[metric_name] += value

n_training_docs = len(training_set)
accuracy = metric_sums["accuracy"] / n_training_docs
precision = metric_sums["precision"] / n_training_docs
recall = metric_sums["recall"] / n_training_docs
f1Score = metric_sums["f1Score"] / n_training_docs

print(f'accuracy {accuracy}, precision {precision}, recall {recall}, f1Score {f1Score}')
#accuracy 0.45684867111110933, precision 0.5956175404449385, recall 0.6583961801916716, f1Score 0.6246184746083606
#### With interval of 5
#accuracy 0.6967265369532635, precision 0.7822844028944752, recall 0.8626031173254878, f1Score 0.8193964537620934
#### With interval of 6
#accuracy 0.740566313643408, precision 0.8211516129093976, recall 0.883276911756848, f1Score 0.8497883236613925

### 2.3 Law-specific sentence segmenter

#### Individual analysis

In [None]:
docLawSpec_0 = documents_by_id.get(training_set[0]).get("plainText")
print(docLawSpec_0)

In [None]:
sentLegalSegmenter = [sent for sent in sbd_utils.text2sentences(docLawSpec_0)]

In [None]:
print(len(sentLegalSegmenter))

In [None]:
for sent in sentLegalSegmenter: print(sent)

In [None]:
spans_per_document = { "_id": training_set[0], "spans": [] }

spans_per_document["spans"] = [doc for doc in spans if doc.get("document") == training_set[0]]

len(spans_per_document["spans"])

In [None]:
def computeMetricsLegalSpecific(annotations, predictions, document, interval=3):
    """Score predicted sentence starts against annotated span starts.

    Each predicted sentence is located in `document`; the prediction counts as
    a true positive when its start offset lies within `interval` characters of
    the 'start' of some annotation, otherwise as a false positive. Annotations
    that no prediction matched are false negatives.

    Args:
        annotations: sequence of dicts with an integer 'start' entry.
        predictions: sequence of sentence strings, expected in document order.
        document: the full text the sentences were taken from.
        interval: tolerance, in characters, around an annotated start.

    Returns:
        Tuple (accuracy, precision, recall, f1Score). A metric whose
        denominator is zero is reported as 0.0 instead of raising.

    Note:
        True positives are counted per prediction, false negatives per
        annotation, so two predictions that hit the same annotation both count
        as true positives. No true negatives are ever counted.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios (nothing predicted / nothing annotated) become 0.0
        return numerator / denominator if denominator else 0.0

    truePositives = 0
    trueNegatives = 0
    falsePositives = 0
    matchedAnnotations = set()
    searchFrom = 0  # offset just past the previously located sentence

    for p in predictions:
        # Search forward from the previous sentence so that a repeated sentence
        # is not mapped back onto its first occurrence in the document.
        indexPred = document.find(p, searchFrom)
        if indexPred == -1:
            # Out-of-order prediction: fall back to a whole-document search
            indexPred = document.find(p)
        if indexPred == -1:
            # Text that does not occur in the document cannot match a boundary
            falsePositives += 1
            continue
        searchFrom = max(searchFrom, indexPred + len(p))

        for i, a in enumerate(annotations):
            if a["start"] - interval <= indexPred <= a["start"] + interval:
                truePositives += 1
                matchedAnnotations.add(i)
                break
        else:
            falsePositives += 1

    falseNegatives = len(annotations) - len(matchedAnnotations)

    accuracy = _ratio(truePositives + trueNegatives,
                      truePositives + falsePositives + falseNegatives + trueNegatives)
    precision = _ratio(truePositives, truePositives + falsePositives)
    recall = _ratio(truePositives, truePositives + falseNegatives)
    f1Score = _ratio(2 * (recall * precision), recall + precision)

    return accuracy, precision, recall, f1Score

In [None]:
accuracy, precision, recall, f1Score = computeMetricsLegalSpecific(copy.deepcopy(spans_per_document["spans"]), copy.copy(sentLegalSegmenter), docLawSpec_0)

print(f'accuracy {accuracy}, precision {precision}, recall {recall}, f1Score {f1Score}')
"""
TruePositives 109 - TrueNegatives 0 - falsePositives 15 - falseNegatives 3
accuracy 0.8582677165354331, precision 0.8790322580645161, recall 0.9732142857142857, f1Score 0.923728813559322
"""

#### Collective analysis

In [None]:
# Average the metrics of the law-specific segmenter over the training set.
accuracy = precision = recall = f1Score = 0

for id_doc in training_set:
    doc_by_id = documents_by_id.get(id_doc)
    document_text = doc_by_id.get("plainText")

    # Sentences produced by the law-specific segmenter
    sentences = [sent for sent in sbd_utils.text2sentences(document_text)]

    # Annotated spans of this document
    spans_per_document = {
        "_id": id_doc,
        "spans": [doc for doc in spans if doc.get("document") == id_doc],
    }

    result = computeMetricsLegalSpecific(
        copy.deepcopy(spans_per_document["spans"]),
        copy.copy(sentences),
        document_text,
    )
    doc_accuracy, doc_precision, doc_recall, doc_f1 = result

    accuracy = accuracy + doc_accuracy
    precision = precision + doc_precision
    recall = recall + doc_recall
    f1Score = f1Score + doc_f1

n_training_docs = len(training_set)
accuracy = accuracy / n_training_docs
precision = precision / n_training_docs
recall = recall / n_training_docs
f1Score = f1Score / n_training_docs


print(f'accuracy {accuracy}, precision {precision}, recall {recall}, f1Score {f1Score}')
#accuracy 0.8138229150722465, precision 0.8324095133044638, recall 0.9716983081350014, f1Score 0.8931354492101693

# Phase 3 - Preprocessing

In [None]:
name_files_unlabeled = os.listdir(os.path.join(project_config.UNLABELED_DIR))
print(f'Num files {len(name_files_unlabeled)} in {os.path.join(project_config.UNLABELED_DIR)} directory')

### 3.1 Splitting unlabeled data

In [None]:
""" # Change to file
def getSententesWithSegmenterPerFile(filename):
    print(filename, end = '')
    path_file = os.path.join(project_config.UNLABELED_DIR, filename)
    #print(f'{mp.current_process()} - {filename} - {path_file}\n')    
    try:         
        raw = open(path_file, 'rb').read()
        enc = chardet.detect(raw)['encoding']
        #with codecs.open(path_file, mode='r', encoding=enc) as f:
        with open(path_file, encoding='latin-1') as f:
            return len(sbd_utils.text2sentences(f.read()))
    except Error as e:
        print(e)
        return 0
    
    return 0
"""

In [None]:
# Count the sentences of every unlabeled file in parallel.
list_files_x_names = os.listdir(os.path.join(project_config.UNLABELED_DIR))
print(f'list_files_x_names length = {len(list_files_x_names)}')

# The context manager shuts the worker processes down once `map` has returned,
# so re-running the cell does not pile up idle workers.
with mp.Pool(mp.cpu_count()) as pool:
    num_sentences_x_file = pool.map(sentences_custom.getLengthSententesWithSegmenterPerFile, list_files_x_names)
print("")
print("The sum of num_sentences_x_file is", sum(num_sentences_x_file))


In [None]:
# Segment every unlabeled file in parallel (the directory is listed only once).
list_files_x_names = os.listdir(os.path.join(project_config.UNLABELED_DIR))
print(f'list_files_x_names length = {len(list_files_x_names)}')

# The context manager shuts the worker processes down once `map` has returned
with mp.Pool(mp.cpu_count()) as pool:
    list_files_name_num_sents = pool.map(sentences_custom.getSententesWithSegmenterPerFile, list_files_x_names)

In [None]:
list_files_name_num_sents

In [None]:
name_files_unlabeled

In [None]:
# Free the large intermediates of section 3.1.
# `list_files_x_names` and `num_sentences_x_file` are deliberately kept: the
# next cells write them to disk, and deleting them here makes a top-to-bottom
# run fail with a NameError.

del name_files_unlabeled
del list_files_name_num_sents

gc.collect()

##### Storing "list_files_x_names" and  "num_sentences_x_file"

In [None]:
# Persist the file names as a single "name = <repr>" line.
# The text is built inline rather than stored in a variable called `str`,
# which would shadow the builtin and break every later `str(...)` call.
with open("ListFiles.txt", "w") as names_file:
    names_file.write("list_files_x_names = " + repr(list_files_x_names) + "\n")

In [None]:
# Persist the per-file sentence counts as a single "name = <repr>" line.
# The text is built inline rather than stored in a variable called `str`,
# which would shadow the builtin and break every later `str(...)` call.
with open("ListNumSentencesXfile.txt", "w") as counts_file:
    counts_file.write("num_sentences_x_file = " + repr(num_sentences_x_file) + "\n")

##### Reading "list_files_x_names" and  "num_sentences_x_file"

In [None]:
import ast

# The file holds one line of the form "list_files_x_names = [<repr of the list>]".
with open('ListFiles.txt', 'r') as f:
    stored_assignment = f.read()

# Parse the right-hand side as a Python literal. This returns the names exactly
# as they were stored: no leftover "\n'" on the last entry (an artefact of
# calling str() on bytes) and no breakage for names that contain ',' or '='.
list_files_x_names = ast.literal_eval(stored_assignment.split("=", 1)[1].strip())

In [None]:
import ast

# The file holds one line of the form "num_sentences_x_file = [<repr of the list>]".
with open('ListNumSentencesXfile.txt', 'r') as f:
    stored_assignment = f.read()

# Parse the right-hand side as a Python literal instead of scraping digits out
# of str(bytes); this also avoids relying on the builtin `str`, which other
# cells of this notebook rebind.
num_sentences_x_file = ast.literal_eval(stored_assignment.split("=", 1)[1].strip())

### 3.2 Sentence-wise preprocessing

Credit for the code goes to M. Grabmair and his LDSI_W21_Classifier_Workshop. A small change has been applied to the tokenize function.

In [None]:
def tokenize(txt, verbose=False):
    """Split a text into lower-cased alphanumeric tokens.

    Underscores are removed first, the text is split on whitespace, every
    non-word character is stripped from each piece and empty pieces are
    dropped.

    Args:
        txt: the text to tokenize.
        verbose: when True, print the raw whitespace-split pieces (debugging
            aid; off by default so that bulk tokenization does not flood the
            notebook output).

    Returns:
        List of non-empty, lower-cased tokens.
    """
    txt = txt.replace("_", "")
    dirty_tokens = re.split(r'\s+', txt)  # split words (raw string: no invalid-escape warning)
    if verbose:
        print(f'dirty_tokens {dirty_tokens}')
    # remove all non-alphanumerics, then drop the tokens that became empty
    clean_tokens = (re.sub(r'\W', '', t).lower() for t in dirty_tokens)
    return [t for t in clean_tokens if t]

def tokenize_spans(spans):
    """Add a 'tokens_manual' entry (tokens of 'txt') to every span dict, in place."""
    for s in spans:
        s['tokens_manual'] = tokenize(s['txt'])

def build_vocabulary(spans):
    """Count how often each token occurs across the 'txt' of all spans.

    Returns:
        Dict mapping token to its number of occurrences.
    """
    vocab_counts = {}
    for sd in spans:
        for t in tokenize(sd['txt']):
            vocab_counts[t] = vocab_counts.get(t, 0) + 1
    return vocab_counts

In [None]:
index_doc_in_json_list = 0
id_doc_tokenizer_example = training_set[index_doc_in_json_list]
doc_tokenizer_example = documents_by_id.get(training_set[index_doc_in_json_list])

spans_per_document_tokenizer = { "_id": id_doc_tokenizer_example, "spans": [] }

spans_per_document_tokenizer["spans"] = [doc for doc in spans if doc.get("document") == id_doc_tokenizer_example]

In [None]:
id_span_in_document = 11
spans_per_document_tokenizer["spans"][id_span_in_document]["txt"]
#len(spans_per_document_tokenizer["spans"])-3

In [None]:
tokens = tokenize(spans_per_document_tokenizer["spans"][id_span_in_document]["txt"]) #Tokenize with empty strings

In [None]:
tokens

In [None]:
indexSpan = 10
auxSpan = spans_per_document_tokenizer["spans"][indexSpan]["txt"]
print(auxSpan)

In [None]:
# Segment every unlabeled file in parallel.
list_files_x_names = os.listdir(os.path.join(project_config.UNLABELED_DIR))
print(f'list_files_x_names length = {len(list_files_x_names)}')

# The context manager shuts the worker processes down once `map` has returned
with mp.Pool(mp.cpu_count()) as pool:
    list_files_name_num_sents = pool.map(sentences_custom.getSententesWithSegmenterPerFile, list_files_x_names)

#### Write list of dictionaries into file

In [None]:
# Persist the per-file segmentation results as the repr of the list.
# `with` guarantees the file is flushed and closed even if the write fails.
str_list_files_x_names = repr(list_files_name_num_sents)
with open("FileListOfDictNameNumSentences.txt", "w") as file:
    file.write(str_list_files_x_names + "\n")

#### Read list of dictionaries from file

In [None]:
"""
import ast

with open('FileListOfDictNameNumSentences.txt', 'rb') as f:
    list_files_x_names = ast.literal_eval(f.read())
    f.close()
"""

"""
name_files_unlabeled = os.listdir(os.path.join(project_config.UNLABELED_DIR))
print(f'Num files {len(name_files_unlabeled)} in {os.path.join(project_config.UNLABELED_DIR)} directory')
"""

### Normal execution

In [None]:

# Tokenize every segmented file in parallel; the context manager shuts the
# worker processes down once `map` has returned.
with mp.Pool(mp.cpu_count()) as pool:
    num_tokens_x_file = pool.map(sentences_custom.getNumTokensPerFile, list_files_name_num_sents)


In [None]:
len(num_tokens_x_file)

In [None]:
# Clean every variable 

del num_tokens_x_file
del pool
del list_files_name_num_sents

gc.collect()

In [None]:
"""
import glob

files = glob.glob(os.path.join(project_config.OUTPUT_DIR, "*"))
for f in files:
    os.remove(f)
"""

#### Re-run after a failure caused by a special character in a file

In [None]:
name_files_unlabeled = os.listdir(os.path.join(project_config.UNLABELED_DIR))
list_files_x_names_tokens = os.listdir(os.path.join(project_config.OUTPUT_DIR))

In [None]:
len(list_files_x_names_tokens)

In [None]:
list_missing_files = [file for file in name_files_unlabeled if ("o_"+file) not in list_files_x_names_tokens]

In [None]:
len(list_missing_files)

In [None]:
# Get info from the files that were not processed yet (name, length, sentences)
print(f'list_missing_files length = {len(list_missing_files)}')

# Context manager: the workers are released once the map has finished.
with mp.Pool(mp.cpu_count()) as pool:
    list_files_name_num_sents = pool.map(sentences_custom.getSententesWithSegmenterPerFile, list_missing_files)

In [None]:
len(list_files_name_num_sents)

In [None]:
# Get tokens and produce the output files for the previously missing documents.
# The pool is closed automatically when the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    num_tokens_x_file = pool.map(sentences_custom.getNumTokensPerFile, list_files_name_num_sents)

In [None]:
num_tokens_x_file

In [None]:
del list_files_x_names_tokens
del list_missing_files
del pool
del list_files_name_num_sents
del num_tokens_x_file

gc.collect()

### Merge files

In [None]:
# Concatenate every per-document token file into the single fastText corpus.
filename_tokens = os.path.join(project_config.FILENAME_OUTPUT_TOKENS)

# os.listdir returns entries in arbitrary order; sorting makes the merged
# corpus identical from one run to the next.
list_files = sorted(os.listdir(os.path.join(project_config.OUTPUT_DIR)))
print(len(list_files))

# Both handles are managed by `with`, so they are closed even if a read fails.
with open(filename_tokens, "w") as file_tokens:
    for filename in list_files:
        with open(os.path.join(project_config.OUTPUT_DIR, filename), 'r') as f:
            # Stream line by line instead of loading the whole file first
            file_tokens.writelines(f)

del filename_tokens
del file_tokens
del list_files
gc.collect()


#### Count number of words

In [None]:
# Count whitespace-separated words in the merged token corpus.
filename_tokens = os.path.join(project_config.FILENAME_OUTPUT_TOKENS)

# The original never closed this handle; `with` releases it right after reading.
with open(filename_tokens, "rt") as file:
    data = file.read()
words = data.split()

print('Number of words in text file :', len(words))

del filename_tokens
del file
del data
del words

gc.collect()

### Generate histogram tokens per file

In [None]:
# Get sentences from files. As a result we have a list of dictionaries with keys: "name", "num_sentences", "sentences"
list_files_x_names = os.listdir(os.path.join(project_config.UNLABELED_DIR))
print(f'list_files_x_names length = {len(list_files_x_names)}')

# The directory is listed once (the original listed it twice) and the pool is
# closed automatically when the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    list_files_name_num_sents = pool.map(sentences_custom.getSententesWithSegmenterPerFile, list_files_x_names)

In [None]:
len(list_files_name_num_sents)

In [None]:
# Get num tokens per file (one value per document, used for the histogram below).
# The pool is closed automatically when the block exits.
with mp.Pool(mp.cpu_count()) as pool:
    num_tokens_x_file = pool.map(sentences_custom.getListNumTokensPerFile, list_files_name_num_sents)

In [None]:
len(num_tokens_x_file)

In [None]:
sum(num_tokens_x_file)

In [None]:
# Report the largest and smallest documents by token count and the files they come from.
max_tokens = max(num_tokens_x_file)
index_max = num_tokens_x_file.index(max_tokens)
print(f'The document with most tokens has {max_tokens}. ', end="")
print(f'It is located in the file {list_files_name_num_sents[index_max].get("name")}')

min_tokens = min(num_tokens_x_file)
index_min = num_tokens_x_file.index(min_tokens)
print(f'The document with least tokens has {min_tokens}. ', end="")
print(f'It is located in the file {list_files_name_num_sents[index_min].get("name")}')

In [None]:
plt.hist(num_tokens_x_file, bins=1000)
plt.show()

In [None]:
list_files_name_num_sents[0].keys()

In [None]:
# Write num_tokens_x_file
"""
file = open("FileNumTokensXFile.txt", "w")
 
#convert variable to string
str_num_tokens_x_file = repr(num_tokens_x_file)
file.write(str_num_tokens_x_file + "\n")
 
#close file
file.close()
"""
# Write name files of num_tokens_x_file
"""
file = open("FileListFiles_x_numTokensXFile.txt", "w")
 
#convert variable to string
str_list_namefiles_tokens = repr(list_files_x_names)
file.write(str_list_namefiles_tokens + "\n")
 
#close file
file.close()

del file
del str_list_namefiles_tokens
del str_num_tokens_x_file
"""

In [None]:
# Clean every variable 

del num_tokens_x_file
del pool
del list_files_name_num_sents

gc.collect()

# Phase 4 - Developing word embeddings

### 4.1 Custom FastText Embeddings

The following line was executed in the command line:

> ./fasttext skipgram -input output_tokens.txt  -output model_ldsi_skipgram -verbose 2 -minCount 20 -dim 100 -epoch 10 -thread 4

### 4.2 Evaluating Custom Embeddings Manually

In [None]:
from pathlib import Path

path_current_file = os.path.abspath('')
print(f'path_current_file: {path_current_file}')
pathModel = os.path.join(path_current_file, project_config.PATH_MODEL_FASTTEXT)
print(f'pathModel: {pathModel}')
modelFastText = fasttext.load_model(pathModel)#, model='skipgram'

In [None]:
words = modelFastText.get_words(on_unicode_error='ignore')#replace

In [None]:
len(words)

In [None]:
#words[20100:20200]
modelFastText.get_word_vector("the")

In [None]:
# Probe words whose nearest neighbours in the custom embedding space are inspected manually.
list_words = [
    "veteran", "v.", "argues", "ptsd", "granted", "korea",
    "holding", "also", "diagnosed", "injure", "surgery", "spine",
    "accident", "disorder", "posttraumatic", "depression", "anxiety",
]

for word_to_search in list_words:
    # Header line: the query word followed by a row of asterisks
    print(word_to_search, "*"*60)
    list_nn_of_word = modelFastText.get_nearest_neighbors(word_to_search)
    # One indented line per neighbour returned by the model
    for nn in list_nn_of_word:
        print("\t", nn)

In [None]:
# Drop the path helpers only. modelFastText is deliberately kept alive: the
# word-embedding featurization in Phase 5.2 and the dev/test feature cells call
# getVectorFeature(..., modelFastText), so deleting it here makes those cells
# fail with NameError on a fresh "Restart & Run All".
del pathModel
del path_current_file

gc.collect()

# Phase 5 - Training Classifiers

Abridged from the [scikit-learn example code](https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html); credit also goes to M. Grabmair and his LDSI_W21_Classifier_Workshop.

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          title=None,
                          cmap=plt.cm.Blues):
    """Draw an annotated confusion-matrix heat map and return its Axes.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, one entry per sample.
    classes : list
        Labels to show, in display order. They are also passed to
        ``confusion_matrix`` so that row/column ``i`` of the matrix always
        corresponds to ``classes[i]``.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    matplotlib Axes holding the plot.
    """
    # Without `labels`, the matrix is built from the labels that happen to occur
    # in y_true/y_pred; a class missing from the evaluated data would then make
    # the matrix smaller than `classes` and shift every tick label.
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    fig, ax = plt.subplots(figsize=(8, 8))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Cells above half of the maximum count get white text so the number stays readable
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

Credit for code goes to [buhrmann.github.io](https://buhrmann.github.io/tfidf-analysis.html) and to M. Grabmair and his LDSI_W21_Classifier_Workshop

In [None]:
def top_tfidf_features(row, features, top_n=15):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

        row      : 1-D array of tfidf values, one per feature.
        features : sequence of feature names, indexable by position.
        top_n    : number of (feature, value) pairs to keep.

        Returns a DataFrame with columns 'feature' and 'tfidf', sorted by
        descending tfidf. An empty selection (top_n=0 or an empty row) yields an
        empty DataFrame with the same two columns instead of raising. '''
    # argsort is ascending, so reverse it before keeping the first top_n positions
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns at construction time also works when top_feats is empty
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])


def top_features_in_doc(Xtr, features, row_id, top_n=15):
    ''' Top tfidf features of one document, i.e. of row `row_id` of the matrix `Xtr`. '''
    selected = Xtr[row_id]
    # Anything that is not a plain ndarray is densified through its toarray() method
    dense = selected if type(selected) is np.ndarray else selected.toarray()
    return top_tfidf_features(np.squeeze(dense), features, top_n)


def top_mean_features(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

        Xtr       : tfidf matrix (ndarray, or an object exposing toarray()).
        features  : sequence of feature names, one per column.
        grp_ids   : row selector (list, index array or np.where tuple); None or an
                    empty selector means "use every row".
        min_tfidf : values below this threshold count as 0 in the average.
        top_n     : number of features to return.

        The input matrix is never modified. '''
    # len() instead of truthiness: `if grp_ids:` raises for index arrays with
    # more than one element.
    has_group = grp_ids is not None and len(grp_ids) > 0
    D = Xtr[grp_ids] if has_group else Xtr
    if type(D) is not np.ndarray:
        D = D.toarray()
    # np.where builds a new array; the original in-place assignment zeroed
    # entries of the caller's matrix whenever no group was selected.
    D = np.where(D < min_tfidf, 0, D)
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_features(tfidf_means, features, top_n)


def top_features_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a dict mapping each class label to a DataFrame that holds the top_n
        features and their mean tfidf value, calculated across the documents
        carrying that label.

        Xtr      : tfidf matrix with one row per document.
        y        : class label of each row (any array-like).
        features : sequence of feature names, one per column. '''
    # With a plain list, `y == label` is a single boolean and every class would
    # silently end up with an empty group; an ndarray compares element-wise.
    y = np.asarray(y)
    dfs = {}
    for label in np.unique(y):
        ids = np.where(y == label)
        feats_df = top_mean_features(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Stored as a plain attribute on the frame, not as a column
        feats_df.label = label
        dfs[label] = feats_df
    return dfs


def span_top_tfidf(spans_txt, spans_tfidf, features, index):
    ''' Print the text of span `index` followed by its top tfidf features. '''
    header = 'span text:\n'+spans_txt[index]+'\n'
    top_feats = top_features_in_doc(spans_tfidf, features, index)
    print(header)
    print(top_feats)

# The helpers above are explained in more detail on the GitHub page linked above.

In [None]:
def prediction_errors(clf, eval_spans, vectorizer, 
                      select_true_label=None, 
                      select_pred_label=None):
    """Print every span that `clf` misclassifies.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    eval_spans : list of span dicts; the keys 'type', 'txt', 'document' and
        'start' are read here.
    vectorizer : despite its name, this is handed straight to
        ``make_feature_vectors_and_labels`` as the feature matrix of
        ``eval_spans``, so row ``i`` must belong to ``eval_spans[i]``.
    select_true_label : if given, only errors whose true label equals it are shown.
    select_pred_label : if given, only errors whose predicted label equals it are shown.
    """
    # Only the feature matrix is needed; the true labels are read from the spans below
    eval_X, _ = make_feature_vectors_and_labels(eval_spans, vectorizer)
    pred_y = clf.predict(eval_X)
    for i, (span, pred_label) in enumerate(zip(eval_spans, pred_y)):
        true_label = span['type']
        if true_label == pred_label:
            continue
        if select_true_label and true_label != select_true_label: continue
        if select_pred_label and pred_label != select_pred_label: continue
        doc_name = documents_by_id[span['document']]['name']
        print('sentence # '+str(i)+' / case '+doc_name+' / @'+str(span['start']))
        print('pred: '+pred_label+' / true: '+true_label)
        print(span['txt'])
        print()

### 5.1 TFIDF featurization

Credit for code goes to M. Grabmair and his LDSI_W21_Classifier_Workshop

##### Creation of spans (training, dev & test)

In [None]:
# Membership tests against a set are O(1); testing against the id lists directly
# rescans the list for every span.
training_ids = set(training_set)
dev_ids = set(dev_set)
test_ids = set(test_set)

train_spans = [span for span in spans if span["document"] in training_ids]
dev_spans = [span for span in spans if span["document"] in dev_ids]
test_spans = [span for span in spans if span["document"] in test_ids]

print(f'Size of train_span is {len(train_spans)}')
print(f'Size of dev_spans is {len(dev_spans)}')
print(f'Size of test_spans is {len(test_spans)}')

##### Creation of spans_text (training, dev & test)

In [None]:
train_spans_txt = [span.get("txt") for span in train_spans]
dev_spans_txt = [span.get("txt") for span in dev_spans]
test_spans_txt = [span.get("txt") for span in test_spans]

print(f'Size of train_spans_txt is {len(train_spans_txt)}')
print(f'Size of dev_spans_txt is {len(dev_spans_txt)}')
print(f'Size of test_spans_txt is {len(test_spans_txt)}')

##### TFIDF vectorization (sklearn)

In [None]:
"""
vectorizer = TfidfVectorizer(min_df=5)
vectorizer = vectorizer.fit(train_spans_txt) #It's important to do it on the training data
tfidf_features_skl = vectorizer.get_feature_names_out()#vectorizer.get_feature_names()
"""
spacy_tfidf_vectorizer = TfidfVectorizer(tokenizer=sentences_custom.get_tokens_spacy,
                                         min_df=5,
                                         ngram_range=(1,1))
spacy_tfidf_vectorizer = spacy_tfidf_vectorizer.fit(train_spans_txt)
tfidf_features_skl = spacy_tfidf_vectorizer.get_feature_names_out()

In [None]:
#len(tfidf_features_skl)
tfidf_features_skl[1000:1200]

In [None]:
train_tfidf_skl = spacy_tfidf_vectorizer.transform(train_spans_txt).toarray()
dev_tfidf_skl = spacy_tfidf_vectorizer.transform(dev_spans_txt).toarray()
test_tfidf_skl = spacy_tfidf_vectorizer.transform(test_spans_txt).toarray()

train_spans_labels = np.array([s['type'] for s in train_spans])
dev_spans_labels = np.array([s['type'] for s in dev_spans])
test_spans_labels = np.array([s['type'] for s in test_spans])

##### Examine the top TFIDF values of tokens in a sentence (training_set)

In [None]:
# random.randint includes its upper bound, so randint(0, len(x)) can return
# len(x) and raise IndexError; randrange(len(x)) stays within valid indices.
span_top_tfidf(train_spans_txt,
               train_tfidf_skl,
               tfidf_features_skl,
               random.randrange(len(train_spans)))

##### Examine the top TFIDF values of tokens in a sentence (dev_set)

In [None]:
# random.randint includes its upper bound, so randint(0, len(x)) can return
# len(x) and raise IndexError; randrange(len(x)) stays within valid indices.
span_top_tfidf(dev_spans_txt,
               dev_tfidf_skl,
               tfidf_features_skl,
               random.randrange(len(dev_spans)))

##### Examine the top TFIDF values of tokens in a sentence (test_set)

In [None]:
# random.randint includes its upper bound, so randint(0, len(x)) can return
# len(x) and raise IndexError; randrange(len(x)) stays within valid indices.
span_top_tfidf(test_spans_txt,
               test_tfidf_skl,
               tfidf_features_skl,
               random.randrange(len(test_spans)))

##### Examine features with highest average TFIDF score per class

In [None]:
dfs = top_features_by_class(train_tfidf_skl, 
                            train_spans_labels,
                            tfidf_features_skl)

In [None]:
dfs.keys()

In [None]:
for classSpan in dfs.keys():
    print('{0}\n==>{1}'.format('*'*50, classSpan))
    print(dfs[classSpan])

In [None]:
dfs["CaseIssue"]

### 5.2 Word Embedding Featurization

###### Preparation: compute the mean and standard deviation of the number of tokens per span

In [None]:
def debugLog(message):
    """Print `message` prefixed with the current wall-clock time (HH:MM:SS)."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(timestamp, message)

def getTxtFromSpan(span):
    """Return the lower-cased text stored under the span's "txt" key."""
    text = span["txt"]
    return text.lower()

def getMeanAndStdDevTokensTraining(spans):
    """Tokenize `spans` and z-score their token counts.

    The statistics are computed from whatever `spans` is passed in, so despite
    the name this is not tied to the training split.

    Side effect: every span dict receives a "tokens_normalized" entry holding
    its normalized token count.

    Parameters
    ----------
    spans : list of span dicts with a "txt" key.

    Returns
    -------
    mean : mean number of tokens per span.
    std_dev : standard deviation of the number of tokens per span.
    listNumTokensNorm : float32 array, (count - mean) / std_dev per span; all
        zeros when std_dev is 0 (every span has the same length).
    listTokens : list with the tokens of each span.
    """
    debugLog("BEGIN getMeanAndStdDevTokensTraining")

    debugLog("To execute map getTxtFromSpan")
    # Lower-casing is cheap, and a function defined in the notebook cannot be
    # shipped to worker processes on spawn-based platforms, so do it in-process.
    spans_txt = [getTxtFromSpan(span) for span in spans]

    debugLog("To execute pipe to get SpacyDocuments")
    #spacy.prefer_gpu()
    nlp = sentences_custom.getNlpObj()
    spacy_docs_from_spans = list(nlp.pipe(spans_txt))

    print(f'Length of spacy_docs_from_spans is {len(spacy_docs_from_spans)}')

    debugLog("To execute map get_tokens_spacy_opt")
    # The context manager releases the worker processes once the map has returned
    with mp.Pool(mp.cpu_count()) as pool:
        listTokens = pool.map(sentences_custom.get_tokens_spacy, spacy_docs_from_spans)

    debugLog("Converting npListNumTokens, computing mean, std_dev")
    npListNumTokens = np.asarray([len(t) for t in listTokens], dtype='float32')

    mean = npListNumTokens.mean()
    std_dev = npListNumTokens.std()

    # Guard the division: identical token counts give std_dev == 0, which would
    # otherwise fill the feature with NaN.
    if std_dev > 0:
        listNumTokensNorm = (npListNumTokens-mean)/std_dev
    else:
        listNumTokensNorm = np.zeros_like(npListNumTokens)

    debugLog("Adding tokens_normalized to each span")
    for span, tokens_norm in zip(spans, listNumTokensNorm):
        span["tokens_normalized"] = tokens_norm

    debugLog("END getMeanAndStdDevTokensTraining")
    return mean, std_dev, listNumTokensNorm, listTokens

In [None]:
mean, std_dev, listNumTokensNorm, listTokens = getMeanAndStdDevTokensTraining(train_spans)

In [None]:
print("mean: ", mean)
print("std_dev: ", std_dev)
print("Lista num tokens norm: ", len(listNumTokensNorm))
print(listTokens[0])
print(len(listTokens))

In [None]:
train_spans[0]

In [None]:
#len(words)

In [None]:
def getVectorFeature(tokens, modelFastText):
    """Return the element-wise mean (float32) of the word vectors of `tokens`.

    `modelFastText` only needs a get_word_vector(token) method. An empty token
    list is replaced by a single empty-string token so the mean is still defined.
    """
    words_to_embed = [""] if len(tokens) == 0 else tokens
    stacked = np.asarray(
        [modelFastText.get_word_vector(word) for word in words_to_embed],
        dtype='float32',
    )
    return stacked.mean(axis=0)

In [None]:
vectorized_train_span = [getVectorFeature(t, modelFastText) for t in listTokens]

In [None]:
# Embedding features for the training spans:
# [mean word vector | normalized start | normalized token count]
vector_np = np.asarray(vectorized_train_span, dtype='float32')
starts_normalized = np.array([s['start_normalized'] for s in train_spans])
tokens_normalized = np.array([s['tokens_normalized'] for s in train_spans])
train_y = np.array([s['type'] for s in train_spans])

# Append the two scalar features as extra columns in a single concatenation
train_X = np.concatenate(
    (vector_np,
     starts_normalized[:, np.newaxis],
     tokens_normalized[:, np.newaxis]),
    axis=1,
)

train_X.shape

In [None]:
def get_x_y(vectorized_span, spans, tokens_normalized=None):
    """Assemble the embedding feature matrix and the label vector for `spans`.

    Columns of the returned matrix: the span vectors from `vectorized_span`,
    then each span's 'start_normalized', then its normalized token count
    (taken from `tokens_normalized`, or from each span's 'tokens_normalized'
    entry when that argument is None). Labels come from each span's 'type'.
    """
    labels = np.array([s['type'] for s in spans])
    embeddings = np.asarray(vectorized_span, dtype='float32')
    start_col = np.array([s['start_normalized'] for s in spans])

    if tokens_normalized is None:
        tokens_normalized = np.array([s['tokens_normalized'] for s in spans])
    token_col = np.asarray(tokens_normalized)

    # Both scalar features become trailing columns
    features = np.concatenate(
        (embeddings, start_col[:, np.newaxis], token_col[:, np.newaxis]),
        axis=1,
    )
    return features, labels

In [None]:
# Tokenize the dev spans and z-score their token counts.
# NOTE(review): the helper computes mean/std from the spans it is given, so the
# dev token counts are scaled with dev statistics rather than with the values
# obtained from train_spans — confirm this is intended.
mean_dev, std_dev_dev, listNumTokensNorm_dev, listTokens_dev = getMeanAndStdDevTokensTraining(dev_spans)

# One averaged fastText vector per dev span
vectorized_dev_span = [getVectorFeature(t, modelFastText) for t in listTokens_dev]

# Feature matrix (embedding + normalized start + normalized token count) and labels
dev_x, dev_y = get_x_y(vectorized_dev_span, dev_spans, listNumTokensNorm_dev)

### 5.3 Model Training

#### TFIDF

##### Predicting

In [None]:
def make_feature_vectors_and_labels(spans, tfidf): #vectorizer
    """Append each span's normalized start position to its TFIDF row.

    Parameters
    ----------
    spans : list of span dicts; 'start_normalized' and 'type' are read.
    tfidf : dense 2-D array with one row per span, in the same order as `spans`.

    Returns
    -------
    X : `tfidf` with 'start_normalized' appended as the last column.
    y : array with the 'type' label of every span.
    """
    # note: the matrix must already be un-sparsed to be able to manipulate it
    #>>tfidf = spacy_tfidf_vectorizer.transform([s['txt'] for s in spans]).toarray()
    starts_normalized = np.array([s['start_normalized'] for s in spans])
    # The first label used to read "starts_normalized.shape" although it prints tfidf.shape
    print("tfidf.shape = ", tfidf.shape)
    print("starts_normalized.shape = ", starts_normalized.shape)
    y = np.array([s['type'] for s in spans])
    X = np.concatenate((tfidf, np.expand_dims(starts_normalized, axis=1)), axis=1)
    return X, y

In [None]:
train_tfidf_skl = spacy_tfidf_vectorizer.transform(train_spans_txt).toarray()
dev_tfidf_skl = spacy_tfidf_vectorizer.transform(dev_spans_txt).toarray()
test_tfidf_skl = spacy_tfidf_vectorizer.transform(test_spans_txt).toarray()

train_spans_labels = np.array([s['type'] for s in train_spans])
dev_spans_labels = np.array([s['type'] for s in dev_spans])
test_spans_labels = np.array([s['type'] for s in test_spans])

In [None]:
#clf = tree.DecisionTreeClassifier(max_depth=50) #, splitter="random"
#clf = RandomForestClassifier(n_estimators=100, max_depth=100)
#clf = LinearSVC(dual=False, max_iter=1000, loss='squared_hinge', penalty='l2', C=50)
#clf = LinearSVC(dual=False, max_iter=1000)
#clf = LogisticRegression(max_iter=2000)
clf = SVC(kernel='poly', degree=3)#SVC(kernel='poly', gamma='auto', degree=4)#SVC(kernel='poly', gamma='scale', degree=3)

clf = clf.fit(train_X, train_y)

##### Assessing the prediction

In [None]:
prediction = clf.predict(train_X)
print('TRAIN:\n'+classification_report(train_y, prediction))

In [None]:
plot_confusion_matrix(train_y, prediction, classes=list(clf.classes_),
                      title='Confusion matrix for training data')
plt.show()

##### Assessing against dev set

In [None]:
dev_X, dev_y = make_feature_vectors_and_labels(dev_spans, dev_tfidf_skl)
print(dev_tfidf_skl.shape)

In [None]:
#dev_X.shape
dev_y.shape

In [None]:
prediction_dev = clf.predict(dev_X)
print('DEV:\n'+classification_report(dev_y, prediction_dev))

In [None]:
#plot_confusion_matrix(dev_y, prediction_dev, classes=list(clf.classes_),
#                      title='Confusion matrix for dev data')
#plt.show()

plot_confusion_matrix_sklearn(clf, dev_X, dev_y, xticks_rotation='vertical')  
plt.show()

In [None]:
from joblib import dump, load

#dump(clf, 'modelPolynomialSvmKernel3.joblib') 
clf2 = load('modelLinearSvcDualFalseMaxIt1K.joblib')

In [None]:
test_X, test_y = make_feature_vectors_and_labels(test_spans, test_tfidf_skl)

In [None]:
prediction_test = clf2.predict(test_X)
print('TEST:\n'+classification_report(test_y, prediction_test))

plot_confusion_matrix_sklearn(clf2, test_X, test_y, xticks_rotation='vertical')  
plt.show()

In [None]:
# Show the errors in random order. Spans and TFIDF rows are reordered with the
# SAME permutation: shuffling only the spans pairs every feature row with the
# label and text of a different sentence.
shuffled_idx = random.sample(range(len(test_spans)), len(test_spans))
prediction_errors(clf,
                  [test_spans[i] for i in shuffled_idx],
                  test_tfidf_skl[shuffled_idx],
                  select_pred_label='Evidence')

#### Word Embeddings

##### Predicting

In [None]:
#clf = tree.DecisionTreeClassifier(max_depth=150, splitter="best", criterion="gini")
#clf = RandomForestClassifier(n_estimators=143, max_depth=100)
#clf = LinearSVC(dual=False, max_iter=553, loss='squared_hinge', penalty='l2', C=1)
#clf = LogisticRegression(max_iter=744) 
clf = SVC(kernel='rbf', gamma='scale', C=4)
#clf = NuSVC(nu=3e-3)
clf = clf.fit(train_X, train_y)

##### Assessing the prediction

In [None]:
print('TRAIN:\n'+classification_report(train_y, clf.predict(train_X)))

In [None]:
plot_confusion_matrix(train_y, clf.predict(train_X), classes=list(clf.classes_),
                      title='Confusion matrix for training data')
plt.show()

In [None]:
prediction_emd_dev = clf.predict(dev_x)
print('DEV:\n'+classification_report(dev_y, prediction_emd_dev))

plot_confusion_matrix_sklearn(clf, dev_x, dev_y, xticks_rotation='vertical')  
plt.show()

In [None]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from scipy.stats import randint

import random

In [None]:
#clf_rndm = RandomForestClassifier()#random_state=0
clf_rndm = LogisticRegression() 
#clf_rndm = SVC()
#clf_rndm = LinearSVC(loss='squared_hinge', penalty='l2')
#clf_rndm = tree.DecisionTreeClassifier()
np.random.seed(0)

param_distributions = {"max_iter": randint(200, 1500)}
search = HalvingRandomSearchCV(clf_rndm, param_distributions,
                               random_state=0).fit(train_X, train_y)
search.best_params_  

In [None]:
from joblib import dump, load

In [None]:
dump(clf, 'modelEmbRadKerSVMScaleC4_v2.joblib') 
clf2 = load('modelEmbRadKerSVMScaleC4_v2.joblib')

In [None]:
# Tokenize the test spans and z-score their token counts.
# NOTE(review): the helper computes mean/std from the spans it is given, so the
# test token counts are scaled with test statistics rather than with the values
# obtained from train_spans — confirm this is intended.
mean_test, std_dev_test, listNumTokensNorm_test, listTokens_test = getMeanAndStdDevTokensTraining(test_spans)

# One averaged fastText vector per test span
vectorized_test_span = [getVectorFeature(t, modelFastText) for t in listTokens_test]

# Feature matrix (embedding + normalized start + normalized token count) and labels
test_x, test_y = get_x_y(vectorized_test_span, test_spans, listNumTokensNorm_test)

In [None]:
prediction_test = clf2.predict(test_x)
print('TEST:\n'+classification_report(test_y, prediction_test))

plot_confusion_matrix_sklearn(clf2, test_x, test_y, xticks_rotation='vertical')  
plt.show()

In [None]:
for i in prediction_test: print(i)

In [None]:
def prediction_errors_embeddings(eval_spans, predictions, 
                      select_true_label=None, 
                      select_pred_label=None):
    """Print every span whose predicted label differs from its true label.

    Parameters
    ----------
    eval_spans : list of span dicts; 'type', 'txt', 'document' and 'start' are read.
    predictions : predicted labels, one per span and in the same order.
    select_true_label : if given, only errors whose true label equals it are shown.
    select_pred_label : if given, only errors whose predicted label equals it are shown.

    Raises
    ------
    ValueError
        If `predictions` and `eval_spans` differ in length, which means the
        predictions were computed for a different set of spans.
    """
    if len(predictions) != len(eval_spans):
        raise ValueError(
            'predictions has ' + str(len(predictions)) + ' entries but eval_spans has '
            + str(len(eval_spans)) + '; they must describe the same spans')

    for i, (span, pred_label) in enumerate(zip(eval_spans, predictions)):
        true_label = span['type']
        if true_label == pred_label:
            continue
        if select_true_label and true_label != select_true_label: continue
        if select_pred_label and pred_label != select_pred_label: continue
        doc_name = documents_by_id[span['document']]['name']
        print('sentence # '+str(i)+' / case '+doc_name+' / @'+str(span['start']))
        print('pred: '+pred_label+' / true: '+true_label)
        print(span['txt'])
        print()

In [None]:
# prediction_test was produced from the test-set features (test_x), so it must be
# compared against test_spans; pairing it with dev_spans matches predictions to
# unrelated sentences.
prediction_errors_embeddings(
                  test_spans,
                  prediction_test)