
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Legal Classifiers

## Setup

In [2]:
%pip install -q tensorflow==2.7.0
%pip install -q tensorflow-addons

In [4]:
from johnsnowlabs import *

import json
import os

import numpy as np
import pandas as pd

print("Spark NLP Version :", sparknlp.version())

spark = start_spark()

Spark NLP Version : 4.2.1
📋 Loading license number 0 from /home/ubuntu/.johnsnowlabs/licenses/license_number_0_for_.json


22/10/20 12:47:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


👌 Launched [92mcpu-Optimized JVM[39m SparkSession with Jars for: 🚀Spark-NLP==4.2.1, 💊Spark-Healthcare==4.2.0, 🕶Spark-OCR==4.1.0, running on ⚡ PySpark==3.1.2


# Multilabel classifier training

## Loading the data

In [6]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Legal/data/finance_data.csv

In [7]:
df = pd.read_csv('./finance_data.csv')
df['label'] = df['label'].apply(eval)

In [8]:
data = spark.createDataFrame(df)

# If you have a single dataset, then split it or else you can load the test dataset the same way that you load the train data.
train, test = data.randomSplit([0.8, 0.2], seed = 123)

In [9]:
train.show(truncate=50)

22/10/20 12:47:23 WARN TaskSetManager: Stage 0 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
[Stage 0:>                                                          (0 + 1) / 1]

+--------------------------------------------------+-----------------------------------+
|                                         provision|                              label|
+--------------------------------------------------+-----------------------------------+
|(a) Consultant or Company may terminate this Pr...|                     [terminations]|
|(a) Effective as of the Effective Date, the Hol...|            [waivers, terminations]|
|(a) No failure or delay by the Administrative A...|              [waivers, amendments]|
|(a) No failure or delay by the Agent or any Len...|              [waivers, amendments]|
|(a) No failure or delay of the Administrative A...|              [waivers, amendments]|
|(a) The Credit Agreement is, effective as of th...|                       [amendments]|
|(a) The provisions of this Agreement shall be b...|              [assigns, successors]|
|(a) To induce the other parties hereto to enter...|      [representations, warranties]|
|(a)  The provisions 

                                                                                

In [10]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

22/10/20 12:47:24 WARN TaskSetManager: Stage 1 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.

+--------------------+-----+
|               label|count|
+--------------------+-----+
|    [governing laws]|  772|
|      [counterparts]|  573|
| [entire agreements]|  558|
|           [notices]|  530|
|      [severability]|  507|
|          [survival]|  343|
|[assigns, success...|  319|
|        [amendments]|  309|
|          [expenses]|  248|
|      [terminations]|  237|
|           [waivers]|  205|
|[waivers, amendme...|  195|
|[representations,...|  191|
|       [assignments]|  187|
|   [representations]|   94|
|[amendments, enti...|   54|
|        [successors]|   54|
|        [warranties]|   39|
|[amendments, term...|   25|
|[assignments, suc...|   11|
+--------------------+-----+
only showing top 20 rows



                                                                                

 ## With Universal Encoder

In [11]:
document_assembler = nlp.DocumentAssembler() \
      .setInputCol("provision") \
      .setOutputCol("document") \
      .setCleanupMode("shrink")

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

classsifierdl = nlp.MultiClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("label")\
      .setMaxEpochs(30)\
      .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
[ / ]

2022-10-20 12:47:41.741507: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [12]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

22/10/20 12:47:52 WARN TaskSetManager: Stage 4 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 12:47:58 WARN TaskSetManager: Stage 6 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 12:47:59 WARN TaskSetManager: Stage 7 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
2022-10-20 12:48:05.178535: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/474de67c81ab_classifier_dl8079699436630106693
2022-10-20 12:48:05.681779: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 12:48:05.681852: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/474de67c81ab_classifier_dl8079699436630106693
2022-10-20 12:48:11.254782: I external/org_tensorflow/tensorflow/cc/saved_mode

Training started - epochs: 30 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22002 - classes: 15
Epoch 1/30 - 7.85s - loss: 0.168192 - acc: 0.94994646 - batches: 344
Epoch 2/30 - 3.02s - loss: 0.07170924 - acc: 0.9776423 - batches: 344
Epoch 3/30 - 2.98s - loss: 0.06072507 - acc: 0.98128265 - batches: 344
Epoch 4/30 - 3.00s - loss: 0.055653386 - acc: 0.9830967 - batches: 344
Epoch 5/30 - 3.09s - loss: 0.052645776 - acc: 0.984132 - batches: 344
Epoch 6/30 - 2.97s - loss: 0.05059618 - acc: 0.9848573 - batches: 344
Epoch 7/30 - 2.98s - loss: 0.049065854 - acc: 0.9853485 - batches: 344
Epoch 8/30 - 3.01s - loss: 0.04785178 - acc: 0.9858103 - batches: 344
Epoch 9/30 - 2.99s - loss: 0.04684918 - acc: 0.98614776 - batches: 344
Epoch 10/30 - 3.01s - loss: 0.04599774 - acc: 0.9864211 - batches: 344
Epoch 11/30 - 3.01s - loss: 0.04525937 - acc: 0.98659724 - batches: 344
Epoch 12/30 - 3.05s - loss: 0.044609535 - acc: 0.98679745 - batches: 344
Epoch 13/30 - 3.03s - loss: 0.044030655 

In [13]:
preds = clf_pipelineModel.transform(test)

In [14]:
preds_df = preds.select('label','provision',"class.result").toPandas()
preds_df.head()

22/10/20 12:50:16 WARN TaskSetManager: Stage 8 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Unnamed: 0,label,provision,result
0,"[waivers, amendments]",(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]"
1,[assignments],"(a) Seller, the Agent, each Managing Agent, ea...",[assignments]
2,[waivers],(a) Any provision of this Agreement may be wai...,"[waivers, amendments]"
3,[notices],(a) Except where telephonic instructions or no...,[notices]
4,[governing laws],(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws]


In [15]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


Classification report: 
               precision    recall  f1-score   support

           0       0.87      0.81      0.84       609
           1       0.71      0.57      0.63       212
           2       0.81      0.78      0.79       334
           3       1.00      0.98      0.99       585
           4       0.97      0.96      0.97       647
           5       0.96      0.98      0.97       248
           6       0.99      0.97      0.98       791
           7       0.97      0.94      0.95       530
           8       0.90      0.81      0.85       285
           9       0.96      0.96      0.96       521
          10       0.87      0.86      0.86       394
          11       0.93      0.92      0.92       346
          12       0.88      0.73      0.80       268
          13       0.86      0.76      0.81       417
          14       0.84      0.70      0.77       230

   micro avg       0.92      0.88      0.90      6417
   macro avg       0.90      0.85      0.87      6417
w

## With RoBerta Embeddings

We do not have have any specific Legal Sentence Embeddings, but we can use Legal RoBerta Embeddings and then average them.

In [16]:
embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(512)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[ | ]roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
Download done! Loading the resource.
[OK!]


In [17]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("provision") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = nlp.MultiClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(6)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [18]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

22/10/20 12:50:56 WARN TaskSetManager: Stage 13 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 13:39:36 WARN TaskSetManager: Stage 15 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 13:39:40 WARN TaskSetManager: Stage 16 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
2022-10-20 14:28:46.988983: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/760e094711e0_classifier_dl2151115246637280169
2022-10-20 14:28:47.618745: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 14:28:47.618810: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/760e094711e0_classifier_dl2151115246637280169
2022-10-20 14:28:52.872271: I external/org_tensorflow/tensorflow/cc/saved_m

Training started - epochs: 6 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22002 - classes: 15
Epoch 1/6 - 8.06s - loss: 0.09549305 - acc: 0.97049695 - batches: 344
Epoch 2/6 - 3.04s - loss: 0.039114594 - acc: 0.9887728 - batches: 344
Epoch 3/6 - 3.03s - loss: 0.03448456 - acc: 0.9903079 - batches: 344
Epoch 4/6 - 3.02s - loss: 0.03188738 - acc: 0.9912162 - batches: 344
Epoch 5/6 - 3.11s - loss: 0.030053359 - acc: 0.9919112 - batches: 344
Epoch 6/6 - 3.06s - loss: 0.028621674 - acc: 0.99234784 - batches: 344
CPU times: user 851 ms, sys: 254 ms, total: 1.11 s
Wall time: 1h 38min 50s


In [19]:
preds = clf_pipelineModel.transform(test)

In [20]:
preds_df = preds.select('provision','label',"class.result").toPandas()

22/10/20 14:29:47 WARN TaskSetManager: Stage 17 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [21]:
preds_df.head()

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]","[waivers, amendments]"
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],"[successors, assigns]"
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws],[governing laws]


In [22]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


Classification report: 
               precision    recall  f1-score   support

           0       0.92      0.84      0.88       609
           1       0.88      0.50      0.63       212
           2       0.80      0.89      0.84       334
           3       1.00      0.98      0.99       585
           4       0.98      0.99      0.99       647
           5       0.99      0.96      0.98       248
           6       1.00      0.99      0.99       791
           7       0.98      0.99      0.99       530
           8       0.89      0.95      0.92       285
           9       0.99      0.98      0.99       521
          10       0.83      0.93      0.88       394
          11       0.97      0.86      0.92       346
          12       0.94      0.82      0.87       268
          13       0.94      0.74      0.83       417
          14       0.84      0.82      0.83       230

   micro avg       0.94      0.91      0.93      6417
   macro avg       0.93      0.88      0.90      6417
w

### Saving & loading back the trained model

In [23]:
clf_pipelineModel.stages

[DocumentAssembler_ecd5ee3dcf58,
 REGEX_TOKENIZER_6f0a2852ed26,
 ROBERTA_EMBEDDINGS_b915dff90901,
 SentenceEmbeddings_4ba45b7f548d,
 MultiClassifierDLModel_7a28940374e2]

In [24]:
clf_pipelineModel.stages[-1].write().overwrite().save('MultilabelClfRoBerta')

In [25]:
# Load back  saved Multilabel Classifier Model
MultilabelClfModel = nlp.MultiClassifierDLModel.load('MultilabelClfRoBerta')

In [26]:
ld_pipeline = Pipeline(stages=[document_assembler, tokenizer, embeddings, embeddingsSentence, MultilabelClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("provision"))

In [27]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [28]:
ld_preds_df = ld_preds.select('provision','label',"class.result").toPandas()

22/10/20 14:43:32 WARN TaskSetManager: Stage 23 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [29]:
ld_preds_df.head(10)

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]","[waivers, amendments]"
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],"[successors, assigns]"
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws],[governing laws]
5,(a) To induce the other parties hereto to ente...,"[representations, warranties]",[representations]
6,A counterpart original of this Amendment duly ...,[amendments],[]
7,Advisor represents that Advisor’s services und...,[representations],[representations]
8,"Agent may assign, indorse or transfer any inst...",[assignments],[]
9,Agreement may be amended from time to time by ...,[amendments],[amendments]


# Multiclass classifier training

## Loading the data

In [30]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Legal/data/finance_clf_data.csv

In [31]:
df = pd.read_csv('finance_clf_data.csv')

In [32]:
df.head()

Unnamed: 0,text,label,len
0,\nOperating\nLeases\n \nOn\nJanuary 1 2010 th...,financial_statements,465
1,the Exercise Price and is exercisable for fiv...,financial_statements,406
2,Income Taxes\n69\nTable of Contents\nWe accoun...,financial_statements,843
3,Invoice2go\n has not been required to maintain...,risk_factors,474
4,A\nB\nC\nPlan Category\nNumber of Securitiesto...,equity,358


In [33]:
df['label'].value_counts()

risk_factors               3831
financial_statements       3726
business                   2002
financial_conditions        702
form_10k_summary            491
executives_compensation     304
controls_procedures         277
equity                      223
market_risk                 204
executives                  161
legal_proceedings            94
security_ownership           84
properties                   81
exhibits                     77
Name: label, dtype: int64

In [34]:
data = spark.createDataFrame(df)

train, test = data.randomSplit([0.8, 0.2], seed = 100)

In [35]:
from pyspark.sql.functions import col

train.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

22/10/20 15:01:30 WARN TaskSetManager: Stage 24 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+-----+
|               label|count|
+--------------------+-----+
|        risk_factors| 3064|
|financial_statements| 2966|
|            business| 1597|
|financial_conditions|  541|
|    form_10k_summary|  385|
|executives_compen...|  241|
| controls_procedures|  218|
|              equity|  173|
|         market_risk|  159|
|          executives|  128|
|   legal_proceedings|   77|
|  security_ownership|   66|
|          properties|   66|
|            exhibits|   65|
+--------------------+-----+



In [36]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

22/10/20 15:01:34 WARN TaskSetManager: Stage 26 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+-----+
|               label|count|
+--------------------+-----+
|        risk_factors|  767|
|financial_statements|  760|
|            business|  405|
|financial_conditions|  161|
|    form_10k_summary|  106|
|executives_compen...|   63|
| controls_procedures|   59|
|              equity|   50|
|         market_risk|   45|
|          executives|   33|
|  security_ownership|   18|
|   legal_proceedings|   17|
|          properties|   15|
|            exhibits|   12|
+--------------------+-----+



 ## With Universal Encoder

In [37]:
document_assembler = nlp.DocumentAssembler() \
      .setInputCol("text") \
      .setOutputCol("document") 

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

classsifierdl = legal.ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("label")\
      .setMaxEpochs(30)\
      .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [38]:
clf_pipelineModel = clf_pipeline.fit(train)

22/10/20 15:02:05 WARN TaskSetManager: Stage 28 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 15:02:07 WARN TaskSetManager: Stage 31 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
2022-10-20 15:02:10.806660: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/0b935a88dbf5_classifier_dl3788460967414782246
2022-10-20 15:02:10.962436: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 15:02:10.962503: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/0b935a88dbf5_classifier_dl3788460967414782246
2022-10-20 15:02:11.855807: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-10-20 15:02:13.375618: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212

Training started - epochs: 30 - learning_rate: 0.005 - batch_size: 64 - training_examples: 9746 - classes: 14
Epoch 1/30 - 1.30s - loss: 338.58203 - acc: 0.5773711 - batches: 153
Epoch 2/30 - 0.67s - loss: 323.76065 - acc: 0.683925 - batches: 153
Epoch 3/30 - 0.66s - loss: 322.3374 - acc: 0.6934393 - batches: 153
Epoch 4/30 - 0.66s - loss: 321.29047 - acc: 0.6990931 - batches: 153
Epoch 5/30 - 0.66s - loss: 319.76938 - acc: 0.70639163 - batches: 153
Epoch 6/30 - 0.70s - loss: 319.6436 - acc: 0.70875597 - batches: 153
Epoch 7/30 - 0.67s - loss: 319.4949 - acc: 0.71019506 - batches: 153
Epoch 8/30 - 0.67s - loss: 319.35 - acc: 0.71173704 - batches: 153
Epoch 9/30 - 0.69s - loss: 319.24713 - acc: 0.7126622 - batches: 153
Epoch 10/30 - 0.69s - loss: 319.20343 - acc: 0.7130734 - batches: 153
Epoch 11/30 - 0.69s - loss: 319.19696 - acc: 0.71338177 - batches: 153
Epoch 12/30 - 0.70s - loss: 319.21097 - acc: 0.71420413 - batches: 153
Epoch 13/30 - 0.70s - loss: 319.22702 - acc: 0.7152321 - bat

In [39]:
preds = clf_pipelineModel.transform(test)

In [40]:
preds_df = preds.select('label','text',"class.result").toPandas()
preds_df.head()

22/10/20 15:03:18 WARN TaskSetManager: Stage 32 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Unnamed: 0,label,text,result
0,risk_factors,\n\n \n\n\nOn July 1 2020 we entered into the ...,[financial_statements]
1,financial_statements,\n \n\n\nIdentification and treatment of contr...,[financial_statements]
2,form_10k_summary,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n20...,[financial_statements]
3,financial_statements,\n\n\nCash \n 3 \n\nReceivables \n 10 \n\nOth...,[financial_statements]
4,form_10k_summary,\n \n \n \n \n \n \n \n \n\n\n\nGranted\n\n \...,[financial_statements]


In [41]:
# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [42]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))

                         precision    recall  f1-score   support

               business       0.70      0.83      0.76       405
    controls_procedures       0.00      0.00      0.00        59
                 equity       0.00      0.00      0.00        50
             executives       0.00      0.00      0.00        33
executives_compensation       0.00      0.00      0.00        63
               exhibits       0.00      0.00      0.00        12
   financial_conditions       0.00      0.00      0.00       161
   financial_statements       0.62      0.94      0.75       760
       form_10k_summary       0.00      0.00      0.00       106
      legal_proceedings       0.00      0.00      0.00        17
            market_risk       0.00      0.00      0.00        45
             properties       0.00      0.00      0.00        15
           risk_factors       0.77      0.89      0.83       767
     security_ownership       0.00      0.00      0.00        18

               accuracy

### Saving & loading back the trained model

In [43]:
clf_pipelineModel.stages

[DocumentAssembler_5ea11b14148e,
 UNIVERSAL_SENTENCE_ENCODER_4de71669b7ec,
 LegalClassifierDLModel_5f7c04889286]

In [44]:
clf_pipelineModel.stages[-1].write().overwrite().save('Clf_Use')

In [45]:
# Load back  saved Classifier Model
ClfModel = legal.ClassifierDLModel.load('Clf_Use')

In [46]:
ld_pipeline = Pipeline(stages=[document_assembler, embeddings,ClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

In [47]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [48]:
ld_preds_df = ld_preds.select('text','label',"class.result").toPandas()

22/10/20 15:04:27 WARN TaskSetManager: Stage 38 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [49]:
ld_preds_df.head()

Unnamed: 0,text,label,result
0,\n\n \n\n\nOn July 1 2020 we entered into the ...,risk_factors,[financial_statements]
1,\n \n\n\nIdentification and treatment of contr...,financial_statements,[financial_statements]
2,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n20...,form_10k_summary,[financial_statements]
3,\n\n\nCash \n 3 \n\nReceivables \n 10 \n\nOth...,financial_statements,[financial_statements]
4,\n \n \n \n \n \n \n \n \n\n\n\nGranted\n\n \...,form_10k_summary,[financial_statements]


## With RoBerta Embeddings

We do not have Legal Sentence Embeddings yet, But we can use the Legal RoBerta Embeddings and then average them.

In [50]:
embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings") \
    .setMaxSentenceLength(512)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]


In [51]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = legal.ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(8)\
    .setLr(0.001)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [52]:
clf_pipelineModel = clf_pipeline.fit(train)

22/10/20 15:04:35 WARN TaskSetManager: Stage 39 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 15:23:54 WARN TaskSetManager: Stage 42 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
2022-10-20 15:43:23.278928: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/9f34c15c5d9a_classifier_dl7862500997666759905
2022-10-20 15:43:23.489507: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 15:43:23.489557: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/9f34c15c5d9a_classifier_dl7862500997666759905
2022-10-20 15:43:24.377958: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-10-20 15:43:25.663582: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212

Training started - epochs: 8 - learning_rate: 0.001 - batch_size: 64 - training_examples: 9746 - classes: 14
Epoch 1/8 - 1.22s - loss: 327.53708 - acc: 0.56954724 - batches: 153
Epoch 2/8 - 0.66s - loss: 323.2948 - acc: 0.59401274 - batches: 153
Epoch 3/8 - 0.65s - loss: 322.47052 - acc: 0.59555465 - batches: 153
Epoch 4/8 - 0.66s - loss: 322.27127 - acc: 0.59740496 - batches: 153
Epoch 5/8 - 0.68s - loss: 322.20648 - acc: 0.59802175 - batches: 153
Epoch 6/8 - 0.67s - loss: 322.1755 - acc: 0.59904975 - batches: 153
Epoch 7/8 - 0.67s - loss: 322.14856 - acc: 0.5997693 - batches: 153
Epoch 8/8 - 0.65s - loss: 322.1189 - acc: 0.6000777 - batches: 153


In [53]:
preds = clf_pipelineModel.transform(test)

In [54]:
preds_df = preds.select('label','text',"class.result").toPandas()

22/10/20 15:43:37 WARN TaskSetManager: Stage 43 contains a task of very large size (1279 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [55]:
preds_df.head()

Unnamed: 0,label,text,result
0,risk_factors,\n\n \n\n\nOn July 1 2020 we entered into the ...,[financial_statements]
1,financial_statements,\n \n\n\nIdentification and treatment of contr...,[financial_statements]
2,form_10k_summary,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n20...,[financial_statements]
3,financial_statements,\n\n\nCash \n 3 \n\nReceivables \n 10 \n\nOth...,[financial_statements]
4,form_10k_summary,\n \n \n \n \n \n \n \n \n\n\n\nGranted\n\n \...,[financial_statements]


In [56]:
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))


                         precision    recall  f1-score   support

               business       0.00      0.00      0.00       405
    controls_procedures       0.00      0.00      0.00        59
                 equity       0.00      0.00      0.00        50
             executives       0.00      0.00      0.00        33
executives_compensation       0.00      0.00      0.00        63
               exhibits       0.00      0.00      0.00        12
   financial_conditions       0.00      0.00      0.00       161
   financial_statements       0.58      0.97      0.73       760
       form_10k_summary       0.00      0.00      0.00       106
      legal_proceedings       0.00      0.00      0.00        17
            market_risk       0.00      0.00      0.00        45
             properties       0.00      0.00      0.00        15
           risk_factors       0.59      0.96      0.73       767
     security_ownership       0.00      0.00      0.00        18

               accuracy

# Save model and Zip it for Modelshub Upload/Downloads

In [57]:
# Save a Spark NLP model
clf_pipelineModel.stages[-1].write().overwrite().save('ClfBert')

import shutil

shutil.make_archive('ClfBert', 'zip', 'ClfBert')

'/home/ubuntu/notebooks/examples/legal/ClfBert.zip'