
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Domain-specific Multiclass and Multilabel classifiers

## Setup

In [2]:
%pip install -q tensorflow==2.7.0
%pip install -q tensorflow-addons

In [4]:
from johnsnowlabs import *

import json
import os

print("Spark NLP Version :", sparknlp.version())

spark = start_spark()

Spark NLP Version : 4.2.1
📋 Loading license number 0 from /home/ubuntu/.johnsnowlabs/licenses/license_number_0_for_.json


22/10/20 09:37:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


👌 Launched [92mcpu-Optimized JVM[39m SparkSession with Jars for: 🚀Spark-NLP==4.2.1, 💊Spark-Healthcare==4.2.0, 🕶Spark-OCR==4.1.0, running on ⚡ PySpark==3.1.2


# Multilabel classifier training

## Loading the data

In [6]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/finance_data.csv

In [7]:
import pandas as pd
df = pd.read_csv('./finance_data.csv')
df['label'] = df['label'].apply(eval)

In [8]:
data = spark.createDataFrame(df)

# If you have a single dataset, then split it or else you can load the test dataset the same way that you load the train data.
train, test = data.randomSplit([0.8, 0.2], seed = 123)

In [9]:
train.show(truncate=50)

22/10/20 09:40:14 WARN TaskSetManager: Stage 0 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
[Stage 0:>                                                          (0 + 1) / 1]

+--------------------------------------------------+-----------------------------------+
|                                         provision|                              label|
+--------------------------------------------------+-----------------------------------+
|(a) Consultant or Company may terminate this Pr...|                     [terminations]|
|(a) Effective as of the Effective Date, the Hol...|            [waivers, terminations]|
|(a) No failure or delay by the Administrative A...|              [waivers, amendments]|
|(a) No failure or delay by the Agent or any Len...|              [waivers, amendments]|
|(a) No failure or delay of the Administrative A...|              [waivers, amendments]|
|(a) The Credit Agreement is, effective as of th...|                       [amendments]|
|(a) The provisions of this Agreement shall be b...|              [assigns, successors]|
|(a) To induce the other parties hereto to enter...|      [representations, warranties]|
|(a)  The provisions 

                                                                                

In [10]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

22/10/20 09:40:16 WARN TaskSetManager: Stage 1 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.

+--------------------+-----+
|               label|count|
+--------------------+-----+
|    [governing laws]|  772|
|      [counterparts]|  573|
| [entire agreements]|  558|
|           [notices]|  530|
|      [severability]|  507|
|          [survival]|  343|
|[assigns, success...|  319|
|        [amendments]|  309|
|          [expenses]|  248|
|      [terminations]|  237|
|           [waivers]|  205|
|[waivers, amendme...|  195|
|[representations,...|  191|
|       [assignments]|  187|
|   [representations]|   94|
|[amendments, enti...|   54|
|        [successors]|   54|
|        [warranties]|   39|
|[amendments, term...|   25|
|[assignments, suc...|   11|
+--------------------+-----+
only showing top 20 rows



                                                                                

 ## With Universal Encoder

In [13]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("provision") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

classsifierdl = nlp.MultiClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("label")\
      .setMaxEpochs(30)\
      .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
[ \ ]

2022-10-19 16:01:04.282594: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [14]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

22/10/19 16:01:16 WARN TaskSetManager: Stage 4 contains a task of very large size (3476 KiB). The maximum recommended task size is 1000 KiB.
22/10/19 16:01:22 WARN TaskSetManager: Stage 6 contains a task of very large size (3476 KiB). The maximum recommended task size is 1000 KiB.
22/10/19 16:01:23 WARN TaskSetManager: Stage 7 contains a task of very large size (3476 KiB). The maximum recommended task size is 1000 KiB.
2022-10-19 16:01:29.348151: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/d8122f7a39a2_classifier_dl1394760924665424582
2022-10-19 16:01:29.686160: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-19 16:01:29.686440: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/d8122f7a39a2_classifier_dl1394760924665424582
2022-10-19 16:01:33.376724: I external/org_tensorflow/tensorflow/cc/saved_mode

Training started - epochs: 30 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22042 - classes: 15
Epoch 1/30 - 6.23s - loss: 0.16814168 - acc: 0.9498104 - batches: 345
Epoch 2/30 - 3.22s - loss: 0.07261445 - acc: 0.9774559 - batches: 345
Epoch 3/30 - 3.34s - loss: 0.06150661 - acc: 0.9812575 - batches: 345
Epoch 4/30 - 3.21s - loss: 0.05632347 - acc: 0.9827865 - batches: 345
Epoch 5/30 - 3.51s - loss: 0.053312555 - acc: 0.9838887 - batches: 345
Epoch 6/30 - 3.46s - loss: 0.05126492 - acc: 0.98463815 - batches: 345
Epoch 7/30 - 3.18s - loss: 0.049728964 - acc: 0.9851406 - batches: 345
Epoch 8/30 - 3.21s - loss: 0.04850401 - acc: 0.9856131 - batches: 345
Epoch 9/30 - 3.16s - loss: 0.04748668 - acc: 0.9858874 - batches: 345
Epoch 10/30 - 3.18s - loss: 0.04661861 - acc: 0.98614484 - batches: 345
Epoch 11/30 - 3.17s - loss: 0.045863416 - acc: 0.98649 - batches: 345
Epoch 12/30 - 3.18s - loss: 0.045195952 - acc: 0.98674446 - batches: 345
Epoch 13/30 - 3.18s - loss: 0.044598877 -

In [16]:
preds = clf_pipelineModel.transform(test)

In [17]:
preds_df = preds.select('label','provision',"class.result").toPandas()
preds_df.head()

22/10/19 16:04:54 WARN TaskSetManager: Stage 8 contains a task of very large size (3476 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Unnamed: 0,label,provision,result
0,"[waivers, terminations]","(a) Effective as of the Effective Date, the Ho...",[representations]
1,"[waivers, amendments]",(a) No failure or delay by the Administrative ...,[waivers]
2,"[waivers, amendments]",(a) No failure or delay on the part of any par...,[waivers]
3,[assignments],"(a) Seller, the Agent, each Managing Agent, ea...","[successors, assignments]"
4,"[assigns, successors]",(a) The provisions of this Agreement shall be ...,"[successors, assigns]"


In [18]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


Classification report: 
               precision    recall  f1-score   support

           0       0.88      0.77      0.82       618
           1       0.73      0.52      0.61       198
           2       0.79      0.75      0.77       302
           3       0.99      0.98      0.99       587
           4       0.98      0.94      0.96       675
           5       0.98      0.92      0.95       228
           6       0.98      0.98      0.98       784
           7       0.98      0.96      0.97       574
           8       0.92      0.79      0.85       291
           9       0.99      0.94      0.96       531
          10       0.84      0.85      0.84       361
          11       0.96      0.91      0.94       329
          12       0.89      0.73      0.80       272
          13       0.90      0.75      0.82       460
          14       0.83      0.80      0.82       227

   micro avg       0.93      0.87      0.90      6437
   macro avg       0.91      0.84      0.87      6437
w

## With Bert Embeddings

We do not have have any specific Financial Sentence Embeddings, but we can use Financial Bert Embeddings and then average them.

In [11]:
embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
          .setInputCols(["document", "token"]) \
          .setOutputCol("embeddings")

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[ | ]bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
Download done! Loading the resource.
[ / ]

2022-10-20 09:40:41.389452: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [12]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("provision") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = nlp.MultiClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(8)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [13]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

22/10/20 09:40:47 WARN TaskSetManager: Stage 6 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 09:59:48 WARN TaskSetManager: Stage 8 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/10/20 09:59:49 WARN TaskSetManager: Stage 9 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
2022-10-20 10:19:33.390877: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/3ecddebecd08_classifier_dl8386214112112396326
2022-10-20 10:19:34.177559: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 10:19:34.177629: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/3ecddebecd08_classifier_dl8386214112112396326
2022-10-20 10:19:39.606777: I external/org_tensorflow/tensorflow/cc/saved_mode

Training started - epochs: 8 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22002 - classes: 15
Epoch 1/8 - 8.00s - loss: 0.08046199 - acc: 0.9752819 - batches: 344
Epoch 2/8 - 3.12s - loss: 0.03758629 - acc: 0.9896701 - batches: 344
Epoch 3/8 - 3.02s - loss: 0.033418175 - acc: 0.9909579 - batches: 344
Epoch 4/8 - 3.07s - loss: 0.03089657 - acc: 0.99175954 - batches: 344
Epoch 5/8 - 3.08s - loss: 0.02908047 - acc: 0.99235976 - batches: 344
Epoch 6/8 - 3.02s - loss: 0.027666898 - acc: 0.99286973 - batches: 344
Epoch 7/8 - 3.03s - loss: 0.026504492 - acc: 0.993295 - batches: 344
Epoch 8/8 - 3.03s - loss: 0.025508536 - acc: 0.9936544 - batches: 344
CPU times: user 362 ms, sys: 141 ms, total: 502 ms
Wall time: 39min 50s


In [14]:
preds = clf_pipelineModel.transform(test)

In [15]:
preds_df = preds.select('provision','label',"class.result").toPandas()

22/10/20 10:20:44 WARN TaskSetManager: Stage 10 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [16]:
preds_df.head()

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]","[waivers, amendments]"
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],[assignments]
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws],[governing laws]


In [17]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


Classification report: 
               precision    recall  f1-score   support

           0       0.96      0.82      0.88       609
           1       0.86      0.61      0.72       212
           2       0.78      0.93      0.85       334
           3       1.00      0.98      0.99       585
           4       1.00      0.96      0.98       647
           5       0.99      0.98      0.99       248
           6       0.98      0.98      0.98       791
           7       0.97      0.99      0.98       530
           8       0.91      0.95      0.93       285
           9       0.99      0.98      0.98       521
          10       0.83      0.94      0.88       394
          11       0.97      0.90      0.93       346
          12       0.93      0.88      0.91       268
          13       0.97      0.74      0.84       417
          14       0.85      0.80      0.83       230

   micro avg       0.94      0.91      0.93      6417
   macro avg       0.93      0.90      0.91      6417
w

### Saving & loading back the trained model

In [18]:
clf_pipelineModel.stages

[DocumentAssembler_d8e9467dc46f,
 REGEX_TOKENIZER_9e07f01bddbf,
 BERT_EMBEDDINGS_29ce72cd673e,
 SentenceEmbeddings_d782500f40bc,
 MultiClassifierDLModel_e6b50a8d31be]

In [19]:
clf_pipelineModel.stages[-1].write().overwrite().save('MultilabelClfBert')

In [20]:
# Load back  saved Multilabel Classifier Model
MultilabelClfModel = nlp.MultiClassifierDLModel.load('MultilabelClfBert')

In [21]:
ld_pipeline = Pipeline(stages=[document_assembler, tokenizer, embeddings, embeddingsSentence, MultilabelClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("provision"))

In [22]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [23]:
ld_preds_df = ld_preds.select('provision','label',"class.result").toPandas()

22/10/20 10:26:23 WARN TaskSetManager: Stage 16 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [24]:
ld_preds_df.head(10)

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]","[waivers, amendments]"
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],[assignments]
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws],[governing laws]
5,(a) To induce the other parties hereto to ente...,"[representations, warranties]","[warranties, representations]"
6,A counterpart original of this Amendment duly ...,[amendments],[]
7,Advisor represents that Advisor’s services und...,[representations],[representations]
8,"Agent may assign, indorse or transfer any inst...",[assignments],[]
9,Agreement may be amended from time to time by ...,[amendments],[amendments]


# Multiclass classifier training


## Loading the data

In [25]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/finance_clf_data.csv

In [26]:
import pandas as pd
df = pd.read_csv('./finance_clf_data.csv')

In [27]:
df.head()

Unnamed: 0,text,label,len
0,Presently we do not believe any U S or State r...,business,402
1,\nnetwork outages or performance degradation ...,risk_factors,496
2,Available Information\nOur reports filed with ...,business,356
3,\n 42 530\n \n \n \n \n \n 42 530\nTotal liab...,financial_statements,359
4,8\nTable of Contents\ndevelopment employee eng...,business,582


In [28]:
df['label'].value_counts()

risk_factors               1926
financial_statements       1888
business                    970
financial_conditions        346
form_10k_summary            240
executives_compensation     155
controls_procedures         138
equity                      111
market_risk                 100
executives                   73
legal_proceedings            51
properties                   48
security_ownership           46
exhibits                     36
Name: label, dtype: int64

In [29]:
data = spark.createDataFrame(df)

train, test = data.randomSplit([0.8, 0.2], seed = 100)

In [30]:
from pyspark.sql.functions import col

train.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|        risk_factors| 1529|
|financial_statements| 1491|
|            business|  769|
|financial_conditions|  267|
|    form_10k_summary|  196|
|executives_compen...|  128|
| controls_procedures|  107|
|              equity|   92|
|         market_risk|   75|
|          executives|   58|
|          properties|   39|
|   legal_proceedings|   36|
|  security_ownership|   34|
|            exhibits|   31|
+--------------------+-----+



In [31]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|        risk_factors|  397|
|financial_statements|  397|
|            business|  201|
|financial_conditions|   79|
|    form_10k_summary|   44|
| controls_procedures|   31|
|executives_compen...|   27|
|         market_risk|   25|
|              equity|   19|
|   legal_proceedings|   15|
|          executives|   15|
|  security_ownership|   12|
|          properties|    9|
|            exhibits|    5|
+--------------------+-----+



 ## With Universal Encoder

In [32]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") 

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
    .setInputCols("document") \
    .setOutputCol("sentence_embeddings")

classsifierdl = finance.ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("label")\
    .setMaxEpochs(30)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
[OK!]


In [33]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

2022-10-20 10:32:20.794978: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/fd0bbcc448ad_classifier_dl5542219089775412824
2022-10-20 10:32:20.899775: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 10:32:20.899835: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/fd0bbcc448ad_classifier_dl5542219089775412824
2022-10-20 10:32:21.613509: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-10-20 10:32:22.699767: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/fd0bbcc448ad_classifier_dl5542219089775412824
2022-10-20 10:32:22.936683: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2141719 microsecon

Training started - epochs: 30 - learning_rate: 0.005 - batch_size: 64 - training_examples: 4852 - classes: 14
Epoch 1/30 - 0.78s - loss: 162.76936 - acc: 0.52030444 - batches: 76
Epoch 2/30 - 0.34s - loss: 160.40976 - acc: 0.58634615 - batches: 76
Epoch 3/30 - 0.32s - loss: 160.40503 - acc: 0.62735575 - batches: 76
Epoch 4/30 - 0.32s - loss: 160.16975 - acc: 0.6813622 - batches: 76
Epoch 5/30 - 0.31s - loss: 159.4101 - acc: 0.69032055 - batches: 76
Epoch 6/30 - 0.31s - loss: 158.22449 - acc: 0.69261223 - batches: 76
Epoch 7/30 - 0.31s - loss: 157.78584 - acc: 0.6992789 - batches: 76
Epoch 8/30 - 0.30s - loss: 157.5914 - acc: 0.70245194 - batches: 76
Epoch 9/30 - 0.31s - loss: 157.44997 - acc: 0.7037019 - batches: 76
Epoch 10/30 - 0.31s - loss: 157.33437 - acc: 0.7062019 - batches: 76
Epoch 11/30 - 0.31s - loss: 157.22273 - acc: 0.70745194 - batches: 76
Epoch 12/30 - 0.31s - loss: 157.12648 - acc: 0.7082853 - batches: 76
Epoch 13/30 - 0.30s - loss: 157.0501 - acc: 0.7091186 - batches: 7

In [34]:
preds = clf_pipelineModel.transform(test)

In [35]:
preds_df = preds.select('label','text',"class.result").toPandas()
preds_df.head()

                                                                                

Unnamed: 0,label,text,result
0,risk_factors,\n\n \n\n\nNet cash provided by operating acti...,[financial_statements]
1,financial_statements,\n\n\n \n \n \n Identification of the contrac...,[financial_statements]
2,form_10k_summary,\n \n \n \n \n \n \n \n \n\n\n\nGranted\n\n \...,[financial_statements]
3,financial_statements,\n \n 120\n \n \n \n 202\n \n\n\n Net cash pr...,[financial_statements]
4,risk_factors,\n \nAn assertion by a third party that we ar...,[risk_factors]


In [36]:
# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [37]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))

                         precision    recall  f1-score   support

               business       0.71      0.70      0.71       201
    controls_procedures       0.00      0.00      0.00        31
                 equity       0.00      0.00      0.00        19
             executives       0.00      0.00      0.00        15
executives_compensation       0.00      0.00      0.00        27
               exhibits       0.00      0.00      0.00         5
   financial_conditions       0.00      0.00      0.00        79
   financial_statements       0.63      0.93      0.75       397
       form_10k_summary       0.00      0.00      0.00        44
      legal_proceedings       0.00      0.00      0.00        15
            market_risk       0.00      0.00      0.00        25
             properties       0.00      0.00      0.00         9
           risk_factors       0.72      0.88      0.79       397
     security_ownership       0.00      0.00      0.00        12

               accuracy

### Saving & loading back the trained model

In [38]:
clf_pipelineModel.stages

[DocumentAssembler_12c836272c69,
 UNIVERSAL_SENTENCE_ENCODER_4de71669b7ec,
 FinanceClassifierDLModel_1ed3e72d50cd]

In [39]:
clf_pipelineModel.stages[-1].write().overwrite().save('Clf_Use')

In [40]:
# Load back  saved Classifier Model
ClfModel = finance.ClassifierDLModel.load('Clf_Use')

In [41]:
ld_pipeline = Pipeline(stages=[document_assembler, embeddings,ClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

In [42]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [43]:
ld_preds_df = ld_preds.select('text','label',"class.result").toPandas()

                                                                                

In [44]:
ld_preds_df.head()

Unnamed: 0,text,label,result
0,\n\n \n\n\nNet cash provided by operating acti...,risk_factors,[financial_statements]
1,\n\n\n \n \n \n Identification of the contrac...,financial_statements,[financial_statements]
2,\n \n \n \n \n \n \n \n \n\n\n\nGranted\n\n \...,form_10k_summary,[financial_statements]
3,\n \n 120\n \n \n \n 202\n \n\n\n Net cash pr...,financial_statements,[financial_statements]
4,\n \nAn assertion by a third party that we ar...,risk_factors,[risk_factors]


## With Bert Embeddings

We do not have Financial Sentence Embeddings yet, But we can use the Financial Word Embeddings and then average them.

In [60]:
embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings")

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]


In [61]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = finance.ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(8)\
    .setLr(0.0003)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [62]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

2022-10-20 11:09:47.108826: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/b92e6dda2a49_classifier_dl9053818733325333302
2022-10-20 11:09:47.237424: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-10-20 11:09:47.237482: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/b92e6dda2a49_classifier_dl9053818733325333302
2022-10-20 11:09:47.914353: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-10-20 11:09:48.782802: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/b92e6dda2a49_classifier_dl9053818733325333302
2022-10-20 11:09:49.001339: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1892526 microsecon

Training started - epochs: 8 - learning_rate: 3.0E-4 - batch_size: 64 - training_examples: 4852 - classes: 14
Epoch 1/8 - 0.77s - loss: 168.7488 - acc: 0.55785257 - batches: 76
Epoch 2/8 - 0.31s - loss: 158.28677 - acc: 0.6030609 - batches: 76
Epoch 3/8 - 0.31s - loss: 157.80093 - acc: 0.60722756 - batches: 76
Epoch 4/8 - 0.31s - loss: 157.63396 - acc: 0.60977566 - batches: 76
Epoch 5/8 - 0.32s - loss: 157.5247 - acc: 0.610609 - batches: 76
Epoch 6/8 - 0.33s - loss: 157.43472 - acc: 0.61185896 - batches: 76
Epoch 7/8 - 0.32s - loss: 157.36235 - acc: 0.61185896 - batches: 76
Epoch 8/8 - 0.31s - loss: 157.30675 - acc: 0.61227566 - batches: 76
CPU times: user 140 ms, sys: 40.3 ms, total: 180 ms
Wall time: 8min 20s


In [63]:
preds = clf_pipelineModel.transform(test)

In [64]:
preds_df = preds.select('label','text',"class.result").toPandas()

                                                                                

In [65]:
preds_df.head()

Unnamed: 0,label,text,result
0,risk_factors,\n\n \n\n\nNet cash provided by operating acti...,[financial_statements]
1,financial_statements,\n\n\n \n \n \n Identification of the contrac...,[financial_statements]
2,form_10k_summary,\n \n \n \n \n \n \n \n \n\n\n\nGranted\n\n \...,[financial_statements]
3,financial_statements,\n \n 120\n \n \n \n 202\n \n\n\n Net cash pr...,[financial_statements]
4,risk_factors,\n \nAn assertion by a third party that we ar...,[risk_factors]


In [66]:
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))


                         precision    recall  f1-score   support

               business       0.00      0.00      0.00       201
    controls_procedures       0.00      0.00      0.00        31
                 equity       0.00      0.00      0.00        19
             executives       0.00      0.00      0.00        15
executives_compensation       0.00      0.00      0.00        27
               exhibits       0.00      0.00      0.00         5
   financial_conditions       0.00      0.00      0.00        79
   financial_statements       0.59      0.97      0.73       397
       form_10k_summary       0.00      0.00      0.00        44
      legal_proceedings       0.00      0.00      0.00        15
            market_risk       0.00      0.00      0.00        25
             properties       0.00      0.00      0.00         9
           risk_factors       0.60      0.95      0.74       397
     security_ownership       0.00      0.00      0.00        12

               accuracy

# Save model and Zip it for Modelshub Upload/Downloads

In [69]:
# Save a Spark NLP model
clf_pipelineModel.stages[-1].write().overwrite().save('ClfBert')


In [70]:
import shutil

shutil.make_archive('ClfBert', 'zip', 'ClfBert')

'/home/ubuntu/notebooks/examples/finance/ClfBert.zip'