
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Domain-specific Multiclass and Multilabel classifiers

## Setup

In [None]:
from johnsnowlabs import *

import pandas as pd
import json
import os

spark = start_spark()

# Multilabel classifier training

## Loading the data

In [2]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/finance_data.csv

In [3]:
df = pd.read_csv('./finance_data.csv')
df['label'] = df['label'].apply(eval)

In [4]:
data = spark.createDataFrame(df)

# If you have a single dataset, then split it or else you can load the test dataset the same way that you load the train data.
train, test = data.randomSplit([0.8, 0.2], seed = 123)

In [5]:
train.show(truncate=50)

22/12/12 14:37:11 WARN TaskSetManager: Stage 0 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
[Stage 0:>                                                          (0 + 1) / 1]

+--------------------------------------------------+-----------------------------------+
|                                         provision|                              label|
+--------------------------------------------------+-----------------------------------+
|(a) Consultant or Company may terminate this Pr...|                     [terminations]|
|(a) Effective as of the Effective Date, the Hol...|            [waivers, terminations]|
|(a) No failure or delay by the Administrative A...|              [waivers, amendments]|
|(a) No failure or delay by the Agent or any Len...|              [waivers, amendments]|
|(a) No failure or delay of the Administrative A...|              [waivers, amendments]|
|(a) The Credit Agreement is, effective as of th...|                       [amendments]|
|(a) The provisions of this Agreement shall be b...|              [assigns, successors]|
|(a) To induce the other parties hereto to enter...|      [representations, warranties]|
|(a)  The provisions 

                                                                                

In [6]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

22/12/12 14:37:13 WARN TaskSetManager: Stage 1 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.

+--------------------+-----+
|               label|count|
+--------------------+-----+
|    [governing laws]|  772|
|      [counterparts]|  573|
| [entire agreements]|  558|
|           [notices]|  530|
|      [severability]|  507|
|          [survival]|  343|
|[assigns, success...|  319|
|        [amendments]|  309|
|          [expenses]|  248|
|      [terminations]|  237|
|           [waivers]|  205|
|[waivers, amendme...|  195|
|[representations,...|  191|
|       [assignments]|  187|
|   [representations]|   94|
|        [successors]|   54|
|[amendments, enti...|   54|
|        [warranties]|   39|
|[amendments, term...|   25|
|[assignments, suc...|   11|
+--------------------+-----+
only showing top 20 rows



                                                                                

 ## With Universal Encoder

In [None]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("provision") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

classsifierdl = nlp.MultiClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("label")\
      .setMaxEpochs(30)\
      .setEnableOutputLogs(True)

clf_pipeline = nlp.Pipeline(stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

In [9]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

Training started - epochs: 30 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22002 - classes: 15
Epoch 1/30 - 7.07s - loss: 0.168192 - acc: 0.94994646 - batches: 344
Epoch 2/30 - 2.86s - loss: 0.07170924 - acc: 0.9776423 - batches: 344
Epoch 3/30 - 2.94s - loss: 0.06072507 - acc: 0.98128265 - batches: 344
Epoch 4/30 - 2.91s - loss: 0.055653386 - acc: 0.9830967 - batches: 344
Epoch 5/30 - 2.90s - loss: 0.052645776 - acc: 0.984132 - batches: 344
Epoch 6/30 - 2.82s - loss: 0.05059618 - acc: 0.9848573 - batches: 344
Epoch 7/30 - 2.87s - loss: 0.049065854 - acc: 0.9853485 - batches: 344
Epoch 8/30 - 2.80s - loss: 0.04785178 - acc: 0.9858103 - batches: 344
Epoch 9/30 - 2.79s - loss: 0.04684918 - acc: 0.98614776 - batches: 344
Epoch 10/30 - 2.82s - loss: 0.04599774 - acc: 0.9864211 - batches: 344
Epoch 11/30 - 2.85s - loss: 0.04525937 - acc: 0.98659724 - batches: 344
Epoch 12/30 - 2.82s - loss: 0.044609535 - acc: 0.98679745 - batches: 344
Epoch 13/30 - 2.80s - loss: 0.044030655 

In [10]:
preds = clf_pipelineModel.transform(test)

In [11]:
preds_df = preds.select('label','provision',"class.result").toPandas()
preds_df.head()

22/12/12 14:18:12 WARN TaskSetManager: Stage 10 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Unnamed: 0,label,provision,result
0,"[waivers, amendments]",(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]"
1,[assignments],"(a) Seller, the Agent, each Managing Agent, ea...",[assignments]
2,[waivers],(a) Any provision of this Agreement may be wai...,"[waivers, amendments]"
3,[notices],(a) Except where telephonic instructions or no...,[notices]
4,[governing laws],(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws]


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


Classification report: 
               precision    recall  f1-score   support

           0       0.87      0.81      0.84       609
           1       0.71      0.57      0.63       212
           2       0.81      0.78      0.79       334
           3       1.00      0.98      0.99       585
           4       0.97      0.96      0.97       647
           5       0.96      0.98      0.97       248
           6       0.99      0.97      0.98       791
           7       0.97      0.94      0.95       530
           8       0.90      0.81      0.85       285
           9       0.96      0.96      0.96       521
          10       0.87      0.86      0.86       394
          11       0.93      0.92      0.92       346
          12       0.88      0.73      0.80       268
          13       0.86      0.76      0.81       417
          14       0.84      0.70      0.77       230

   micro avg       0.92      0.88      0.90      6417
   macro avg       0.90      0.85      0.87      6417
w

## With Bert Embeddings

We do not have have any specific Financial Sentence Embeddings, but we can use Financial Bert Embeddings and then average them.

In [None]:
embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
          .setInputCols(["document", "token"]) \
          .setOutputCol("embeddings")

In [8]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("provision") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = nlp.MultiClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(8)\
    .setEnableOutputLogs(True)

clf_pipeline = nlp.Pipeline(stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

In [9]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

22/12/12 14:37:31 WARN TaskSetManager: Stage 6 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/12/12 14:58:00 WARN TaskSetManager: Stage 8 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
22/12/12 14:58:01 WARN TaskSetManager: Stage 9 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
2022-12-12 15:18:52.090285: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/8d3ada72c050_classifier_dl3410442188111515414
2022-12-12 15:18:52.816645: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-12-12 15:18:52.816694: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/8d3ada72c050_classifier_dl3410442188111515414
2022-12-12 15:18:57.781998: I external/org_tensorflow/tensorflow/cc/saved_mode

Training started - epochs: 8 - learning_rate: 0.001 - batch_size: 64 - training_examples: 22002 - classes: 15
Epoch 1/8 - 7.26s - loss: 0.08046199 - acc: 0.9752819 - batches: 344
Epoch 2/8 - 2.93s - loss: 0.03758629 - acc: 0.9896701 - batches: 344
Epoch 3/8 - 2.78s - loss: 0.033418175 - acc: 0.9909579 - batches: 344
Epoch 4/8 - 2.86s - loss: 0.03089657 - acc: 0.99175954 - batches: 344
Epoch 5/8 - 2.84s - loss: 0.02908047 - acc: 0.99235976 - batches: 344
Epoch 6/8 - 2.72s - loss: 0.027666898 - acc: 0.99286973 - batches: 344
Epoch 7/8 - 2.74s - loss: 0.026504492 - acc: 0.993295 - batches: 344
Epoch 8/8 - 2.77s - loss: 0.025508536 - acc: 0.9936544 - batches: 344
CPU times: user 325 ms, sys: 90.4 ms, total: 415 ms
Wall time: 42min 21s


In [10]:
preds = clf_pipelineModel.transform(test)

In [11]:
preds_df = preds.select('provision','label',"class.result").toPandas()

22/12/12 15:19:51 WARN TaskSetManager: Stage 10 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [12]:
preds_df.head()

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]","[waivers, amendments]"
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],[assignments]
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws],[governing laws]


In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(preds_df['label'])
y_pred = mlb.fit_transform(preds_df['result'])


print("Classification report: \n", (classification_report(y_true, y_pred)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


Classification report: 
               precision    recall  f1-score   support

           0       0.96      0.82      0.88       609
           1       0.86      0.61      0.72       212
           2       0.78      0.93      0.85       334
           3       1.00      0.98      0.99       585
           4       1.00      0.96      0.98       647
           5       0.99      0.98      0.99       248
           6       0.98      0.98      0.98       791
           7       0.97      0.99      0.98       530
           8       0.91      0.95      0.93       285
           9       0.99      0.98      0.98       521
          10       0.83      0.94      0.88       394
          11       0.97      0.90      0.93       346
          12       0.93      0.88      0.91       268
          13       0.97      0.74      0.84       417
          14       0.85      0.80      0.83       230

   micro avg       0.94      0.91      0.93      6417
   macro avg       0.93      0.90      0.91      6417
w

### Saving & loading back the trained model

In [14]:
clf_pipelineModel.stages

[DocumentAssembler_28f376563362,
 REGEX_TOKENIZER_e8647eaf1aca,
 BERT_EMBEDDINGS_29ce72cd673e,
 SentenceEmbeddings_fe8b18b32259,
 MultiClassifierDLModel_3038ff096f7e]

In [15]:
clf_pipelineModel.stages[-1].write().overwrite().save('MultilabelClfBert')

In [16]:
# Load back  saved Multilabel Classifier Model
MultilabelClfModel = nlp.MultiClassifierDLModel.load('MultilabelClfBert')

In [17]:
ld_pipeline = nlp.Pipeline(stages=[document_assembler, tokenizer, embeddings, embeddingsSentence, MultilabelClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("provision"))

In [18]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [19]:
ld_preds_df = ld_preds.select('provision','label',"class.result").toPandas()

22/12/12 15:26:31 WARN TaskSetManager: Stage 17 contains a task of very large size (1746 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [20]:
ld_preds_df.head(10)

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]","[waivers, amendments]"
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],[assignments]
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,(a) THIS AGREEMENT AND EACH OTHER LOAN DOCUMEN...,[governing laws],[governing laws]
5,(a) To induce the other parties hereto to ente...,"[representations, warranties]","[warranties, representations]"
6,A counterpart original of this Amendment duly ...,[amendments],[]
7,Advisor represents that Advisor’s services und...,[representations],[representations]
8,"Agent may assign, indorse or transfer any inst...",[assignments],[]
9,Agreement may be amended from time to time by ...,[amendments],[amendments]


# Multiclass classifier training


## Loading the data

In [21]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/finance_clf_data.csv

In [22]:
import pandas as pd
df = pd.read_csv('./finance_clf_data.csv')

In [23]:
df.head()

Unnamed: 0,text,label,len
0,Presently we do not believe any U S or State r...,business,402
1,\nnetwork outages or performance degradation ...,risk_factors,496
2,Available Information\nOur reports filed with ...,business,356
3,\n 42 530\n \n \n \n \n \n 42 530\nTotal liab...,financial_statements,359
4,8\nTable of Contents\ndevelopment employee eng...,business,582


In [24]:
df['label'].value_counts()

risk_factors               1926
financial_statements       1888
business                    970
financial_conditions        346
form_10k_summary            240
executives_compensation     155
controls_procedures         138
equity                      111
market_risk                 100
executives                   73
legal_proceedings            51
properties                   48
security_ownership           46
exhibits                     36
Name: label, dtype: int64

In [34]:
data = spark.createDataFrame(df)

train, test = data.randomSplit([0.90, 0.1], seed = 42)

In [35]:
from pyspark.sql.functions import col

train.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|        risk_factors| 1738|
|financial_statements| 1696|
|            business|  867|
|financial_conditions|  315|
|    form_10k_summary|  219|
|executives_compen...|  141|
| controls_procedures|  126|
|         market_risk|   95|
|              equity|   95|
|          executives|   67|
|   legal_proceedings|   47|
|  security_ownership|   40|
|          properties|   39|
|            exhibits|   36|
+--------------------+-----+



In [36]:
from pyspark.sql.functions import col

test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|financial_statements|  192|
|        risk_factors|  188|
|            business|  103|
|financial_conditions|   31|
|    form_10k_summary|   21|
|              equity|   16|
|executives_compen...|   14|
| controls_procedures|   12|
|          properties|    9|
|  security_ownership|    6|
|          executives|    6|
|         market_risk|    5|
|   legal_proceedings|    4|
+--------------------+-----+



 ## With Universal Encoder

In [37]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") 

embeddings = nlp.UniversalSentenceEncoder.pretrained() \
    .setInputCols("document") \
    .setOutputCol("sentence_embeddings")

classsifierdl = finance.ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("label")\
    .setMaxEpochs(30)\
    .setEnableOutputLogs(True)

clf_pipeline = nlp.Pipeline(
    stages = [
        document_assembler,
        embeddings,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [38]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

1g6ax7jxec82paqzk7ivz9fx7
codes retrieved: ArrayBuffer(1g6ax7jxec82paqzk7ivz9fx7), product code(from property): 1g6ax7jxec82paqzk7ivz9fx7


2022-12-12 15:40:24.342769: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/c497080e405f_classifier_dl9755035331685214231
2022-12-12 15:40:24.456301: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-12-12 15:40:24.456342: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/c497080e405f_classifier_dl9755035331685214231
2022-12-12 15:40:25.156661: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-12-12 15:40:26.086312: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/c497080e405f_classifier_dl9755035331685214231
2022-12-12 15:40:26.307292: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1964534 microsecon

Training started - epochs: 30 - learning_rate: 0.005 - batch_size: 64 - training_examples: 5521 - classes: 14
Epoch 1/30 - 0.81s - loss: 202.04024 - acc: 0.53654027 - batches: 87
Epoch 2/30 - 0.36s - loss: 192.73738 - acc: 0.6252992 - batches: 87
Epoch 3/30 - 0.37s - loss: 185.43294 - acc: 0.67821264 - batches: 87
Epoch 4/30 - 0.37s - loss: 185.01172 - acc: 0.6842083 - batches: 87
Epoch 5/30 - 0.36s - loss: 184.2607 - acc: 0.6920208 - batches: 87
Epoch 6/30 - 0.37s - loss: 184.14146 - acc: 0.6969263 - batches: 87
Epoch 7/30 - 0.37s - loss: 183.90776 - acc: 0.70001495 - batches: 87
Epoch 8/30 - 0.36s - loss: 183.66045 - acc: 0.7031036 - batches: 87
Epoch 9/30 - 0.37s - loss: 183.44075 - acc: 0.70546556 - batches: 87
Epoch 10/30 - 0.38s - loss: 183.23085 - acc: 0.7071007 - batches: 87
Epoch 11/30 - 0.37s - loss: 183.05194 - acc: 0.70782745 - batches: 87
Epoch 12/30 - 0.36s - loss: 182.92593 - acc: 0.7087359 - batches: 87
Epoch 13/30 - 0.36s - loss: 182.84663 - acc: 0.70891756 - batches: 

In [None]:
preds = clf_pipelineModel.transform(test)

In [40]:
preds_df = preds.select('label','text',"class.result").toPandas()
preds_df.head()

                                                                                

Unnamed: 0,label,text,result
0,financial_statements,\n \n\n\nComplexity in the application of rele...,[financial_statements]
1,financial_statements,\n \n\nWeighted Avg \n\nExercise Prices\n\n \n...,[financial_statements]
2,financial_statements,\n 1 744\n \n 367\n \n 2 375\n \n 1 376\n \n ...,[financial_statements]
3,financial_statements,\n2 198 \n \n1 630 \n \n1 782 \nCash paid for...,[financial_statements]
4,business,\nCloud and mobile enabled \n Our business ma...,[business]


In [41]:
# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [42]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))

                         precision    recall  f1-score   support

               business       0.73      0.81      0.77       103
    controls_procedures       0.00      0.00      0.00        12
                 equity       0.00      0.00      0.00        16
             executives       0.00      0.00      0.00         6
executives_compensation       0.00      0.00      0.00        14
   financial_conditions       0.00      0.00      0.00        31
   financial_statements       0.67      0.93      0.78       192
       form_10k_summary       0.00      0.00      0.00        21
      legal_proceedings       0.00      0.00      0.00         4
            market_risk       0.00      0.00      0.00         5
             properties       0.00      0.00      0.00         9
           risk_factors       0.77      0.93      0.84       188
     security_ownership       0.00      0.00      0.00         6

               accuracy                           0.72       607
              macro avg

### Saving & loading back the trained model

In [43]:
clf_pipelineModel.stages

[DocumentAssembler_20fc51b07cc2,
 UNIVERSAL_SENTENCE_ENCODER_4de71669b7ec,
 FinanceClassifierDLModel_ba5593b0f3c2]

In [44]:
clf_pipelineModel.stages[-1].write().overwrite().save('Clf_Use')

In [45]:
# Load back  saved Classifier Model
ClfModel = finance.ClassifierDLModel.load('Clf_Use')

In [46]:
ld_pipeline = nlp.Pipeline(stages=[document_assembler, embeddings,ClfModel])
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

In [47]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

1g6ax7jxec82paqzk7ivz9fx7
codes retrieved: ArrayBuffer(1g6ax7jxec82paqzk7ivz9fx7), product code(from property): 1g6ax7jxec82paqzk7ivz9fx7


In [48]:
ld_preds_df = ld_preds.select('text','label',"class.result").toPandas()

                                                                                

In [49]:
ld_preds_df.head()

Unnamed: 0,text,label,result
0,\n \n\n\nComplexity in the application of rele...,financial_statements,[financial_statements]
1,\n \n\nWeighted Avg \n\nExercise Prices\n\n \n...,financial_statements,[financial_statements]
2,\n 1 744\n \n 367\n \n 2 375\n \n 1 376\n \n ...,financial_statements,[financial_statements]
3,\n2 198 \n \n1 630 \n \n1 782 \nCash paid for...,financial_statements,[financial_statements]
4,\nCloud and mobile enabled \n Our business ma...,business,[business]


## With Bert Embeddings

We do not have Financial Sentence Embeddings yet, But we can use the Financial Word Embeddings and then average them.

In [54]:
document_assembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings")

embeddingsSentence = nlp.SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = finance.ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label")\
    .setMaxEpochs(8)\
    .setLr(0.0003)\
    .setEnableOutputLogs(True)

clf_pipeline = nlp.Pipeline(stages = [
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl
    ])

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]


In [55]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

1g6ax7jxec82paqzk7ivz9fx7
codes retrieved: ArrayBuffer(1g6ax7jxec82paqzk7ivz9fx7), product code(from property): 1g6ax7jxec82paqzk7ivz9fx7


2022-12-12 16:29:15.858700: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/fe1e2ba5f200_classifier_dl3591081285898410384
2022-12-12 16:29:15.969262: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-12-12 16:29:15.969302: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/fe1e2ba5f200_classifier_dl3591081285898410384
2022-12-12 16:29:16.465349: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2022-12-12 16:29:17.315567: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/fe1e2ba5f200_classifier_dl3591081285898410384
2022-12-12 16:29:17.501853: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1643165 microsecon

Training started - epochs: 8 - learning_rate: 3.0E-4 - batch_size: 64 - training_examples: 5521 - classes: 14
Epoch 1/8 - 0.69s - loss: 194.76477 - acc: 0.55561733 - batches: 87
Epoch 2/8 - 0.36s - loss: 186.19788 - acc: 0.5999487 - batches: 87
Epoch 3/8 - 0.38s - loss: 185.46783 - acc: 0.6060833 - batches: 87
Epoch 4/8 - 0.38s - loss: 185.19696 - acc: 0.60662836 - batches: 87
Epoch 5/8 - 0.36s - loss: 185.05669 - acc: 0.60753673 - batches: 87
Epoch 6/8 - 0.37s - loss: 184.95215 - acc: 0.60844517 - batches: 87
Epoch 7/8 - 0.36s - loss: 184.86292 - acc: 0.6088086 - batches: 87
Epoch 8/8 - 0.36s - loss: 184.78632 - acc: 0.60899025 - batches: 87
CPU times: user 138 ms, sys: 27.6 ms, total: 166 ms
Wall time: 11min 2s


In [None]:
preds = clf_pipelineModel.transform(test)

In [57]:
preds_df = preds.select('label','text',"class.result").toPandas()

                                                                                

In [58]:
preds_df.head()

Unnamed: 0,label,text,result
0,financial_statements,\n \n\n\nComplexity in the application of rele...,[financial_statements]
1,financial_statements,\n \n\nWeighted Avg \n\nExercise Prices\n\n \n...,[financial_statements]
2,financial_statements,\n 1 744\n \n 367\n \n 2 375\n \n 1 376\n \n ...,[financial_statements]
3,financial_statements,\n2 198 \n \n1 630 \n \n1 782 \nCash paid for...,[financial_statements]
4,business,\nCloud and mobile enabled \n Our business ma...,[risk_factors]


In [59]:
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['result']))


                         precision    recall  f1-score   support

               business       0.00      0.00      0.00       103
    controls_procedures       0.00      0.00      0.00        12
                 equity       0.00      0.00      0.00        16
             executives       0.00      0.00      0.00         6
executives_compensation       0.00      0.00      0.00        14
   financial_conditions       0.00      0.00      0.00        31
   financial_statements       0.61      0.97      0.75       192
       form_10k_summary       0.00      0.00      0.00        21
      legal_proceedings       0.00      0.00      0.00         4
            market_risk       0.00      0.00      0.00         5
             properties       0.00      0.00      0.00         9
           risk_factors       0.61      0.97      0.75       188
     security_ownership       0.00      0.00      0.00         6

               accuracy                           0.61       607
              macro avg

# Save model and Zip it for Modelshub Upload/Downloads

In [53]:
# Save a Spark NLP model
clf_pipelineModel.stages[-1].write().overwrite().save('ClfBert')


In [54]:
import shutil

shutil.make_archive('ClfBert', 'zip', 'ClfBert')

'/home/ubuntu/notebooks/examples/finance/ClfBert.zip'