![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Train Domain-specific Multiclass and Multilabel classifiers

In [0]:
from johnsnowlabs import * 

In [0]:
# Print the attribute names available on the `finance` object brought in by the import above.
print(dir(finance))

# Multilabel classifier training

## Loading the data

In [0]:
# Download the multilabel dataset (finance_data.csv) with wget in quiet mode.
# NOTE(review): no output path is given, so the file lands in the shell's working directory;
# the copy below assumes that directory is /databricks/driver — confirm.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Finance/data/finance_data.csv

# Copy the downloaded file from the driver's local filesystem to the DBFS root.
dbutils.fs.cp("file:/databricks/driver/finance_data.csv", "dbfs:/") 

In [0]:
import ast

import pandas as pd

# Load the multilabel dataset into a pandas DataFrame.
df = pd.read_csv('/dbfs/finance_data.csv')

# The `label` column is read as text holding Python literals. Parse it with
# ast.literal_eval, which only accepts literals, rather than eval(), which
# would execute any expression found in the downloaded file.
df['label'] = df['label'].apply(ast.literal_eval)

In [0]:
# Hand the pandas frame over to Spark.
data = spark.createDataFrame(df)

# Only one dataset is available here, so split it 80/20 with a fixed seed.
# A separate test file could instead be loaded the same way as the training data.
train, test = data.randomSplit(weights=[0.8, 0.2], seed=123)

In [0]:
# Preview the training split, truncating each cell to 50 characters.
train.show(truncate=50)

In [0]:
from pyspark.sql.functions import col

# Count how often each distinct `label` value occurs in the test split, most frequent first.
test_label_counts = test.groupBy("label").count()
test_label_counts.orderBy(col("count").desc()).show()

## With Universal Sentence Encoder

In [0]:
# Stage 1: read the "provision" text column into a "document" annotation ("shrink" cleanup mode).
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("provision")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Stage 2: default pretrained Universal Sentence Encoder, writing "sentence_embeddings".
embeddings = (
    nlp.UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable multilabel classifier fed by the sentence embeddings and the "label" column;
# trains for up to 30 epochs with output logs enabled.
classsifierdl = (
    nlp.MultiClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(30)
    .setEnableOutputLogs(True)
)

# Chain the three stages in order.
clf_pipeline = Pipeline(stages=[document_assembler, embeddings, classsifierdl])

In [0]:
%%time
# Fit the whole pipeline on the training split; the %%time cell magic above reports how long it takes.
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
import os

LOG_DIR = "/dbfs/root/annotator_logs/"

# os.listdir() returns entries in arbitrary order, so taking element [0] may
# print a log from an older run. Select the most recently modified file instead.
log_paths = [os.path.join(LOG_DIR, entry) for entry in os.listdir(LOG_DIR)]
log_paths = [path for path in log_paths if os.path.isfile(path)]

if log_paths:
    latest_log_path = max(log_paths, key=os.path.getmtime)
    # Kept for compatibility with the original cell, which exposed this name.
    log_file_name = os.path.basename(latest_log_path)
    with open(latest_log_path, "r") as log_file:
        print(log_file.read())
else:
    # An empty directory used to raise IndexError; report it clearly instead.
    print("No training log files found in " + LOG_DIR)

In [0]:
# Run the fitted pipeline over the held-out test split.
preds = clf_pipelineModel.transform(test)

In [0]:
# Pull the gold labels, the text and the predicted labels into a pandas frame.
selected_columns = ['label', 'provision', "class.result"]
preds_df = preds.select(*selected_columns).toPandas()
preds_df.head()

Unnamed: 0,label,provision,result
0,"[waivers, amendments]",(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]"
1,[assignments],"(a) Seller, the Agent, each Managing Agent, ea...",[assignments]
2,[waivers],(a) Any provision of this Agreement may be wai...,"[waivers, amendments]"
3,[notices],(a) Except where telephonic instructions or no...,[notices]
4,"[governing laws, entire agreements]","(a) THIS AGREEMENT AND ANY CLAIM, CONTROVERSY,...","[governing laws, entire agreements]"


In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Fit the binarizer ONCE, on the union of gold and predicted label sets, then
# use transform() for both. Calling fit_transform() separately re-learns the
# class list from each input, so the columns of y_true and y_pred can differ in
# order or count whenever the two inputs do not contain exactly the same labels.
mlb = MultiLabelBinarizer()
mlb.fit(list(preds_df['label']) + list(preds_df['result']))

y_true = mlb.transform(preds_df['label'])
y_pred = mlb.transform(preds_df['result'])

# target_names makes the report rows show label names instead of column indices.
print("Classification report: \n", (classification_report(y_true, y_pred, target_names=mlb.classes_)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


## With Bert Embeddings

We do not have any specific Financial Sentence Embeddings, but we can use Financial BERT word embeddings and then average them.

In [0]:
# Load the pretrained "bert_embeddings_sec_bert_base" (English) model; it reads the
# "document" and "token" annotations and writes token-level "embeddings".
embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

In [0]:
# Read the "provision" text column into a "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("provision")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pool the token embeddings into a single vector per document using the AVERAGE strategy.
embeddingsSentence = (
    nlp.SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable multilabel classifier on the pooled embeddings; a single epoch, with output logs enabled.
classsifierdl = (
    nlp.MultiClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(1)
    .setEnableOutputLogs(True)
)

# `embeddings` is the BERT stage created in the previous cell.
pipeline_stages = [
    document_assembler,
    tokenizer,
    embeddings,
    embeddingsSentence,
    classsifierdl,
]
clf_pipeline = Pipeline(stages=pipeline_stages)

In [0]:
%%time
# Fit the BERT-based multilabel pipeline on the training split; %%time reports the duration.
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
# Apply the fitted pipeline to the test split.
preds = clf_pipelineModel.transform(test)

In [0]:
# Collect text, gold labels and predicted labels ("class.result") into a pandas DataFrame.
preds_df = preds.select('provision','label',"class.result").toPandas()

In [0]:
# Show the first rows to compare `label` against the predicted `result`.
preds_df.head()

Unnamed: 0,provision,label,result
0,(a) No failure or delay by any Agent or any Le...,"[waivers, amendments]",[waivers]
1,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],[assignments]
2,(a) Any provision of this Agreement may be wai...,[waivers],[waivers]
3,(a) Except where telephonic instructions or no...,[notices],[notices]
4,"(a) THIS AGREEMENT AND ANY CLAIM, CONTROVERSY,...","[governing laws, entire agreements]",[entire agreements]


In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Fit the binarizer ONCE, on the union of gold and predicted label sets, then
# use transform() for both. Separate fit_transform() calls re-learn the class
# list from each input, so whenever a gold label is never predicted the two
# indicator matrices end up with different or misaligned columns.
mlb = MultiLabelBinarizer()
mlb.fit(list(preds_df['label']) + list(preds_df['result']))

y_true = mlb.transform(preds_df['label'])
y_pred = mlb.transform(preds_df['result'])

# target_names makes the report rows show label names instead of column indices.
print("Classification report: \n", (classification_report(y_true, y_pred, target_names=mlb.classes_)))
print("F1 micro averaging:",(f1_score(y_true, y_pred, average='micro')))
print("ROC: ",(roc_auc_score(y_true, y_pred, average="micro")))


### Saving & loading back the trained model

In [0]:
# List the fitted stages; the last one is the trained classifier that is saved below.
clf_pipelineModel.stages

Out[21]: [DocumentAssembler_0befb157d5b5,
 REGEX_TOKENIZER_845e9cea52a1,
 BERT_EMBEDDINGS_29ce72cd673e,
 SentenceEmbeddings_c13729f7bf05,
 MultiClassifierDLModel_fb74a81172fa]

In [0]:
# Persist only the last pipeline stage (the trained classifier), overwriting any earlier save at this path.
clf_pipelineModel.stages[-1].write().overwrite().save('/dbfs/MultilabelClfBert')

In [0]:
# Load the saved multilabel classifier model back from the path it was written to.
MultilabelClfModel = nlp.MultiClassifierDLModel.load('/dbfs/MultilabelClfBert')

In [0]:
# Rebuild the inference pipeline with the same preprocessing stages and the reloaded classifier.
ld_stages = [document_assembler, tokenizer, embeddings, embeddingsSentence, MultilabelClfModel]
ld_pipeline = Pipeline(stages=ld_stages)

# Fit on a one-row placeholder frame (an empty "provision" string) to obtain a PipelineModel.
placeholder_df = spark.createDataFrame([['']]).toDF("provision")
ld_pipeline_model = ld_pipeline.fit(placeholder_df)

In [0]:
# Run the rebuilt pipeline (with the reloaded classifier) over the test split.
ld_preds = ld_pipeline_model.transform(test)

In [0]:
# Collect text, gold labels and the reloaded model's predictions into a pandas DataFrame.
ld_preds_df = ld_preds.select('provision','label',"class.result").toPandas()

In [0]:
# Show the first 10 rows of the reloaded model's predictions.
ld_preds_df.head(10)

Unnamed: 0,provision,label,result
0,"(a) Effective as of the Effective Date, the Ho...","[waivers, terminations]",[waivers]
1,(a) No failure or delay by the Administrative ...,"[waivers, amendments]","[waivers, amendments]"
2,(a) No failure or delay on the part of any par...,"[waivers, amendments]",[waivers]
3,"(a) Seller, the Agent, each Managing Agent, ea...",[assignments],[assignments]
4,(a) The provisions of this Agreement shall be ...,"[assigns, successors]","[successors, assigns]"
5,(a) No failure or delay of the Administrative...,"[waivers, amendments]","[waivers, amendments]"
6,(a) All of the representations and warranties ...,"[representations, warranties]","[warranties, representations]"
7,(a) Any Lender may at any time assign to one o...,[assignments],[assignments]
8,(a) Each of the Borrower and the Parent hereby...,"[representations, warranties]","[warranties, representations]"
9,(a) Except as otherwise expressly provided her...,[notices],[notices]


# Multiclass classifier training

## Loading the data

In [0]:
# Download the multiclass dataset (finance_clf_data.csv) with wget in quiet mode.
# NOTE(review): no output path is given, so the file lands in the shell's working directory;
# the copy below assumes that directory is /databricks/driver — confirm.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Finance/data/finance_clf_data.csv

# Copy the downloaded file from the driver's local filesystem to the DBFS root.
dbutils.fs.cp("file:/databricks/driver/finance_clf_data.csv", "dbfs:/") 

In [0]:
import pandas as pd
# Load the multiclass dataset into a pandas DataFrame (this rebinds `df` from the multilabel section).
df = pd.read_csv('/dbfs/finance_clf_data.csv')

In [0]:
# Preview the first rows of the multiclass dataset.
df.head()

Unnamed: 0,text,label,len
0,Presently we do not believe any U S or State r...,business,402
1,\r\nnetwork outages or performance degradatio...,risk_factors,496
2,Available Information\r\nOur reports filed wit...,business,356
3,\r\n 42 530\r\n \r\n \r\n \r\n \r\n \r\n 42 5...,financial_statements,359
4,8\r\nTable of Contents\r\ndevelopment employee...,business,582


In [0]:
# Number of rows per class in the full dataset.
df['label'].value_counts()

In [0]:
# Move the pandas frame into Spark, then split it 80/20 with a fixed seed.
data = spark.createDataFrame(df)
train, test = data.randomSplit(weights=[0.8, 0.2], seed=100)

In [0]:
from pyspark.sql.functions import col

# Rows per class in the training split, largest class first.
(
    train.groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [0]:
from pyspark.sql.functions import col

# Rows per class in the test split, largest class first.
test_class_counts = test.groupBy("label").count()
test_class_counts.orderBy(col("count").desc()).show()

## With Universal Sentence Encoder

In [0]:
# Stage 1: read the "text" column into a "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: default pretrained Universal Sentence Encoder, writing "sentence_embeddings".
embeddings = (
    nlp.UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable multiclass classifier fed by the sentence embeddings and the "label" column;
# trains for up to 60 epochs with output logs enabled.
classsifierdl = (
    finance.ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(60)
    .setEnableOutputLogs(True)
)

# Chain the three stages in order.
clf_pipeline = Pipeline(stages=[document_assembler, embeddings, classsifierdl])

In [0]:
%%time
# Fit the multiclass pipeline on the training split; %%time reports the duration.
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
# Apply the fitted multiclass pipeline to the test split.
preds = clf_pipelineModel.transform(test)

In [0]:
# Pull the gold label, the text and the predicted label array into a pandas frame.
selected_columns = ['label', 'text', "class.result"]
preds_df = preds.select(*selected_columns).toPandas()
preds_df.head()

Unnamed: 0,label,text,result
0,risk_factors,\r\n\r\n \r\n\r\n\r\nNet cash provided by oper...,[financial_statements]
1,financial_statements,\r\n\r\n\r\n \r\n \r\n \r\n Identification of...,[financial_statements]
2,form_10k_summary,\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\...,[financial_statements]
3,financial_statements,\r\n \r\n 120\r\n \r\n \r\n \r\n 202\r\n \r\n...,[financial_statements]
4,risk_factors,\r\n \r\nAn assertion by a third party that w...,[risk_factors]


In [0]:
# The result is an array since in Spark NLP you can have multiple sentences.
# Take the first item of each array so `result` holds a plain label.
# Values that are already strings pass through unchanged: without this guard,
# re-running the cell would index into the label string itself and truncate
# every label to its first character.
preds_df['result'] = preds_df['result'].apply(
    lambda res: res if isinstance(res, str) else res[0]
)

In [0]:
# Evaluate the predictions on the test dataset with scikit-learn.
from sklearn.metrics import classification_report

multiclass_report = classification_report(preds_df['label'], preds_df['result'])
print(multiclass_report)

### Saving & loading back the trained model

In [0]:
# List the fitted stages; the last one is the trained classifier that is saved below.
clf_pipelineModel.stages

In [0]:
# Persist only the last pipeline stage (the trained classifier), overwriting any earlier save at this path.
clf_pipelineModel.stages[-1].write().overwrite().save('/dbfs/Clf_Use')

In [0]:
# Load the saved classifier model back from the path it was written to.
ClfModel = finance.ClassifierDLModel.load('/dbfs/Clf_Use')

In [0]:
# Rebuild the inference pipeline with the same preprocessing stages and the reloaded classifier.
ld_stages = [document_assembler, embeddings, ClfModel]
ld_pipeline = Pipeline(stages=ld_stages)

# Fit on a one-row placeholder frame (an empty "text" string) to obtain a PipelineModel.
placeholder_df = spark.createDataFrame([['']]).toDF("text")
ld_pipeline_model = ld_pipeline.fit(placeholder_df)

In [0]:
# Run the rebuilt pipeline (with the reloaded classifier) over the test split.
ld_preds = ld_pipeline_model.transform(test)

In [0]:
# Collect text, gold label and the reloaded model's predictions into a pandas DataFrame.
ld_preds_df = ld_preds.select('text','label',"class.result").toPandas()

In [0]:
# Show the first rows of the reloaded model's predictions.
ld_preds_df.head()

Unnamed: 0,text,label,result
0,\r\n\r\n \r\n\r\n\r\nNet cash provided by oper...,risk_factors,[financial_statements]
1,\r\n\r\n\r\n \r\n \r\n \r\n Identification of...,financial_statements,[financial_statements]
2,\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\...,form_10k_summary,[financial_statements]
3,\r\n \r\n 120\r\n \r\n \r\n \r\n 202\r\n \r\n...,financial_statements,[financial_statements]
4,\r\n \r\nAn assertion by a third party that w...,risk_factors,[risk_factors]


## With Bert Embeddings

We do not have Financial Sentence Embeddings yet, but we can use the Financial Word Embeddings and then average them.

In [0]:
# Load the pretrained "bert_embeddings_sec_bert_base" (English) model; it reads the
# "document" and "token" annotations and writes token-level "embeddings".
embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

In [0]:
# Read the "text" column into a "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pool the token embeddings into a single vector per document using the AVERAGE strategy.
embeddingsSentence = (
    nlp.SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable multiclass classifier on the pooled embeddings: up to 8 epochs,
# learning rate 0.001, output logs enabled.
classsifierdl = (
    finance.ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(8)
    .setLr(0.001)
    .setEnableOutputLogs(True)
)

# `embeddings` is the BERT stage created in the previous cell.
pipeline_stages = [
    document_assembler,
    tokenizer,
    embeddings,
    embeddingsSentence,
    classsifierdl,
]
clf_pipeline = Pipeline(stages=pipeline_stages)

In [0]:
%%time
# Fit the BERT-based multiclass pipeline on the training split; %%time reports the duration.
clf_pipelineModel = clf_pipeline.fit(train)

In [0]:
# Apply the fitted pipeline to the test split.
preds = clf_pipelineModel.transform(test)

In [0]:
# Collect gold label, text and predicted label array ("class.result") into a pandas DataFrame.
preds_df = preds.select('label','text',"class.result").toPandas()

In [0]:
# Show the first rows to compare `label` against the predicted `result`.
preds_df.head()

Unnamed: 0,label,text,result
0,risk_factors,\r\n\r\n \r\n\r\n\r\nNet cash provided by oper...,[financial_statements]
1,financial_statements,\r\n\r\n\r\n \r\n \r\n \r\n Identification of...,[financial_statements]
2,form_10k_summary,\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\...,[financial_statements]
3,financial_statements,\r\n \r\n 120\r\n \r\n \r\n \r\n 202\r\n \r\n...,[financial_statements]
4,risk_factors,\r\n \r\nAn assertion by a third party that w...,[risk_factors]


In [0]:
# Take the first item of each `result` array so the column holds a plain label.
# Values that are already strings pass through unchanged: without this guard,
# re-running the cell would index into the label string itself and truncate
# every label to its first character, corrupting the report below.
preds_df['result'] = preds_df['result'].apply(
    lambda res: res if isinstance(res, str) else res[0]
)

from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
print (classification_report(preds_df['label'], preds_df['result']))


# Save model

In [0]:
# Save the last pipeline stage (the trained classifier), overwriting any earlier save at this path.
clf_pipelineModel.stages[-1].write().overwrite().save('/dbfs/ClfBert')