
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Legal Classifiers

## Setup

In [None]:
from johnsnowlabs import *

import json
import os
import numpy as np
import pandas as pd

spark = start_spark()

# Multilabel classifier training

## Loading the data

In [None]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/legal-nlp/data/finance_data.csv

In [None]:
import ast

# Load the multilabel dataset fetched by the previous cell.
df = pd.read_csv('./finance_data.csv')

# The "label" column is read back as text holding a Python literal; parse it
# into a real Python object.  ast.literal_eval only accepts literals, so
# (unlike eval) it cannot execute arbitrary code embedded in a downloaded file.
df['label'] = df['label'].apply(ast.literal_eval)

In [None]:
# Hand the pandas frame over to Spark so the NLP pipeline can consume it.
data = spark.createDataFrame(df)

# Only one dataset is available here, so carve out a held-out test split.
# If you already have a separate test set, load it the same way as the
# training data instead of splitting.
train, test = data.randomSplit(weights=[0.8, 0.2], seed=123)

In [None]:
train.show(truncate=50)

In [None]:
from pyspark.sql.functions import col

# Row count per distinct "label" value in the training split, largest first.
(
    train.groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [None]:
# Row count per distinct "label" value in the test split, largest first.
# Relies on `col` imported in the previous cell.
(
    test.groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

## With Universal Sentence Encoder

In [None]:
# Stage 1: turn the raw "provision" text column into a "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("provision")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Stage 2: default pretrained Universal Sentence Encoder, producing one
# embedding per document.
embeddings = (
    nlp.UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable multilabel classifier reading the sentence embeddings
# and the "label" column.
classsifierdl = (
    nlp.MultiClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(30)
    .setEnableOutputLogs(True)
)

clf_pipeline = nlp.Pipeline(
    stages=[document_assembler, embeddings, classsifierdl]
)

In [None]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

In [None]:
preds = clf_pipelineModel.transform(test)

In [None]:
preds_df = preds.select('label','provision',"class.result").toPandas()
preds_df.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Fit the binarizer ONCE, on the union of gold and predicted label sets.
# Fitting it separately on each side (two fit_transform calls) gives each
# matrix its own column layout, so whenever the predicted labels are not
# exactly the same set as the gold labels the columns are misaligned (or the
# shapes differ) and every metric below is wrong.
mlb = MultiLabelBinarizer()
mlb.fit(list(preds_df['label']) + list(preds_df['result']))

y_true = mlb.transform(preds_df['label'])
y_pred = mlb.transform(preds_df['result'])

# target_names makes the report rows show label names instead of column indices.
print("Classification report: \n", classification_report(y_true, y_pred, target_names=list(mlb.classes_)))
print("F1 micro averaging:", f1_score(y_true, y_pred, average='micro'))
print("ROC: ", roc_auc_score(y_true, y_pred, average="micro"))


## With RoBerta Embeddings

We do not have any specific Legal Sentence Embeddings, but we can use the Legal RoBERTa Embeddings and then average them.

In [None]:
# Pretrained legal RoBERTa token embeddings; reads the "document" and "token"
# annotations and writes per-token vectors to "embeddings".
embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setMaxSentenceLength(512)
)

In [None]:
# Raw "provision" text -> "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("provision")
    .setOutputCol("document")
)

# "document" -> "token" annotations, required by the token-level embeddings.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pool the token embeddings into a single vector per document by averaging.
embeddingsSentence = (
    nlp.SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable multilabel classifier on top of the pooled embeddings.
classsifierdl = (
    nlp.MultiClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(6)
    .setEnableOutputLogs(True)
)

# `embeddings` is the RoBERTa embeddings stage created in the previous cell.
clf_pipeline = nlp.Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

In [None]:
%%time
clf_pipelineModel = clf_pipeline.fit(train)

In [None]:
preds = clf_pipelineModel.transform(test)

In [None]:
preds_df = preds.select('provision','label',"class.result").toPandas()

In [None]:
preds_df.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Fit the binarizer ONCE, on the union of gold and predicted label sets.
# Two independent fit_transform calls would give the gold and predicted
# matrices different column layouts whenever their label sets differ, which
# misaligns the columns (or changes the shapes) and invalidates the metrics.
mlb = MultiLabelBinarizer()
mlb.fit(list(preds_df['label']) + list(preds_df['result']))

y_true = mlb.transform(preds_df['label'])
y_pred = mlb.transform(preds_df['result'])

# target_names makes the report rows show label names instead of column indices.
print("Classification report: \n", classification_report(y_true, y_pred, target_names=list(mlb.classes_)))
print("F1 micro averaging:", f1_score(y_true, y_pred, average='micro'))
print("ROC: ", roc_auc_score(y_true, y_pred, average="micro"))


### Saving & loading back the trained model

In [None]:
clf_pipelineModel.stages

In [None]:
clf_pipelineModel.stages[-1].write().overwrite().save('MultilabelClfRoBerta')

In [None]:
# Load back  saved Multilabel Classifier Model
MultilabelClfModel = nlp.MultiClassifierDLModel.load('MultilabelClfRoBerta')

In [None]:
# Rebuild the inference pipeline around the classifier loaded from disk,
# reusing the preprocessing/embedding stages defined earlier.
ld_pipeline = nlp.Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        MultilabelClfModel,
    ]
)

# Fit on a one-row placeholder frame (a single empty "provision" string) to
# obtain a PipelineModel; the classifier stage is the already-trained model.
ld_pipeline_model = ld_pipeline.fit(
    spark.createDataFrame([['']]).toDF("provision")
)

In [None]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [None]:
ld_preds_df = ld_preds.select('provision','label',"class.result").toPandas()

In [None]:
ld_preds_df.head(10)

# Multiclass classifier training

## Loading the data

In [None]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/finance_clf_data.csv

In [None]:
df = pd.read_csv('finance_clf_data.csv')

In [None]:
df.head()

In [None]:
df['label'].value_counts()

In [None]:
# Move the multiclass dataset into Spark.
data = spark.createDataFrame(df)

# 80/20 train/test split with a fixed seed.
train, test = data.randomSplit(weights=[0.8, 0.2], seed=100)

In [None]:
from pyspark.sql.functions import col

# Number of training rows per class, largest class first.
(
    train.groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [None]:
# Number of test rows per class, largest class first.
# Relies on `col` imported in the previous cell.
(
    test.groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

## With Universal Sentence Encoder

In [None]:
# Stage 1: raw "text" column -> "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: default pretrained Universal Sentence Encoder embeddings.
embeddings = (
    nlp.UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable single-label classifier on the sentence embeddings.
classsifierdl = (
    legal.ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(30)
    .setEnableOutputLogs(True)
)

clf_pipeline = nlp.Pipeline(
    stages=[document_assembler, embeddings, classsifierdl]
)

In [None]:
clf_pipelineModel = clf_pipeline.fit(train)

In [None]:
preds = clf_pipelineModel.transform(test)

In [None]:
preds_df = preds.select('label','text',"class.result").toPandas()
preds_df.head()

In [None]:
# "class.result" comes back as an array per row, because Spark NLP annotators
# can emit several annotations per document.  Keep only the first element so
# the column holds a plain label that can be compared with the gold "label".
# NOTE: not re-run safe -- running this cell twice indexes into the label string.
preds_df['result'] = preds_df['result'].map(lambda labels: labels[0])

In [None]:
# We are going to use sklearn to evaluate the results on the test dataset
from sklearn.metrics import classification_report

# Compare the gold "label" column with the flattened predicted "result" column.
print (classification_report(preds_df['label'], preds_df['result']))

### Saving & loading back the trained model

In [None]:
clf_pipelineModel.stages

In [None]:
clf_pipelineModel.stages[-1].write().overwrite().save('Clf_Use')

In [None]:
# Load back  saved Classifier Model
ClfModel = legal.ClassifierDLModel.load('Clf_Use')

In [None]:
# Rebuild the inference pipeline around the classifier loaded from disk.
ld_pipeline = nlp.Pipeline(
    stages=[
        document_assembler,
        embeddings,
        ClfModel,
    ]
)

# Fit on a one-row placeholder frame (a single empty "text" string) to obtain
# a PipelineModel; the classifier stage is the already-trained model.
ld_pipeline_model = ld_pipeline.fit(
    spark.createDataFrame([['']]).toDF("text")
)

In [None]:
# Apply Model Transform to testData
ld_preds = ld_pipeline_model.transform(test)

In [None]:
ld_preds_df = ld_preds.select('text','label',"class.result").toPandas()

In [None]:
ld_preds_df.head()

## With RoBerta Embeddings

We do not have Legal Sentence Embeddings yet, but we can use the Legal RoBERTa Embeddings and then average them.

In [None]:
# Pretrained legal RoBERTa token embeddings; reads the "document" and "token"
# annotations and writes per-token vectors to "embeddings".
embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setMaxSentenceLength(512)
)

In [None]:
# Raw "text" column -> "document" annotation.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "token" annotations, required by the token-level embeddings.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pool the token embeddings into a single vector per document by averaging.
embeddingsSentence = (
    nlp.SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable single-label classifier on top of the pooled embeddings.
classsifierdl = (
    legal.ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(8)
    .setLr(0.001)
    .setEnableOutputLogs(True)
)

# `embeddings` is the RoBERTa embeddings stage created in the previous cell.
clf_pipeline = nlp.Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

In [None]:
clf_pipelineModel = clf_pipeline.fit(train)

In [None]:
preds = clf_pipelineModel.transform(test)

In [None]:
preds_df = preds.select('label','text',"class.result").toPandas()

In [None]:
preds_df.head()

In [None]:
# "class.result" is an array per row; keep its first element so the column
# holds a plain label comparable with the gold "label" column.
# NOTE: not re-run safe -- running this cell twice indexes into the label string.
preds_df['result'] = preds_df['result'].map(lambda labels: labels[0])

# `classification_report` was imported from sklearn.metrics in an earlier cell.
print(
    classification_report(
        y_true=preds_df['label'],
        y_pred=preds_df['result'],
    )
)


# Save model and Zip it for Modelshub Upload/Downloads

In [None]:
import shutil

# Persist only the last pipeline stage (the trained classifier), replacing
# any previous save at the same path.
# NOTE(review): the directory is called 'ClfBert', but if this runs right
# after the RoBERTa section the saved classifier was trained on RoBERTa
# embeddings -- confirm the intended name.
clf_pipelineModel.stages[-1].write().overwrite().save('ClfBert')

# Zip the saved model directory into 'ClfBert.zip' for upload/download.
shutil.make_archive(base_name='ClfBert', format='zip', root_dir='ClfBert')