![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# 4.Training and Reusing Text Classification Models

**Relevant blogpost:** https://towardsdatascience.com/text-classification-in-spark-nlp-with-bert-and-universal-sentence-encoders-e644d618ca32

In [0]:
import json
import os
import string
import pandas as pd
import numpy as np

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.util import *

import pyspark.sql.functions as F
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import functions as F


pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 100)

print('sparknlp.version : ',sparknlp.version())

spark

## Using Pretrained ClassifierDL and SentimentDL models

In [0]:
fake_classifier = ClassifierDLModel.pretrained('classifierdl_use_fakenews', 'en') \
                .setInputCols(["sentence_embeddings"]) \
                .setOutputCol("class")

fake_news classifier is trained on `https://raw.githubusercontent.com/joolsa/fake_real_news_dataset/master/fake_or_real_news.csv.zip`

In [0]:
fake_classifier.getClasses()

In [0]:
documentAssembler = DocumentAssembler()\
      .setInputCol("text")\
      .setOutputCol("document")

use = UniversalSentenceEncoder.pretrained(name="tfhub_use",lang="en") \
      .setInputCols(["document"])\
      .setOutputCol("sentence_embeddings")

nlpPipeline = Pipeline(stages=[
      documentAssembler, 
      use,
      fake_classifier
  ])

empty_data = spark.createDataFrame([[""]]).toDF("text")

fake_clf_model = nlpPipeline.fit(empty_data)


In [0]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/spam_ham_dataset.csv
  
dbutils.fs.cp("file:/databricks/driver/spam_ham_dataset.csv", "dbfs:/") 

In [0]:
fake_lp_pipeline = LightPipeline(fake_clf_model)

text = 'BREAKING: Leaked Picture Of Obama Being Dragged Before A Judge In Handcuffs For Wiretapping Trump'

fake_lp_pipeline.annotate(text)

In [0]:
sample_data = spark.createDataFrame([[text]]).toDF("text")

sample_data.show(truncate=False)

In [0]:
pred = fake_clf_model.transform(sample_data)


In [0]:
pred.show()

In [0]:
pred.select('text','class.result').show(truncate=False)

you can find more samples here >> `https://github.com/KaiDMML/FakeNewsNet/tree/master/dataset`

## Generic classifier function

In [0]:
def get_clf_lp(model_name, sentiment_dl=False, pretrained=True):

  documentAssembler = DocumentAssembler()\
      .setInputCol("text")\
      .setOutputCol("document")

  use = UniversalSentenceEncoder.pretrained(lang="en") \
      .setInputCols(["document"])\
      .setOutputCol("sentence_embeddings")


  if pretrained:

    if sentiment_dl:

      document_classifier = SentimentDLModel.pretrained(model_name, 'en') \
                .setInputCols(["sentence_embeddings"]) \
                .setOutputCol("class")
    else:
      document_classifier = ClassifierDLModel.pretrained(model_name, 'en') \
                .setInputCols(["sentence_embeddings"]) \
                .setOutputCol("class")

  else:

    if sentiment_dl:

      document_classifier = SentimentDLModel.load(model_name) \
                .setInputCols(["sentence_embeddings"]) \
                .setOutputCol("class")
    else:
      document_classifier = ClassifierDLModel.load(model_name) \
                .setInputCols(["sentence_embeddings"]) \
                .setOutputCol("class")

  print ('classes:',document_classifier.getClasses())

  nlpPipeline = Pipeline(stages=[
                  documentAssembler, 
                  use,
                  document_classifier
  ])

  empty_data = spark.createDataFrame([[""]]).toDF("text")

  clf_pipelineFit = nlpPipeline.fit(empty_data)

  clf_lp_pipeline = LightPipeline(clf_pipelineFit)

  return clf_lp_pipeline


In [0]:
clf_lp_pipeline = get_clf_lp('classifierdl_use_trec50')

trained on the TREC datasets:

Classify open-domain, fact-based questions into one of the following broad semantic categories: 

```Abbreviation, Description, Entities, Human Beings, Locations or Numeric Values.```

In [0]:
text = 'What was the number of member nations of the U.N. in 2000?'

clf_lp_pipeline.annotate(text)['class']

In [0]:
clf_lp_pipeline.fullAnnotate(text)[0]['class'][0].result

In [0]:
clf_lp_pipeline.fullAnnotate(text)[0]['class'][0].metadata

In [0]:
text = 'What animal was the first mammal successfully cloned from adult cells?'

clf_lp_pipeline.annotate(text)['class']

In [0]:
clf_lp_pipeline = get_clf_lp('classifierdl_use_cyberbullying')


In [0]:
text ='RT @EBeisner @ahall012 I agree with you!! I would rather brush my teeth with sandpaper then watch football with a girl!!'

clf_lp_pipeline.annotate(text)['class']

In [0]:
clf_lp_pipeline = get_clf_lp('classifierdl_use_fakenews')


In [0]:
text ='Donald Trump a KGB Spy? 11/02/2016 In today’s video, Christopher Greene of AMTV reports Hillary Clinton campaign accusation that Donald Trump is a KGB spy is about as weak and baseless a claim as a Salem witch hunt or McCarthy era trial. It’s only because Hillary Clinton is losing that she is lobbing conspiracy theory. Citizen Quasar The way I see it, one of two things will happen: 1. Trump will win by a landslide but the election will be stolen via electronic voting, just like I have been predicting for over a decade, and the American People will accept the skewed election results just like they accept the TSA into their crotches. 2. Somebody will bust a cap in Hillary’s @$$ killing her and the election will be postponed. Follow AMTV!'

clf_lp_pipeline.annotate(text)['class']


In [0]:
text ='Sen. Marco Rubio (R-Fla.) is adding a veteran New Hampshire political operative to his team as he continues mulling a possible 2016 presidential bid, the latest sign that he is seriously preparing to launch a campaign later this year.Jim Merrill, who worked for former GOP presidential nominee Mitt Romney and ran his 2008 and 2012 New Hampshire primary campaigns, joined Rubio’s fledgling campaign on Monday, aides to the senator said.Merrill will be joining Rubio’s Reclaim America PAC to focus on Rubio’s New Hampshire and broader Northeast political operations."Marco has always been well received in New Hampshire, and should he run for president, he would be very competitive there," Terry Sullivan, who runs Reclaim America, said in a statement. "Jim certainly knows how to win in New Hampshire and in the Northeast, and will be a great addition to our team at Reclaim America.”News of Merrill’s hire was first reported by The New York Times.'

clf_lp_pipeline.annotate(text)['class']

In [0]:
sentiment_lp_pipeline = get_clf_lp('sentimentdl_use_twitter', sentiment_dl=True)

In [0]:
text ='I am SO happy the news came out in time for my birthday this weekend! My inner 7-year-old cannot WAIT!'

sentiment_lp_pipeline.annotate(text)['class']

In [0]:
sentiment_lp_pipeline = get_clf_lp('classifierdl_use_emotion', sentiment_dl=False)


In [0]:
sentiment_lp_pipeline.annotate(text)['class']

## ClassiferDL with Word Embeddings and Text Preprocessing

### Load Dataset

In [0]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv
  
dbutils.fs.cp("file:/databricks/driver/news_category_train.csv", "dbfs:/")
dbutils.fs.cp("file:/databricks/driver/news_category_test.csv", "dbfs:/")

In [0]:
%sh cd /databricks/driver/ && ls -lt 

In [0]:
trainDataset = spark.read \
    .option("header", True) \
    .csv("/news_category_train.csv")\
    .sample(0.0001) # For demonstration purposes only sample a fraction of the dataset

trainDataset.show(truncate=50)

In [0]:
trainDataset.count()


In [0]:
from pyspark.sql.functions import col

trainDataset.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

In [0]:
testDataset = spark.read \
    .option("header", True) \
    .csv("/news_category_test.csv")\
    .sample(0.001) # For demonstration purposes only sample a fraction of the dataset

testDataset.show(truncate=50)

In [0]:
testDataset.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

In [0]:
# if we want to split the dataset
'''(trainData, testData) = trainDataset.randomSplit([0.7, 0.3], seed = 100)
print("Train Dataset Count: " + str(trainData.count()))
print("Test Dataset Count: " + str(testData.count()))'''


In [0]:
document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")

tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("lemma")


### with Glove 100d embeddings

In [0]:
%fs mkdirs dbfs:/clf_glove_logs

In [0]:

glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d") \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("category")\
      .setMaxEpochs(2)\
      .setBatchSize(64) \
      .setLr(5e-3) \
      .setDropout(0.5)\
      .setEnableOutputLogs(True)\
      .setOutputLogsPath('dbfs:/clf_glove_logs')

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

In [0]:
'''
default classifierDL params:

maxEpochs -> 10,
lr -> 5e-3f,
dropout -> 0.5f,
batchSize -> 64,
enableOutputLogs -> false,
verbose -> Verbose.Silent.id,
validationSplit -> 0.0f,
outputLogsPath -> ""
'''

In [0]:
# Train (2 min for 2 epochs)

clf_pipelineModel = clf_pipeline.fit(trainDataset)

In [0]:
%sh cd /dbfs/clf_glove_logs/ && ls -lt 

In [0]:
%sh cat /dbfs/clf_glove_logs/* 

In [0]:
# get the predictions on test Set

preds = clf_pipelineModel.transform(testDataset)

preds.select('category','description',"class.result").show(10, truncate=80)

In [0]:
# due to bug in cluster mode (https://github.com/JohnSnowLabs/spark-nlp/issues/857) , as a workaround, you can just save the fitted model and then load back from dbfs and then transform on the test set. 
clf_pipelineModel.stages[-1].write().overwrite().save('dbfs:/databricks/driver/models/ClassifierDL_wordemb_g100d')
classsifierdlmodel_loaded = ClassifierDLModel.load('dbfs:/databricks/driver/models/ClassifierDL_wordemb_g100d')


clf_pipeline_pred = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            clf_pipelineModel.stages[-2],
            classsifierdlmodel_loaded])

empty_data = spark.createDataFrame([[""]]).toDF("description")

result = clf_pipeline_pred.fit(empty_data).transform(testDataset)

In [0]:
preds_df = result.select('category','description',"class.result").toPandas()

# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['category']))


### with Bert Embeddings

In [0]:
document_assembler = DocumentAssembler() \
                .setInputCol("description") \
                .setOutputCol("document")

tokenizer = Tokenizer() \
                .setInputCols(["document"]) \
                .setOutputCol("token")
      
bert_embeddings = BertEmbeddings().pretrained(name='small_bert_L4_256', lang='en') \
                .setInputCols(["document",'token'])\
                .setOutputCol("embeddings")

embeddingsSentence = SentenceEmbeddings() \
                .setInputCols(["document", "embeddings"]) \
                .setOutputCol("sentence_embeddings") \
                .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
                .setInputCols(["sentence_embeddings"])\
                .setOutputCol("class")\
                .setLabelColumn("category")\
                .setMaxEpochs(2)\
                .setLr(0.001)\
                .setBatchSize(8)\
                .setEnableOutputLogs(True)\
                .setOutputLogsPath('dbfs:/clf_bert_logs')

bert_clf_pipeline = Pipeline(stages=[document_assembler,
                                     tokenizer,
                                     bert_embeddings,
                                     embeddingsSentence,
                                     classsifierdl])

In [0]:
# remove the existing logs

! rm -r /root/annotator_logs

In [0]:
# training will take some time due to Bert (use GPU runtime when possible)

bert_clf_pipelineModel = bert_clf_pipeline.fit(trainDataset)

In [0]:
%sh cd /dbfs/clf_bert_logs/ && ls -lt 

In [0]:
%sh cat /dbfs/clf_bert_logs/* 

In [0]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

preds = bert_clf_pipelineModel.transform(testDataset)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['category'], preds_df['result']))

#### testDataset Parameter

In [0]:
embedding_pipeline = Pipeline(stages=[document_assembler,
                                     tokenizer,
                                     bert_embeddings,
                                     embeddingsSentence])

In [0]:
test_data = embedding_pipeline.fit(testDataset).transform(testDataset)

In [0]:
test_data.printSchema()

In [0]:
test_data.write.mode("overwrite").parquet('dbfs:/test_data.parquet')

In [0]:
classsifierdl = ClassifierDLApproach()\
                .setInputCols(["sentence_embeddings"])\
                .setOutputCol("class")\
                .setLabelColumn("category")\
                .setMaxEpochs(2)\
                .setLr(0.001)\
                .setBatchSize(8)\
                .setEnableOutputLogs(True)\
                .setTestDataset("dbfs:/test_data.parquet")\
                .setOutputLogsPath('clf_logs')

clf_pipeline = Pipeline(stages=[document_assembler,
                                tokenizer,
                                bert_embeddings,
                                embeddingsSentence,
                                classsifierdl])

In [0]:
%%time
clf_model = clf_pipeline.fit(trainDataset)

In [0]:
from sklearn.metrics import classification_report

preds = clf_model.transform(testDataset)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['category'], preds_df['result']))

### Getting prediction from Trained model

In [0]:
from sparknlp.base import LightPipeline

light_model = LightPipeline(clf_pipelineModel)

In [0]:
text='''
Fearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. 
As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.
'''
result = light_model.annotate(text)

result['class']

In [0]:
light_model.annotate('the soccer games will be postponed.')

## ClassifierDL with Universal Sentence Embeddings

In [0]:
%fs mkdirs dbfs:/clf_use_logs

In [0]:
# actual content is inside description column
document = DocumentAssembler()\
      .setInputCol("description")\
      .setOutputCol("document")
    
# we can also use sentece detector here if we want to train on and get predictions for each sentence

use = UniversalSentenceEncoder.pretrained()\
     .setInputCols(["document"])\
     .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("category")\
      .setMaxEpochs(2)\
      .setLr(0.001)\
      .setBatchSize(8)\
      .setEnableOutputLogs(True)\
      .setOutputLogsPath('dbfs:/clf_use_logs')

use_clf_pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

In [0]:
# 5 epochs takes around 8 min

use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [0]:
%sh cd  /dbfs/clf_use_logs/ && ls -l


In [0]:
%sh cat /dbfs/clf_use_logs/*

### Saving & loading back the trained model

In [0]:
use_pipelineModel.stages

In [0]:
use_pipelineModel.stages[2].write().overwrite().save('/databricks/driver/models/ClassifierDL_USE_e5')

In [0]:
classsifierdlmodel = ClassifierDLModel.load('dbfs:/databricks/driver/models/ClassifierDL_USE_e5')

In [0]:
clf_lp = get_clf_lp('dbfs:/databricks/driver/models/ClassifierDL_USE_e5', sentiment_dl=False,  pretrained=False)

In [0]:
clf_lp.annotate(text)['class']

# SentimentDL Classifier

see also here >> `https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/SentimentDL_train_multiclass_sentiment_classifier.ipynb`

In [0]:
!wget -q aclimdb_train.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_train.csv
!wget -q aclimdb_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sentiment-corpus/aclimdb/aclimdb_test.csv
  
dbutils.fs.cp("file:/databricks/driver/aclimdb_train.csv", "dbfs:/")
dbutils.fs.cp("file:/databricks/driver/aclimdb_test.csv", "dbfs:/")

In [0]:
trainDataset = spark.read \
      .option("header", True) \
      .csv("/aclimdb_train.csv") \
      .sample(0.02) # For demonstration purposes only sample a fraction of the dataset

trainDataset.show(10)
trainDataset.count()

In [0]:
testDataset = spark.read \
      .option("header", True) \
      .csv("/aclimdb_test.csv") \
      .sample(0.02)

testDataset.show(10)
testDataset.count()

In [0]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

use = UniversalSentenceEncoder.pretrained() \
   .setInputCols(["document"])\
   .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
sentimentdl = SentimentDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("label")\
  .setMaxEpochs(2)\
  .setEnableOutputLogs(True)

pipeline = Pipeline(stages = [
    document,
    use,
    sentimentdl
    ])


In [0]:
pipelineModel = pipeline.fit(trainDataset)

In [0]:
result = pipelineModel.transform(testDataset)

result_df = result.select('text','label',"class.result").toPandas()

result_df.head(10)

Unnamed: 0,text,label,result
0,"It's hard to believe that a movie this bad wasn't produced once, but four times! Most movies req...",negative,[negative]
1,"I saw this film for one reason: the tagline is ""Upset the head and you're dead!"" Cracking. It's...",negative,[negative]
2,"There have been several comments already on the site focusing on the ""prestige"" feel of the film...",positive,[positive]
3,Visconti's masterpiece! I admit that I am unfamiliar with much of his work but I cannot imagine ...,positive,[positive]
4,Let me just say - I love the horror genre to the extent that I see every single one that I can g...,positive,[positive]
5,"As other reviewers have noted, this movie is a cross between (i.e. stolen from) stories we have ...",negative,[negative]
6,This movie had potential. If it had been handled differently. What it needed was a different dir...,negative,[negative]
7,"An interesting comedy, taking place on a train from Stockholm to Berlin, December 1945. One can'...",positive,[positive]
8,"OK, if you would judge the movie to now a days it wouldn't fit in to well.If you watched FI now ...",positive,[positive]
9,Charlie Wilson's war is an excellent example of how films should be made. This movie is of the h...,positive,[positive]


# Multilabel Classifier

see also here >> `https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/classification/MultiClassifierDL_train_multi_label_toxic_classifier.ipynb`

In [0]:
!curl -O 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/toxic_comments/toxic_train.snappy.parquet'
!curl -O 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/toxic_comments/toxic_test.snappy.parquet'

dbutils.fs.cp("file:/databricks/driver/toxic_train.snappy.parquet", "dbfs:/")
dbutils.fs.cp("file:/databricks/driver/toxic_test.snappy.parquet", "dbfs:/")

In [0]:
trainDataset = spark.read.parquet("dbfs:/toxic_train.snappy.parquet").sample(0.02)
testDataset = spark.read.parquet("dbfs:/toxic_train.snappy.parquet").sample(0.02)

print(trainDataset.count())
print(testDataset.count())

In [0]:
# Let's use shrink to remove new lines in the comments
document = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")\
  .setCleanupMode("shrink")

# Here we use the state-of-the-art Universal Sentence Encoder model from TF Hub
embeddings = UniversalSentenceEncoder.pretrained() \
  .setInputCols(["document"])\
  .setOutputCol("sentence_embeddings")

# We will use MultiClassifierDL built by using Bidirectional GRU and CNNs inside TensorFlow that supports up to 100 classes
# We will use only 5 Epochs but feel free to increase it on your own dataset
multiClassifier = MultiClassifierDLApproach()\
  .setInputCols("sentence_embeddings")\
  .setOutputCol("category")\
  .setLabelColumn("labels")\
  .setBatchSize(128)\
  .setMaxEpochs(2)\
  .setLr(1e-3)\
  .setThreshold(0.5)\
  .setShufflePerEpoch(False)\
  .setEnableOutputLogs(True)\
  .setValidationSplit(0.1)

pipeline = Pipeline(
    stages = [
        document,
        embeddings,
        multiClassifier
    ])

In [0]:
pipelineModel = pipeline.fit(trainDataset)

In [0]:
preds = pipelineModel.transform(testDataset)
preds_df = preds.select('text','labels',"category.result").toPandas()
preds_df.head(10)

Unnamed: 0,text,labels,result
0,God is dead\nI don't mean to startle anyone but God is dead. We should not worry about him anymo...,[toxic],"[toxic, insult, obscene]"
1,"i have a dick, its bigger than yours! hahaha","[toxic, severe_toxic, obscene]","[toxic, insult, obscene]"
2,The only vandals are pathetic wiki administrators as encyclopediadramatica.com says right!,[toxic],"[toxic, insult]"
3,"""\n\n SHUT THE FUCK UP \n\n SHUT THE FUCK UP You bother me again, there will be problems ""","[toxic, obscene]","[toxic, insult, obscene]"
4,"You can't kill the party Soap, YOU CAN'T KILL THE PARTY!",[toxic],[toxic]
5,DreadedWalrus thinks you are gay. \n\nDreadedWalrus thinks you are gay.,[toxic],"[toxic, insult, obscene]"
6,picking on a 10 year old!!! soooo sad \n\nJust shut up okay?Im only 10 years old. I just wanted ...,"[toxic, insult]","[toxic, insult, obscene]"
7,"Hi. Welcome me to nothing bitch, I've been using Wikipedia for over 3 years. I don't vandalise...","[toxic, obscene, insult]","[toxic, insult]"
8,how is pointing out that you are ignorant a personal attack? I didn't make this up. It isn't y...,"[toxic, insult]","[toxic, insult, obscene]"
9,I'm sure you eat alot of tube steak,[toxic],"[toxic, insult, obscene]"


## Multilabel Classifier with Bert Embeddings

In [0]:
# Let's use shrink to remove new lines in the comments
document = DocumentAssembler()\
              .setInputCol("text")\
              .setOutputCol("document")\
              .setCleanupMode("shrink")

bert_sent = BertSentenceEmbeddings.pretrained('sent_small_bert_L8_512')\
              .setInputCols(["document"])\
              .setOutputCol("sentence_embeddings")

# We will use MultiClassifierDL built by using Bidirectional GRU and CNNs inside TensorFlow that supports up to 100 classes
# We will use only 5 Epochs but feel free to increase it on your own dataset

multiClassifier = MultiClassifierDLApproach()\
              .setInputCols("sentence_embeddings")\
              .setOutputCol("category")\
              .setLabelColumn("labels")\
              .setBatchSize(128)\
              .setMaxEpochs(2)\
              .setLr(1e-3)\
              .setThreshold(0.5)\
              .setShufflePerEpoch(False)\
              .setEnableOutputLogs(True)\
              .setValidationSplit(0.1)

pipeline = Pipeline(stages = [document,
                              bert_sent,
                              multiClassifier])

In [0]:
%%time
pipelineModel = pipeline.fit(trainDataset)

In [0]:
preds = pipelineModel.transform(testDataset)

In [0]:
preds_df = preds.select('text','labels',"category.result").toPandas()

In [0]:
preds_df.head(10)

Unnamed: 0,text,labels,result
0,God is dead\nI don't mean to startle anyone but God is dead. We should not worry about him anymo...,[toxic],[toxic]
1,"i have a dick, its bigger than yours! hahaha","[toxic, severe_toxic, obscene]","[toxic, insult, obscene]"
2,The only vandals are pathetic wiki administrators as encyclopediadramatica.com says right!,[toxic],[toxic]
3,"""\n\n SHUT THE FUCK UP \n\n SHUT THE FUCK UP You bother me again, there will be problems ""","[toxic, obscene]","[toxic, insult, obscene]"
4,"You can't kill the party Soap, YOU CAN'T KILL THE PARTY!",[toxic],"[toxic, insult, obscene]"
5,DreadedWalrus thinks you are gay. \n\nDreadedWalrus thinks you are gay.,[toxic],[toxic]
6,picking on a 10 year old!!! soooo sad \n\nJust shut up okay?Im only 10 years old. I just wanted ...,"[toxic, insult]","[toxic, insult]"
7,"Hi. Welcome me to nothing bitch, I've been using Wikipedia for over 3 years. I don't vandalise...","[toxic, obscene, insult]",[toxic]
8,how is pointing out that you are ignorant a personal attack? I didn't make this up. It isn't y...,"[toxic, insult]",[toxic]
9,I'm sure you eat alot of tube steak,[toxic],"[toxic, insult, obscene]"


# Case Study: Conference Title Classification

In [0]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/title_conference.csv
  
dbutils.fs.cp("file:/databricks/driver/title_conference.csv", "dbfs:/")

In [0]:
import pandas as pd
df = pd.read_csv('title_conference.csv')
df

Unnamed: 0,Title,Conference
0,Innovation in Database Management: Computer Science vs. Engineering.,VLDB
1,High performance prime field multiplication for GPU.,ISCAS
2,enchanted scissors: a scissor interface for support in cutting and interactive fabrication.,SIGGRAPH
3,Detection of channel degradation attack by Intermediary Node in Linear Networks.,INFOCOM
4,Pinning a Complex Network through the Betweenness Centrality Strategy.,ISCAS
...,...,...
2502,A new QR-decomposition based recursive frequency estimator for multiple sinusoids in impulsive n...,ISCAS
2503,CNN Implementation of Spin Filters for Electronic Speckle Pattern Interferometry Applications.,ISCAS
2504,FaceKit: A Database Interface Design Toolkit.,VLDB
2505,On the trade-off between the number of scrolls and the operating frequency of the chaotic attrac...,ISCAS


In [0]:
df.Conference.value_counts()

In [0]:
trainDataset = spark.read \
      .option("header", True) \
      .csv("/title_conference.csv") \
      .sample(0.1)

(trainingData, testData) = trainDataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))


In [0]:
document = DocumentAssembler()\
      .setInputCol("Title")\
      .setOutputCol("document")
    
# we can also use sentece detector here if we want to train on and get predictions for each sentence

use = UniversalSentenceEncoder.pretrained()\
      .setInputCols(["document"])\
      .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("Conference")\
      .setMaxEpochs(2)\
      .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

In [0]:
use_pipelineModel = use_clf_pipeline.fit(trainingData)

In [0]:
use_pipelineModel.stages

In [0]:
use_pipelineModel.stages[-1].write().overwrite().save('dbfs:/databricks/driver/models/use_clf')
use_clf_loaded = ClassifierDLModel.load('dbfs:/databricks/driver/models/use_clf')

use_clf_pipeline_pred = Pipeline(
    stages=[document, 
            use_pipelineModel.stages[-2],
            use_clf_loaded])

empty_data = spark.createDataFrame([[""]]).toDF("description")

result = use_clf_pipeline_pred.fit(empty_data).transform(testData)

In [0]:
result.select('Title','Conference',"class.result").show(10, truncate=80)


In [0]:
# We are going to use sklearn to evalute the results on test dataset
preds_df = result.select('Conference','Title',"class.result").toPandas()

# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['Conference']))


In [0]:
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

preds = pd.DataFrame(confusion_matrix(preds_df['result'], preds_df['Conference']), columns = np.unique(preds_df['Conference']), index =  np.unique(preds_df['Conference']))
preds

Unnamed: 0,INFOCOM,ISCAS,SIGGRAPH,VLDB,WWW
INFOCOM,0,0,0,0,0
ISCAS,7,18,9,12,6
SIGGRAPH,0,0,0,0,0
VLDB,0,0,0,0,0
WWW,0,0,0,0,0


### Bert Sentence Embeddings

In [0]:
document = DocumentAssembler()\
              .setInputCol("Title")\
              .setOutputCol("document")
    
bert_sent = BertSentenceEmbeddings.pretrained('sent_small_bert_L8_512')\
              .setInputCols(["document"])\
              .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
              .setInputCols(["sentence_embeddings"])\
              .setOutputCol("class")\
              .setLabelColumn("Conference")\
              .setMaxEpochs(2)\
              .setLr(0.001)\
              .setBatchSize(8)\
              .setEnableOutputLogs(True)

bert_clf_pipeline = Pipeline(stages = [document,
                                      bert_sent,
                                      classsifierdl])

In [0]:
%%time
bert_pipelineModel = bert_clf_pipeline.fit(trainingData)

In [0]:
from sklearn.metrics import classification_report

preds = bert_pipelineModel.transform(testData)

preds_df = preds.select('Conference',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['Conference'], preds_df['result']))

# Save model and Zip it for Modelshub Upload/Downloads

In [0]:
# Save a Spark NLP pipeline
bert_clf_pipeline.write().overwrite().save('file:/databricks/driver/models/my_nlp_pipeline')

In [0]:
# cd into saved dir and zip
! cd /databricks/driver/models/my_nlp_pipeline && zip -r my_nlp_pipeline.zip *

End of Notebook # 4