![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Text Classification with ClassifierDL

In [3]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

# Report the library versions this notebook session is running against.
nlp_version = sparknlp.version()
print("Spark NLP version", nlp_version)

spark_version = spark.version
print("Apache Spark version:", spark_version)

# Left as the final bare expression so the notebook echoes the session object.
spark

## Load Dataset

In [5]:
# Download the news-category train and test CSV files from the
# spark-nlp-workshop repository ("!" hands the line to a shell from the notebook).
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

In [6]:
# Copy both CSV files from the driver's local directory to the DBFS root.
# NOTE(review): the later spark.read calls use bare file names, which presumably
# resolve against the DBFS root on Databricks — confirm.
dbfs_root = "dbfs:/"
dbutils.fs.cp("file:/databricks/driver/news_category_train.csv", dbfs_root)
dbutils.fs.cp("file:/databricks/driver/news_category_test.csv", dbfs_root)

In [7]:
# Load the training CSV, treating its first line as the header row.
train_reader = spark.read.option("header", True)
trainDataset = train_reader.csv("news_category_train.csv")

# Preview the first rows, cutting each cell off at 50 characters.
trainDataset.show(truncate=50)

In [8]:
# Number of rows in the training set.
trainDataset.count()


In [9]:
from pyspark.sql.functions import col

# Class balance of the training set: rows per category, most frequent first.
train_category_counts = trainDataset.groupBy("category").count()
train_category_counts.orderBy(col("count").desc()).show()

In [10]:
# Load the test CSV, treating its first line as the header row.
test_reader = spark.read.option("header", True)
testDataset = test_reader.csv("news_category_test.csv")

# Class balance of the test set: rows per category, most frequent first.
test_category_counts = testDataset.groupBy("category").count()
test_category_counts.orderBy(col("count").desc()).show()

In [11]:
# Optional: carve a train/test split out of trainDataset instead of using the
# separate test CSV. This is kept as real comments rather than a bare
# triple-quoted string, because a string literal is an evaluated expression
# that the notebook would echo as the cell's output. Uncomment to use.
#
# (trainingData, testData) = trainDataset.randomSplit([0.7, 0.3], seed = 100)
# print("Training Dataset Count: " + str(trainingData.count()))
# print("Test Dataset Count: " + str(testData.count()))

## ClassifierDL with Word Embeddings and Text Preprocessing

In [13]:
# Training pipeline: text preprocessing -> word embeddings -> ClassifierDL.
# Column flow, as wired by the setInputCol(s)/setOutputCol calls below:
#   description -> document -> token -> normalized -> cleanTokens -> lemma
#   -> embeddings -> sentence_embeddings -> class

# Entry point: wraps the raw "description" text column as "document".
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Input column passed as a list, matching every other stage in this pipeline.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Pretrained lemmatizer, loaded by name.
lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# pretrained() is called on the class itself: it is a loader that returns the
# model, so constructing an empty WordEmbeddingsModel() first only creates a
# throwaway object. With no arguments it loads the library's default model.
# NOTE(review): presumably a GloVe model, going by the variable name — confirm.
glove_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", 'lemma'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pools the word embeddings of each document into one vector using "AVERAGE".
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier: labels come from the "category" column and
# predictions are written to "class". Training is capped at 3 epochs.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(3)
    .setEnableOutputLogs(True)
)

clf_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

In [15]:
# Train the whole pipeline on the training set.
# NOTE(review): the timing note here originally read "8 min for 10 epochs", but the
# classifier in clf_pipeline is configured with setMaxEpochs(3) — confirm which is intended.
clf_pipelineModel = clf_pipeline.fit(trainDataset)

In [16]:
# Persist only the last pipeline stage (the fitted classifier) to DBFS,
# overwriting any previous save at that path.
# NOTE(review): the path says "e5" although the classifier is configured for 3 epochs — confirm.
clf_pipelineModel.stages[-1].write().overwrite().save('dbfs:/ClassifierDL_wordemb_e5')


In [18]:
# Get the predictions on the test set by running it through the fitted pipeline.

preds = clf_pipelineModel.transform(testDataset)


In [19]:
# Score the test set, then display the gold label, the text and the predicted
# label(s) for the first 10 rows, with each cell cut off at 80 characters.
preds = clf_pipelineModel.transform(testDataset)

preview_columns = ['category', 'description', "class.result"]
preds.select(*preview_columns).show(10, truncate=80)


In [20]:

# Bring the gold label, the text and the predicted labels to the driver as a
# pandas DataFrame.
prediction_rows = preds.select('category', 'description', "class.result")
preds_df = prediction_rows.toPandas()

# Each "result" cell is an array (Spark NLP can emit one entry per sentence);
# keep only its first element so every row carries a single predicted label.
preds_df['result'] = preds_df['result'].map(lambda labels: labels[0])


In [21]:
# Workaround for a cluster-mode bug (https://github.com/JohnSnowLabs/spark-nlp/issues/857):
# save the fitted classifier, load it back from DBFS, and score the test set
# through a pipeline built around the reloaded model.
saved_classifier_path = 'dbfs:/ClassifierDL_wordemb_e5'
fitted_stages = clf_pipelineModel.stages

fitted_stages[-1].write().overwrite().save(saved_classifier_path)
classsifierdlmodel_loaded = ClassifierDLModel.load(saved_classifier_path)

# Same preprocessing stages as in training; the last two slots reuse the
# second-to-last stage of the fitted pipeline and the classifier reloaded from disk.
prediction_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    glove_embeddings,
    fitted_stages[-2],
    classsifierdlmodel_loaded,
]
clf_pipeline_pred = Pipeline(stages=prediction_stages)

# One-row frame with an empty "description"; it is passed to fit() only so that
# a PipelineModel exists to call transform() on.
empty_data = spark.createDataFrame([[""]]).toDF("description")

prediction_model = clf_pipeline_pred.fit(empty_data)
result = prediction_model.transform(testDataset)

In [22]:
# Collect the gold label, the text and the predicted labels as a pandas DataFrame.
preds_df = result.select('category','description',"class.result").toPandas()

# The result is an array since in Spark NLP you can have multiple sentences.
# Keep only the first element so each row carries a single predicted label.
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evaluate the results on the test dataset.
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred): the ground-truth "category"
# goes first and the predicted "result" second. Passing them the other way
# round swaps the reported precision and recall of every class.
print(classification_report(preds_df['category'], preds_df['result']))


## ClassifierDL with Universal Sentence Embeddings

In [24]:
# actual content is inside description column
# The text to classify lives in the "description" column.
document = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

# A sentence detector could also be inserted here to train on, and get
# predictions for, each sentence separately.

# Pretrained Universal Sentence Encoder: turns each document directly into
# "sentence_embeddings", with no token-level preprocessing stages.
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# The classes/labels are read from the "category" column; predictions are
# written to "class". Training is capped at 5 epochs.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

In [25]:
# Train the Universal Sentence Encoder + ClassifierDL pipeline on the training set.
use_pipelineModel = use_clf_pipeline.fit(trainDataset)
# 5 epochs takes around 10 min


## Getting predictions from the trained model

In [27]:
# Wrap the fitted pipeline in a LightPipeline; it is used below to annotate a
# plain Python string directly instead of a Spark DataFrame.
from sparknlp.base import LightPipeline

light_model = LightPipeline(use_pipelineModel)

In [28]:
# Look at the first two "description" values of the test set.
testDataset.select('description').take(2)

In [29]:
# Free-form sample text to classify with the light pipeline.
text='''
Fearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. 
As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.
'''
# annotate() takes the raw string; its return value is indexed by output column name below.
result = light_model.annotate(text)

# "class" is the output column configured on the classifier stage.
result['class']

### Saving the trained model

In [31]:
# List the fitted stages; their positions are used below to save the classifier
# (index 2) and to reuse the sentence encoder (index 1).
use_pipelineModel.stages

In [32]:
# Save only the stage at index 2 — the fitted classifier, third stage of
# use_clf_pipeline — to DBFS, overwriting any earlier save at that path.
use_pipelineModel.stages[2].write().overwrite().save('dbfs:/ClassifierDL_USE_20200923_e5')

In [33]:
# Load the saved classifier back from DBFS as a ClassifierDLModel.
use_classsifierdlmodel_loaded = ClassifierDLModel.load('dbfs:/ClassifierDL_USE_20200923_e5')


In [34]:
# Inference pipeline: the document assembler, the sentence-encoder stage taken
# from the fitted pipeline (index 1), and the classifier reloaded from DBFS.
inference_stages = [
    document,
    use_pipelineModel.stages[1],
    use_classsifierdlmodel_loaded,
]
use_pipeline_pred = Pipeline(stages=inference_stages)

# One-row frame with an empty "description"; it is passed to fit() only so that
# a PipelineModel exists to call transform() on.
empty_data = spark.createDataFrame([[""]]).toDF("description")

use_prediction_model = use_pipeline_pred.fit(empty_data)
use_result = use_prediction_model.transform(testDataset)


In [35]:
# Collect the gold label, the text and the predicted labels as a pandas DataFrame.
preds_df = use_result.select('category','description',"class.result").toPandas()

# The result is an array since in Spark NLP you can have multiple sentences.
# Keep only the first element so each row carries a single predicted label.
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evaluate the results on the test dataset.
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred): the ground-truth "category"
# goes first and the predicted "result" second. Passing them the other way
# round swaps the reported precision and recall of every class.
print(classification_report(preds_df['category'], preds_df['result']))
