

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/CLASSIFICATION_TR_NEWS.ipynb)




# **Classify Question Pairs**

## 1. Colab Setup

In [None]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

## 2. Start Spark Session

In [2]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 4.2.8
Apache Spark version: 3.3.0


## 3. Some sample examples

In [3]:
q1_list = ["How is your studies going?", 'If the Universe was born at the Big Bang, what existed before then?', 
           'After Obama finishes his presidency, does he still receive Secret Service protection?', 
          'Am looking for motivational books to read?',  'Antonio has a deep prejudice against Shylock. Is Shylocks anger towards Antonio justified? Why or why not?']

q2_list = ["How is your days going?",  'What actually existed before the Big Bang?', 
           'Does President Obama and his family have secret service protection for the rest of their lives?',
          'What motivational books one should read?', 'Who is the hero of The Merchant of Venice?']

## 4. Define Spark NLP pipeline

In [4]:
pipeline = PretrainedPipeline("classifierdl_electra_questionpair_pipeline", "en")

classifierdl_electra_questionpair_pipeline download started this may take some time.
Approx size to download 1.2 GB
[OK!]


In [5]:
pipeline.model.stages

[DocumentAssembler_ea1cf6483ac6,
 BERT_SENTENCE_EMBEDDINGS_f583b1187cd8,
 ClassifierDLModel_dcc5daafb83e]

In [6]:
pipeline.model.stages[-1].getInputCols()

['document', 'sentence_embeddings']

In [7]:
pipeline.model.stages[-1].setInputCols(['sentence_embeddings'])

ClassifierDLModel_dcc5daafb83e

## 5. Run the pipeline

In [8]:
pipeline.fullAnnotate(f"q1: How is your studies going q2: How is your days going?")

Before _validateStagesInputCols


[{'document': [Annotation(document, 0, 56, q1: How is your studies going q2: How is your days going?, {}, [])],
  'sentence_embeddings': [Annotation(sentence_embeddings, 0, 56, q1: How is your studies going q2: How is your days going?, {'sentence': '0', 'token': 'q1: How is your studies going q2: How is your days going?', 'pieceId': '-1', 'isWordStart': 'true'}, [])],
  'class': [Annotation(category, 0, 56, almost_same, {'sentence': '0', 'almost_same': '1.0', 'not_same': '1.4908887E-22'}, [])]}]

In [9]:
## Getting one result

res =  pipeline.fullAnnotate(f"q1: How is your studies going q2: How is your days going?")

print(res[0]['class'][0].result)

Before _validateStagesInputCols
almost_same


In [10]:
## Get all the results and save it in a dict

results = {}
for i, q1 in enumerate(q1_list):
    for j, q2 in enumerate(q1_list):
        result =  pipeline.fullAnnotate(f"q1: {q1} q2: {q2}") 
        a = result[0]['class'][0].result
        
        results[f'{i}{j}'] = a

Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols
Before _validateStagesInputCols


## 6. Visualize results

In [11]:
print(results)

{'00': 'almost_same', '01': 'not_same', '02': 'not_same', '03': 'not_same', '04': 'not_same', '10': 'not_same', '11': 'almost_same', '12': 'not_same', '13': 'not_same', '14': 'not_same', '20': 'not_same', '21': 'not_same', '22': 'almost_same', '23': 'not_same', '24': 'not_same', '30': 'not_same', '31': 'not_same', '32': 'not_same', '33': 'almost_same', '34': 'not_same', '40': 'not_same', '41': 'not_same', '42': 'not_same', '43': 'not_same', '44': 'almost_same'}
