![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb)

# Calculating Text Similarity

In [None]:
# Only run this cell when you are using Spark NLP on Google Colab.
# Downloads colab.sh and pipes it straight into bash (`-O -` writes the file to stdout).
# NOTE(review): the script is fetched over plain HTTP and executed without verification —
# consider whether an HTTPS URL is available.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.functions import *

# Explicit import kept after the wildcard imports so none of them can shadow it.
# The wildcard order above is intentional: later star imports override earlier names.
from pyspark.sql import Window

import sparknlp

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use so the run can be reproduced.
sparkNlpVersion = sparknlp.version()
sparkVersion = spark.version
print("Spark NLP version: ", sparkNlpVersion)
print("Apache Spark version: ", sparkVersion)

Spark NLP version:  4.3.1
Apache Spark version:  3.3.0


In [None]:
# Load the two corpora to compare; the first line of each CSV is treated as the header.
primaryCorpus = spark.read.csv("file1.csv", header=True)
secondaryCorpus = spark.read.csv("file2.csv", header=True)

In [None]:
from pyspark.ml.feature import Normalizer

# Wrap the raw "text" column into the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Detect sentences inside each document; sentences are not exploded into separate rows.
sentence = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
    .setExplodeSentences(False)
)

# Tokenize each detected sentence.
# NOTE(review): `Tokenizer` is exported by several of the wildcard imports; this presumably
# resolves to the Spark NLP annotator (it is configured via setInputCols) — confirm.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Token-level BERT embeddings computed from the sentence and token annotations.
# NOTE(review): a cased checkpoint ('bert_base_cased') is combined with
# setCaseSensitive(False) — confirm this pairing is intended.
bertEmbeddings = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

# Pool the token embeddings of each sentence into a single vector using the AVERAGE strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "bert"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Expose both embedding annotation columns as vector columns for Spark ML,
# keeping the original annotation columns in the DataFrame.
embeddingsFinisher = (
    EmbeddingsFinisher()
    .setInputCols(["sentence_embeddings", "bert"])
    .setOutputCols("sentence_embeddings_vectors", "bert_vectors")
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)


# Turn the array of sentence vectors into one row per vector, exposed as "features".
explodeVectors = SQLTransformer() \
    .setStatement("SELECT EXPLODE(sentence_embeddings_vectors) AS features, * FROM __THIS__")

# L1-normalised copy of the features (p=1.0), written to "normFeatures".
# Note: the LSH stage below reads "features", so "normFeatures" is carried along
# but not consumed by any stage defined in this cell.
vectorNormalizer = Normalizer() \
    .setInputCol("features") \
    .setOutputCol("normFeatures") \
    .setP(1.0)

# Locality-sensitive hashing over the raw "features" column.
# The random projections are seeded explicitly so that the hash buckets — and therefore
# the candidate pairs returned by approxSimilarityJoin — are reproducible across runs.
similarityChecker = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    bucketLength=6.0,
    numHashTables=6,
    seed=42,
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
# Assemble the stages in execution order: Spark NLP annotators first,
# then the Spark ML transformers that operate on the finished vectors.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentence,
        tokenizer,
        bertEmbeddings,
        embeddingsSentence,
        embeddingsFinisher,
        explodeVectors,
        vectorNormalizer,
        similarityChecker,
    ]
)

In [None]:
# Fit once on the primary corpus, then apply the same fitted model to both corpora.
pipelineModel = pipeline.fit(primaryCorpus)

primaryDF, secondaryDF = (
    pipelineModel.transform(corpus)
    for corpus in (primaryCorpus, secondaryCorpus)
)

In [None]:
# Build the primary side of the comparison: text, vectors, an MD5 lookup key and a row id.
#
# The id is a consecutive 0-based row number. monotonically_increasing_id() on its own is
# only guaranteed to be unique and increasing — its values depend on partitioning, so they
# are not consecutive and are not safe to use as a positional join key between two
# DataFrames. It is used here only to preserve the current row order.
# Caveat: a window without partitionBy moves all rows into a single partition.
dfA = (
    primaryDF
    .select("text", "features", "normFeatures")
    .withColumn("lookupKey", md5("text"))
    .withColumn("id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
)
dfA.show()

+--------------------+--------------------+--------------------+--------------------+---+
|                text|            features|        normFeatures|           lookupKey| id|
+--------------------+--------------------+--------------------+--------------------+---+
|Wall Decals Lamp ...|[0.04242564737796...|[2.48993627607806...|bbc5a89d7cf3354ea...|  0|
|iphone charger ph...|[0.37093448638916...|[0.00200630526885...|37c2b6ab956f9ebd6...|  1|
+--------------------+--------------------+--------------------+--------------------+---+



In [None]:
# Build the secondary side of the comparison: text, vectors and a row id.
#
# The id is a consecutive 0-based row number. monotonically_increasing_id() on its own is
# only guaranteed to be unique and increasing — its values depend on partitioning, so they
# are not consecutive and are not safe to use as a positional join key between two
# DataFrames. It is used here only to preserve the current row order.
# Caveat: a window without partitionBy moves all rows into a single partition.
dfB = (
    secondaryDF
    .select("text", "features", "normFeatures")
    .withColumn("id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
)
dfB.show()

+--------------------+--------------------+--------------------+---+
|                text|            features|        normFeatures| id|
+--------------------+--------------------+--------------------+---+
|Curtains & Valanc...|[0.30033871531486...|[0.00192763000744...|  0|
|iphone case Apple...|[0.44015255570411...|[0.00236218518925...|  1|
+--------------------+--------------------+--------------------+---+



In [None]:
# Approximately join dfA and dfB using the fitted LSH model.
# Index 8 selects the ninth stage of the fitted pipeline, which is called here as the LSH model.
lshModel = pipelineModel.stages[8]

# 100 is the distance threshold: candidate pairs farther apart than this are discarded.
candidatePairs = lshModel.approxSimilarityJoin(dfA, dfB, 100, distCol="distance")

# Keep only the pairs whose rows carry the same id on both sides.
alignedPairs = candidatePairs.where(col("datasetA.id") == col("datasetB.id"))

# The output columns are named idA / idB but hold the texts of each side.
alignedPairs.select(
    col("datasetA.text").alias("idA"),
    col("datasetB.text").alias("idB"),
    col("distance"),
).show()


+--------------------+--------------------+------------------+
|                 idA|                 idB|          distance|
+--------------------+--------------------+------------------+
|Wall Decals Lamp ...|Curtains & Valanc...|3.7816639073044893|
|iphone charger ph...|iphone case Apple...| 5.666233511624179|
+--------------------+--------------------+------------------+



## Approach 2: Exact cosine similarity with pandas and SciPy

In [None]:
from pyspark.sql.functions import PandasUDFType, pandas_udf
import pyspark.sql.functions as F

# Give each side distinct column names so both survive the join.
# New variable names are used instead of rebinding dfA / dfB: overwriting them would
# remove the "text" and "features" columns that the LSH similarity-join cell relies on,
# so that cell would fail if it were re-run after this one.
primaryRenamed = (
    dfA
    .withColumnRenamed('text', 'primaryText')
    .withColumnRenamed('features', 'primaryFeatures')
)

secondaryRenamed = (
    dfB
    .withColumnRenamed('text', 'secondaryText')
    .withColumnRenamed('features', 'secondaryFeatures')
)

# Pair rows that share the same id, then drop the columns not needed for the comparison.
joinedDF = primaryRenamed.join(secondaryRenamed, "id", "inner").drop("id", "normFeatures")

joinedDF.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         primaryText|     primaryFeatures|           lookupKey|       secondaryText|   secondaryFeatures|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Wall Decals Lamp ...|[0.04242564737796...|bbc5a89d7cf3354ea...|Curtains & Valanc...|[0.30033871531486...|
|iphone charger ph...|[0.37093448638916...|37c2b6ab956f9ebd6...|iphone case Apple...|[0.44015255570411...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
from scipy.spatial.distance import cosine

# Collect the joined pairs to the driver as a pandas DataFrame.
finalDF = joinedDF.toPandas()


def _cosine_similarity(row):
    """Return 1 minus the SciPy cosine distance between the two feature vectors of a row."""
    return 1 - cosine(row['primaryFeatures'], row['secondaryFeatures'])


# One similarity score per paired row.
finalDF['cosine'] = finalDF.apply(_cosine_similarity, axis=1)
finalDF

Unnamed: 0,primaryText,primaryFeatures,lookupKey,secondaryText,secondaryFeatures,cosine
0,Wall Decals Lamp Shades Armchairs Bed Sheets N...,"[0.042425647377967834, -0.226881206035614, -0....",bbc5a89d7cf3354ea4887c3690404ad8,Curtains & Valances Wall Decals & Stickers Bed...,"[0.3003387153148651, -0.022465573623776436, -0...",0.942328
1,iphone charger phone Gift case iPhone holder s...,"[0.37093448638916016, 0.07500777393579483, -0....",37c2b6ab956f9ebd6dccebd7623bf8c1,iphone case Apple ipod,"[0.4401525557041168, -0.09592525660991669, 0.0...",0.885493
