![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Financial Word and Sentence Embeddings

# Finance Word and Sentence Embeddings visualization using PCA (Principal Component Analysis)

Modern NLP models work with numerical representations of texts and their meaning. For token classification problems (inferring a class for each token, for example Named Entity Recognition), Word Embeddings are required. For sentence, paragraph, or document classification, we use Sentence Embeddings.

In this notebook, we obtain token embeddings with the Spark NLP Finance Word Embeddings model (**bert_embeddings_sec_bert_base**) and then derive sentence embeddings from those token embeddings with the Spark NLP `SentenceEmbeddings` annotator, giving us numerical representations of the semantics of the texts. Each embedding is a 768-dimensional vector, which is impossible to interpret by eye.

There are many techniques we can use to visualize those embeddings. We are using one of them - Principal Component Analysis, a dimensionality reduction process, carried out by Spark MLLib. Both kinds of embeddings have 768 dimensions, so we will reduce them from **768** to **3** dimensions (X, Y, Z) and use color to show the label of each word / sentence.

# Installation

In [0]:
from johnsnowlabs import * 
import pandas as pd

# Get sample text

In [0]:
# Downloading sample datasets.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/finance_pca_samples.csv
dbutils.fs.cp("file:/databricks/driver/finance_pca_samples.csv", "dbfs:/")

In [0]:
import pandas as pd

df = pd.read_csv("finance_pca_samples.csv")

In [0]:
# Create spark dataframe
sdf = spark.createDataFrame(df)
sdf.show()

# Pipeline with Spark NLP and Spark MLLIB

In [0]:
# We defined a generic pipeline for word and sentence embeddings

def generic_pipeline():
    """Build the token-embedding pipeline shared by both visualizations.

    The stages are, in order: a DocumentAssembler reading the "text" column,
    a Tokenizer, and the pretrained English BertEmbeddings model
    "bert_embeddings_sec_bert_base" writing to "word_embeddings".

    Returns:
        An unfitted ``nlp.Pipeline`` containing those three stages.
    """
    assembler = (
        nlp.DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    token_splitter = (
        nlp.Tokenizer()
        .setInputCols("document")
        .setOutputCol("token")
    )

    sec_bert = (
        nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
        .setInputCols(["document", "token"])
        .setOutputCol("word_embeddings")
    )

    return nlp.Pipeline(stages=[assembler, token_splitter, sec_bert])



## Sentence Embeddings

In [0]:
# Spark NLP SentenceEmbeddings annotator: builds one embedding per document
# from its token embeddings ("word_embeddings") using the AVERAGE pooling strategy.
embeddings_sentence = nlp.SentenceEmbeddings()\
    .setInputCols(["document", "word_embeddings"])\
    .setOutputCol("sentence_embeddings")\
    .setPoolingStrategy("AVERAGE")
# We use the Spark NLP SentenceEmbeddings annotator to get each sentence embedding from the token embeddings

# Custom transform to retrieve the numerical embeddings from Spark NLP and pass it to Spark MLLib

In [0]:
# This class extracts the embeddings from the Spark NLP Annotation object
# from pyspark import ml as ML
## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark import ml as ML
from pyspark import keyword_only 

class EmbeddingsUDF(
    Transformer, ML.param.shared.HasInputCol, ML.param.shared.HasOutputCol,
    ML.util.DefaultParamsReadable, ML.util.DefaultParamsWritable
):
    """Transformer that turns Spark NLP sentence embeddings into MLlib vectors.

    ``_transform`` explodes ``sentence_embeddings.embeddings`` into one row per
    embedding and adds two columns:

    * ``features`` - the embedding as a dense ``ML.linalg`` vector, ready to be
      consumed by Spark MLlib stages such as PCA.
    * ``emb_sum``  - the sum of the embedding's components.

    Rows whose ``emb_sum`` equals 0.0 are dropped.
    """

    @keyword_only
    def __init__(self):
        # Import locally so the class does not rely on `F` / `T` having been
        # bound at notebook/module level before it is instantiated.
        from pyspark.sql import functions as F
        import pyspark.sql.types as T

        super(EmbeddingsUDF, self).__init__()

        def _sum(r):
            # Plain float accumulation over the embedding components.
            result = 0.0
            for e in r:
                result += e
            return result

        # UDFs are created once per instance and reused by `_transform`.
        self.udfs = {
            'convertToVectorUDF': F.udf(lambda vs: ML.linalg.Vectors.dense(vs), ML.linalg.VectorUDT()),
            'sumUDF': F.udf(lambda r: _sum(r), T.FloatType())
        }

    def _transform(self, dataset):
        """Return `dataset` with exploded embeddings plus `features` and `emb_sum`.

        Args:
            dataset: DataFrame containing a ``sentence_embeddings`` annotation
                column with an ``embeddings`` field.

        Returns:
            DataFrame with all input columns plus ``embeddings``, ``features``
            and ``emb_sum``, restricted to rows where ``emb_sum`` is not 0.0.
        """
        # Local import for the same reason as in __init__.
        from pyspark.sql import functions as F

        # One output row per embedding array found in the annotation column.
        results = dataset.select(
            "*", F.explode("sentence_embeddings.embeddings").alias("embeddings")
        )
        results = results.withColumn(
            "features",
            self.udfs['convertToVectorUDF'](F.col("embeddings"))
        )
        results = results.withColumn(
            "emb_sum",
            self.udfs['sumUDF'](F.col("embeddings"))
        )
        # Remove those with embeddings all zeroes (so we can calculate cosine distance)
        results = results.where(F.col("emb_sum") != 0.0)

        return results

In [0]:
from pyspark.sql import functions as F
import pyspark.sql.types as T

embeddings_for_pca = EmbeddingsUDF()

In [0]:
DIMENSIONS  = 3

In [0]:
pca = ML.feature.PCA(k=DIMENSIONS, inputCol="features", outputCol="pca_features")

### Full Spark NLP + Spark MLLib pipeline

In [0]:
# The whole process runs in a single pipeline
pipeline = nlp.Pipeline().setStages([generic_pipeline(), embeddings_sentence, embeddings_for_pca, pca])

In [0]:
model = pipeline.fit(sdf)

In [0]:
result = model.transform(sdf)

In [0]:
result.select('pca_features', 'label').show(truncate=False)

In [0]:
df = result.select('pca_features', 'label').toPandas()

df
# As you see, dimension values are inside a list

Unnamed: 0,pca_features,label
0,"[3.395761733456288, -1.0603583667210579, -1.56...",Accounts
1,"[2.366080448286008, 0.8591978691981672, -0.806...",Accounts
2,"[0.6867676011553301, 1.4823995675537482, 0.006...",Credit Cards
3,"[-0.28834654292887507, 1.0031584374166118, -0....",Credit Cards
4,"[-0.5037846809116963, -1.3771537751758807, 0.4...",Credit Reporting
5,"[1.0397520771220115, -1.7194098205845816, 1.85...",Credit Reporting
6,"[2.7731648234189037, 1.1680314504169131, 1.394...",Debt Collection
7,"[-0.45951420852073444, 0.8339738969601689, 0.5...",Debt Collection
8,"[0.2703033212234329, 1.1069461667059015, -0.42...",Loans
9,"[0.8662474660433654, 1.1435305015895525, 0.870...",Loans


In [0]:
# We extract the dimension values out of the list

# Split the 3-component PCA vector into one scalar column per axis.
for axis_index, axis_name in enumerate(["x", "y", "z"]):
    df[axis_name] = df["pca_features"].apply(lambda vec, i=axis_index: vec[i])

# Keep only the plotting coordinates and the legend column.
df = df[["x", "y", "z", "label"]]

df

Unnamed: 0,x,y,z,label
0,3.395762,-1.060358,-1.568793,Accounts
1,2.36608,0.859198,-0.806617,Accounts
2,0.686768,1.4824,0.006591,Credit Cards
3,-0.288347,1.003158,-0.79638,Credit Cards
4,-0.503785,-1.377154,0.444973,Credit Reporting
5,1.039752,-1.71941,1.85394,Credit Reporting
6,2.773165,1.168031,1.394944,Debt Collection
7,-0.459514,0.833974,0.505171,Debt Collection
8,0.270303,1.106946,-0.424756,Loans
9,0.866247,1.143531,0.870355,Loans


In [0]:
import plotly.express as px

fig = px.scatter_3d(df, x = 'x', y = 'y', z = 'z', color = 'label', width=800, height=600)

fig.show()

### Word Embeddings

We can also visualize the semantics of words, instead of full texts, by using Word Embeddings. We will use a Tokenizer and a WordEmbeddings model to get those embeddings, and then apply PCA as before. First, we split the pipeline in two so that we can retrieve all token embeddings.

In [0]:
model = generic_pipeline().fit(sdf)

In [0]:
result = model.transform(sdf)

In [0]:
from pyspark.sql import functions as F

# Pair each token string with its embedding (arrays_zip), then explode so that
# every (token, embedding) pair becomes its own row next to the document label.
# The fields of the zipped struct are read back as cols['0'] (token) and
# cols['1'] (embedding), matching the argument order given to arrays_zip.
result_df = result.select("label", F.explode(F.arrays_zip(result.token.result, result.word_embeddings.embeddings)).alias("cols"))\
      .select(F.expr("cols['0']").alias("token"),
              "label",
              F.expr("cols['1']").alias("embeddings"))

result_df.show(truncate = 80)


In [0]:
# Here we define a subclass of the previously defined EmbeddingsUDF class
class WordEmbeddingsUDF(EmbeddingsUDF):
    """EmbeddingsUDF variant for data whose embeddings are already one per row.

    Expects the columns ``token``, ``label`` and ``embeddings`` and reuses the
    UDFs created by the parent class.
    """

    def _transform(self, dataset):
        """Add `features` and `emb_sum` columns and drop rows with a 0.0 sum."""
        to_vector = self.udfs['convertToVectorUDF']
        component_sum = self.udfs['sumUDF']
        embedding_col = F.col("embeddings")

        # No explode step here: the embeddings column is already exploded.
        return (
            dataset.select('token', 'label', 'embeddings')
            .withColumn("features", to_vector(embedding_col))
            .withColumn("emb_sum", component_sum(embedding_col))
            # Remove zero-sum embeddings (so we can calculate cosine distance)
            .where(F.col("emb_sum") != 0.0)
        )

In [0]:
embeddings_for_pca = WordEmbeddingsUDF()

In [0]:
DIMENSIONS  = 3

In [0]:
pca = ML.feature.PCA(k=DIMENSIONS, inputCol="features", outputCol="pca_features")

### Full Spark NLP + Spark MLLib pipeline

In [0]:
# We run the second part of the pipeline. Here the 768 dimensions are reduced to 3 dimensions

pipeline = nlp.Pipeline().setStages([embeddings_for_pca, pca])


In [0]:
model = pipeline.fit(result_df)

In [0]:
result = model.transform(result_df)

In [0]:
result.select("token", "label", "pca_features").show(truncate = 60)

In [0]:
df = result.select('token', 'label', 'pca_features').toPandas()

df

Unnamed: 0,token,label,pca_features
0,I,Accounts,"[9.85046796699116, 0.021824760616261294, 1.712..."
1,called,Accounts,"[0.5703253366247248, 0.34666249136598587, -2.8..."
2,Huntington,Accounts,"[8.635446663422016, 0.8802328948982534, -0.841..."
3,Bank,Accounts,"[9.39106134939763, 0.4506675629754985, -1.2157..."
4,to,Accounts,"[-2.093784238264132, -1.1261847621412933, 4.47..."
...,...,...,...
1364,the,Mortgage,"[0.20783244995060898, 1.2121737442831522, 2.34..."
1365,company,Mortgage,"[0.9758776803817245, 1.1525679357974887, 1.548..."
1366,never,Mortgage,"[-0.009449005811853118, -1.3605046537433145, -..."
1367,responds,Mortgage,"[-1.3105358968915206, -0.39519741635318867, -1..."


In [0]:
# Split the 3-component PCA vector into one scalar column per axis.
for axis_index, axis_name in enumerate(["x", "y", "z"]):
    df[axis_name] = df["pca_features"].apply(lambda vec, i=axis_index: vec[i])

# Keep the token and label for hover/legend plus the plotting coordinates.
df = df[["token", "label", "x", "y", "z"]]

df

Unnamed: 0,token,label,x,y,z
0,I,Accounts,9.850468,0.021825,1.712889
1,called,Accounts,0.570325,0.346662,-2.867727
2,Huntington,Accounts,8.635447,0.880233,-0.841711
3,Bank,Accounts,9.391061,0.450668,-1.215742
4,to,Accounts,-2.093784,-1.126185,4.473377
...,...,...,...,...,...
1364,the,Mortgage,0.207832,1.212174,2.345685
1365,company,Mortgage,0.975878,1.152568,1.548877
1366,never,Mortgage,-0.009449,-1.360505,-0.080950
1367,responds,Mortgage,-1.310536,-0.395197,-1.634090


In [0]:
import plotly.express as px

fig = px.scatter_3d(df, x = 'x', y = 'y', z = 'z', color = "label", width=1000, height = 800, hover_data = ["token", "label"])

fig.show()