In [7]:
!pip install pyspark
!pip install spark-nlp
import sparknlp
from pyspark.ml import Pipeline

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession
print('Packages installed successfully')

Packages installed successfully


Starting a sparknlp session

In [3]:
# Create the Spark session through Spark NLP's helper; `spark` is used below
# to build the input DataFrame.
spark = sparknlp.start()

In [39]:
# Sample input for the pipeline: three sentences with two deliberate
# misspellings ("gong" for "going", "potntial" for "potential") that the
# spell-check stage is later evaluated against.
sample_text = """
No I am not gong to give you a PT. No I am not going to name a ticker that is clearly over extended and ready for the RSI to cool off. This stock is in a nice dip here and it has the potntial to move this week.
"""

Make a dataframe out of the given text to make it easier for processing

In [40]:
# One-row DataFrame with a single string column named `text` holding the sample.
data = spark.createDataFrame([(sample_text,)], ['text'])

In [41]:
# Print the input DataFrame without truncating the long text column.
data.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
No I am not gong to give you a PT. No I am not going to name a ticker that is clearly over extended and ready for the RSI to cool off. This stock is in a nice dip here and it has the potntial to move this week.
|
+-------------------------------------------------------------------------------------------------------------------------------------------

In [42]:
# Form a document annotation out of the DataFrame so it can be passed down the
# NER pipeline.
#   1) setInputCol('text')      - read the DataFrame column that holds the sample text
#   2) setOutputCol('document') - write the result to a column named `document`
#   3) setCleanupMode('shrink') - remove unwanted whitespace from the input text
# These notes are `#` comments on purpose: a bare triple-quoted string at the
# end of a cell is an expression, so Jupyter echoes it back as the cell output.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
    .setCleanupMode('shrink')
)

"\n1) Indicate that the input column is the dataframe containing the sample text\n2) The output column will be named document\n3) setCleanUpMode('shrink') is used to remove any unwanted spaces in the inputted text\n"

Stage 1 - Split up the given text into its constituent sentences

In [43]:
# The input text is made up of several sentences, so split the `document`
# annotation into individual sentences under the `sentence` column.
#   - setExplodeSentences(True) places each detected sentence in its own
#     DataFrame row (rows, not columns), so every later stage processes one
#     sentence at a time.
# These notes are `#` comments on purpose: a bare triple-quoted string at the
# end of a cell is an expression, so Jupyter echoes it back as the cell output.
sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
    .setExplodeSentences(True)
)

'\n1) Similar to the above document assembler, this line of code splits up the given sample text into its own individual rows, \nunder the column named sentence\n2) setExplodeSentence(True) is used to place each separate sentence in its own dataframe row\n3) This is for efficiency purposes\n'

Stage 2 - Split up each sentence into separate meaningful tokens

In [44]:
# Break every sentence row into tokens, stored in the `token` column.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

Stage 3 - Correct any spelling errors (the text is user-written Reddit data, so typos are expected)

In [45]:
# Load the default pretrained Norvig spell checker (downloaded to the local
# system on first use); it reads `token` and writes corrected tokens to `checked`.
checker = (
    NorvigSweetingModel.pretrained()
    .setInputCols(['token'])
    .setOutputCol('checked')
)

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


Stage 4 - Word embeddings: the default pretrained GloVe model (glove_100d) is loaded here, not BERT, so each token gets a static vector rather than a context-dependent one

In [46]:
# Word-embedding stage. These are static GloVe vectors (`glove_100d`, the model
# WordEmbeddingsModel.pretrained() downloads by default), NOT contextual BERT
# embeddings: each token gets the same vector regardless of its context.
# The model name is spelled out so the choice is visible in the code.
# NOTE(review): the default `ner_dl` tagger used in the NER stage is documented
# as trained on glove_100d; if this stage is switched to BERT, the NER model
# must be switched to a BERT-based one as well - confirm before changing.
# The result is written to the `embeddings` column.
embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d')
    .setInputCols(['sentence', 'checked'])
    .setOutputCol('embeddings')
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


'\nBert carries out bidirectional text parsing to infer context from a sentence with greater accuracy\nOutputs the result into the embeddings column\n'

Stage 5 - NER stage to tag each token

In [47]:
# 1) The pretrained NER model reads the sentence, the spell-checked tokens and
#    their embeddings, and writes one CoNLL-style tag per token (e.g. B-ORG, O)
#    to the `ner` column.
# 2) The converter then takes those per-token tags and groups them into named
#    entity chunks in the `chunk` column, which is easier to interpret than
#    the raw tag sequence.
# These notes are `#` comments on purpose: a bare triple-quoted string at the
# end of a cell is an expression, so Jupyter echoes it back as the cell output.
ner = (
    NerDLModel.pretrained()
    .setInputCols(['sentence', 'checked', 'embeddings'])
    .setOutputCol('ner')
)
converter = (
    NerConverter()
    .setInputCols(['sentence', 'checked', 'ner'])
    .setOutputCol('chunk')
)

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


'\n1) The pretrained NER model takes in the above parameters and produces the respective taggings in the ner column\n2) The converter then takes in the ner taggings and convert them into something meaningful we can understand using the \nCoNLL format so that we can identify what is the meaning of each tag\n'

Final stage - Forming the pipeline

In [48]:
# Chain the stages in execution order: each stage consumes columns produced by
# the stages listed before it.
pipeline = Pipeline(
    stages=[
        document,
        sentence,
        tokenizer,
        checker,
        embeddings,
        ner,
        converter,
    ]
)

In [49]:
# Fit the pipeline on the sample DataFrame to obtain the fitted pipeline model.
model = pipeline.fit(data)

In [50]:
# Run every stage over the sample DataFrame; `result` carries the output
# column of each stage (sentence, checked, ner, ...) inspected below.
result = model.transform(data)

Analysing the result at each stage in the pipeline

In [51]:
# Sentence-detection output: one row per detected sentence.
result.select('sentence.result').show(truncate=False)

+-----------------------------------------------------------------------------------------------------+
|result                                                                                               |
+-----------------------------------------------------------------------------------------------------+
|[No I am not gong to give you a PT.]                                                                 |
|[No I am not going to name a ticker that is clearly over extended and ready for the RSI to cool off.]|
|[This stock is in a nice dip here and it has the potntial to move this week.]                        |
+-----------------------------------------------------------------------------------------------------+



Checking spell check output

In [52]:
# Spell-checker output: the corrected token list for each sentence row.
result.select('checked.result').show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                      |
+----------------------------------------------------------------------------------------------------------------------------+
|[No, I, am, not, gong, to, give, you, a, PT, .]                                                                             |
|[No, I, am, not, going, to, name, a, ticker, that, is, clearly, over, extended, and, ready, for, the, RSI, to, cool, off, .]|
|[This, stock, is, in, a, nice, dip, here, and, it, has, the, potential, to, move, this, week, .]                            |
+----------------------------------------------------------------------------------------------------------------------------+



In [53]:
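# As seen above, the checker corrected only one of the two intentionally introduced spelling errors: 'potntial' became 'potential', but 'gong' was left unchanged. This is most likely because 'gong' is itself a valid English word, so a dictionary-based checker has no reason to flag it.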

Checking the ner result

In [54]:
# NER output: one tag per token for each sentence row.
result.select('ner.result').show(truncate=False)

+-------------------------------------------------------------------------+
|result                                                                   |
+-------------------------------------------------------------------------+
|[O, O, O, O, O, O, O, O, O, B-ORG, O]                                    |
|[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ORG, O, O, O, O]|
|[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]                   |
+-------------------------------------------------------------------------+



In [55]:
# The model tagged two stock metrics, the price target (PT) and the Relative Strength Index (RSI), as B-ORG, i.e. the beginning of an organisation name. Neither is an organisation, so there is some room for improvement here.