

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_BTC.ipynb)






# **Detect Entities in Twitter texts**

## 1. Colab Setup

In [None]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

# Install Spark NLP Display lib
! pip install --upgrade -q spark-nlp-display

In [2]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

## 2. Start Spark Session

In [3]:
# Start a Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the Spark NLP library version in use for this run.
print("Spark NLP Version :", sparknlp.version())

# Leave the session as the cell's last expression so the notebook renders it.
spark

Spark NLP Version : 4.2.8


## 3. Sample texts

In [4]:
# Sample tweet-style sentences fed to the NER pipeline (one document per entry).
# The texts are kept verbatim, including their original spacing and spelling.
test_sentences = [
    "Wengers big mistakes is not being ruthless enough with bad players.",
    "Aguero goal . From being someone previously so reliable , he 's been terrible this year .",
    "Paul Scholes approached Alex Ferguson about making a comeback . Ferguson clearly only too happy to accommodate him .",
    "Wikipedia today , as soon as you load the website , hit ESC to prevent the 'blackout ' from loading.",
    "David Attenborough shows us a duck billed platypus.",
    "London GET UPDATES FROM Peter Hotez",
    "Pentagram's Dominic Lippa is working on a new identity for University of Arts London ",
]

# Both names refer to the same list object.
text_list = test_sentences

## 4. Define Spark NLP pipeline

In [5]:
# Each stage is built as a parenthesised method chain instead of backslash
# continuations, so no stage can accidentally run into the statement after it.

# Stage 1: turn the raw "text" column into "document" annotations.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into "token" annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: pretrained English BERT token classifier ("bert_token_classifier_ner_btc").
# The model is downloaded the first time this cell runs; it writes one tag per token to "ner".
tokenClassifier = (
    BertForTokenClassification.pretrained("bert_token_classifier_ner_btc", "en")
    .setInputCols(["token", "document"])
    .setOutputCol("ner")
    .setCaseSensitive(True)
)

# Stage 4: combine the token-level tags into entity chunks ("ner_chunk").
ner_converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in execution order.
pipeline = Pipeline(
    stages=[
        document,
        tokenizer,
        tokenClassifier,
        ner_converter,
    ]
)



bert_token_classifier_ner_btc download started this may take some time.
Approximate size to download 385.3 MB
[OK!]


## 5. Run the pipeline

In [6]:
# Build a single-column DataFrame ("text") with one sample sentence per row.
df = spark.createDataFrame(data=text_list, schema=StringType()).toDF("text")

# Fit the pipeline on the DataFrame, then annotate that same DataFrame.
fitted_pipeline = pipeline.fit(df)
result = fitted_pipeline.transform(df)


## 6. Visualize results

In [7]:
# One row per detected entity: the chunk text and its predicted label.
# The `ner_chunk` annotations are exploded directly rather than zipped with
# `document.result`: that column was never read, and because it always holds one
# element it padded texts without any entity to a row of nulls. Reading the
# struct fields by name also avoids relying on arrays_zip's positional field names.
(
    result
    .select(F.explode(F.col("ner_chunk")).alias("annotation"))
    .select(
        F.col("annotation.result").alias("chunk"),
        F.col("annotation.metadata")["entity"].alias("result"),
    )
    .show(truncate=False)
)

+-------------------------+------+
|chunk                    |result|
+-------------------------+------+
|Wengers                  |PER   |
|Aguero                   |PER   |
|Paul Scholes             |PER   |
|Alex Ferguson            |PER   |
|Ferguson                 |PER   |
|Wikipedia                |ORG   |
|David Attenborough       |PER   |
|London                   |LOC   |
|Peter Hotez              |PER   |
|Pentagram's              |ORG   |
|Dominic Lippa            |PER   |
|University of Arts London|ORG   |
+-------------------------+------+



In [8]:
from sparknlp_display import NerVisualizer

# Collect the annotated rows once. `result` is not cached, so each `collect()`
# re-executes the whole pipeline (model inference included); calling it inside
# the loop repeated that work for every sentence.
annotated_rows = result.collect()

# One visualizer instance is enough for all rows.
ner_visualizer = NerVisualizer()

# Render the highlighted entities of each input text, in input order.
for annotated_row in annotated_rows:
  ner_visualizer.display(
      result = annotated_row,
      label_col = 'ner_chunk',
      document_col = 'document'
  )

