![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/RobertaTokenClassifier.ipynb)

# `RobertaTokenClassifier` **Models**

## 1. Colab Setup

In [None]:
# Installing pyspark and spark-nlp
! pip install -q pyspark==3.2.0 spark-nlp

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

Import Libraries

In [2]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
import pandas as pd
import numpy as np
import json
import os

from pyspark.sql.types import StringType, IntegerType

from sparknlp_display import NerVisualizer

## 2. Start Spark Session

In [3]:
# Create the Spark session through Spark NLP's helper. The `spark32` flag is
# passed because pyspark 3.2.0 is the version installed in the setup cell.
spark = sparknlp.start(spark32=True)

# Report which Spark NLP release is actually loaded in this kernel.
print(f"Spark NLP version {sparknlp.version()}")

# Leave the session as the last expression so the notebook renders it.
spark

Spark NLP version 3.4.2


## 3. Define Spark NLP pipeline

In [4]:
def run_pipeline(model, text, lang="en"):
    """Run a pretrained RoBERTa token classifier on `text` and display the results.

    The pipeline is DocumentAssembler -> SentenceDetectorDLModel -> Tokenizer ->
    RoBertaForTokenClassification -> NerConverter. After transforming the input,
    the function prints a two-column table (`chunk`, `ner_label`) covering every
    input row and renders each input document with `NerVisualizer`.

    This function uses the notebook-level `spark` session, which must already
    exist when it is called.

    Args:
        model: Name of the pretrained `RoBertaForTokenClassification` model to load.
        text: A list of strings (one document per element) or a single string,
            which is treated as a one-element list.
        lang: Language code passed to `pretrained(model, lang)`. Defaults to "en".

    Returns:
        None. All results are printed or rendered as cell output.
    """
    # Models that are loaded as-is; every other model additionally gets
    # case sensitivity and the maximum sentence length set explicitly.
    default_config_models = {
        "roberta_token_classifier_ticker",
        "roberta_token_classifier_timex_semeval",
        "roberta_token_classifier_bne_capitel_ner",
        "roberta_token_classifier_icelandic_ner",
        "roberta_token_classifier_pos_tagger",
    }

    # A bare string would otherwise be handed to createDataFrame as the data
    # collection itself; wrap it so it becomes exactly one document.
    if isinstance(text, str):
        text = [text]

    documentAssembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    # The sentence detector is always loaded with the "xx" language code,
    # independent of `lang`.
    sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(['sentence']) \
        .setOutputCol('token')

    ner_converter = NerConverter() \
        .setInputCols(["sentence", "token", "ner"]) \
        .setOutputCol("ner_chunk")

    if model in default_config_models:
        tokenClassifier = RoBertaForTokenClassification.pretrained(model, lang) \
            .setInputCols(["sentence", 'token']) \
            .setOutputCol("ner")
    else:
        tokenClassifier = RoBertaForTokenClassification \
            .pretrained(model, lang) \
            .setInputCols(['token', 'sentence']) \
            .setOutputCol('ner') \
            .setCaseSensitive(True) \
            .setMaxSentenceLength(512)

    pipeline = Pipeline(stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        tokenClassifier,
        ner_converter,
    ])

    # All stages are pretrained or rule-based, so fitting on a single empty row
    # only serves to turn the Pipeline into a PipelineModel.
    empty_data = spark.createDataFrame([[""]]).toDF("text")
    pipeline_model = pipeline.fit(empty_data)

    df = spark.createDataFrame(text, StringType()).toDF("text")
    result = pipeline_model.transform(df)

    # Pair every chunk's text with its metadata map, one output row per chunk,
    # then pull the entity label out of the metadata.
    result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias("cols")) \
        .select(F.expr("cols['0']").alias("chunk"),
                F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

    # Render every input document, not only the first one, so the visual output
    # matches the table above. With no rows the loop simply renders nothing.
    visualizer = NerVisualizer()
    for row in result.collect():
        visualizer.display(
            result=row,
            label_col='ner_chunk',
            document_col='document')


## 4. RoBertaForTokenClassification Models and Outputs

### `roberta_token_classifier_ticker` model

In [None]:
# Tag stock-ticker symbols in an English sentence.
# `lang` is not passed, so run_pipeline uses its default of "en".
model = "roberta_token_classifier_ticker"

text = ["""There are some serious purchases and sales of AMZN, NFLX and GPRO stock today"""]

run_pipeline(model, text)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_token_classifier_ticker download started this may take some time.
Approximate size to download 443.8 MB
[OK!]
+-----+---------+
|chunk|ner_label|
+-----+---------+
|AMZN |TICKER   |
|NFLX |TICKER   |
|GPRO |TICKER   |
+-----+---------+



### `roberta_token_classifier_timex_semeval` model

In [None]:
# Run the time-expression model on a sentence that mixes a clock time,
# a duration and weekday names. `lang` defaults to "en".
model = "roberta_token_classifier_timex_semeval"

text = ["""Model training was started at 22:12C and it took 3 days from Tuesday to Friday."""]

run_pipeline(model, text)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_token_classifier_timex_semeval download started this may take some time.
Approximate size to download 419.1 MB
[OK!]
+-------+-----------------+
|chunk  |ner_label        |
+-------+-----------------+
|22:12C |Period           |
|3      |Number           |
|days   |Calendar-Interval|
|Tuesday|Day-Of-Week      |
|to     |Between          |
|Friday |Day-Of-Week      |
+-------+-----------------+



### `roberta_large_token_classifier_conll03` model

In [None]:
# Run the large CoNLL-03 model on a short sentence containing a person name.
# This model is not in run_pipeline's list of as-is models, so the helper also
# sets case sensitivity and a maximum sentence length of 512 on it.
model = "roberta_large_token_classifier_conll03"

text = ["""My name is John!"""]

run_pipeline(model, text)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_large_token_classifier_conll03 download started this may take some time.
Approximate size to download 1.2 GB
[OK!]
+-----+---------+
|chunk|ner_label|
+-----+---------+
|John |PER      |
+-----+---------+



### `roberta_base_token_classifier_ontonotes` model

In [None]:
# Run the base OntoNotes model on the same short sentence used for the
# CoNLL-03 model, so the label sets can be compared (PERSON here vs. PER there).
model = "roberta_base_token_classifier_ontonotes"

text = ["""My name is John!"""]

run_pipeline(model, text)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_base_token_classifier_ontonotes download started this may take some time.
Approximate size to download 434.7 MB
[OK!]
+-----+---------+
|chunk|ner_label|
+-----+---------+
|John |PERSON   |
+-----+---------+



### `roberta_large_token_classifier_ontonotes` model

In [None]:
# Run the large OntoNotes model on the same sentence as the base variant.
model = "roberta_large_token_classifier_ontonotes"

text = ["""My name is John!"""]

run_pipeline(model, text)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_large_token_classifier_ontonotes download started this may take some time.
Approximate size to download 1.2 GB
[OK!]
+-----+---------+
|chunk|ner_label|
+-----+---------+
|John |PERSON   |
+-----+---------+



### `roberta_token_classifier_pos_tagger` model

In [None]:
# Run the part-of-speech model. It is requested with language code "id",
# so `lang` must be passed explicitly instead of relying on the "en" default.
# run_pipeline still reports through its NER chunk table, so each token's tag
# appears in the `ner_label` column.
model = "roberta_token_classifier_pos_tagger"

text = ["""Budi sedang pergi ke pasar."""]

run_pipeline(model, text, lang="id")

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_token_classifier_pos_tagger download started this may take some time.
Approximate size to download 444.6 MB
[OK!]
+------+---------+
|chunk |ner_label|
+------+---------+
|Budi  |NNO      |
|sedang|ADK      |
|pergi |VBI      |
|ke    |PPO      |
|pasar |NNO      |
|.     |SYM      |
+------+---------+



### `roberta_token_classifier_icelandic_ner` model

In [5]:
# Run the Icelandic NER model; it is requested with language code "is".
model = "roberta_token_classifier_icelandic_ner"

text = ["""LeBron James verður annar stigahæsti leikmaður NBA-deildarinnar í körfubolta frá upphafi þegar hann fer upp fyrir Karl Malone."""]

run_pipeline(model, text, lang="is")

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_token_classifier_icelandic_ner download started this may take some time.
Approximate size to download 435.8 MB
[OK!]
+----------------+------------+
|chunk           |ner_label   |
+----------------+------------+
|LeBron James    |Person      |
|NBA-deildarinnar|Organization|
|Karl Malone     |Person      |
+----------------+------------+



### `roberta_token_classifier_bne_capitel_ner` model

In [None]:
# Run the BNE CAPITEL NER model; it is requested with language code "es".
model = "roberta_token_classifier_bne_capitel_ner"

text = ["""Me llamo Antonio y trabajo en la fábrica de Mercedes-Benz en Madrid."""]

run_pipeline(model, text, lang="es")

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
roberta_token_classifier_bne_capitel_ner download started this may take some time.
Approximate size to download 437.6 MB
[OK!]
+------------------------+---------+
|chunk                   |ner_label|
+------------------------+---------+
|Antonio                 |PER      |
|fábrica de Mercedes-Benz|ORG      |
|Madrid                  |LOC      |
+------------------------+---------+

