![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-tokenizer/regex_tokenizer_examples.ipynb)

# Tokenization using RegexTokenizer

In [None]:
# Only run this cell when you are using Spark NLP on Google Colab
# It downloads the colab.sh setup script, writes it to stdout (`-O -`),
# and pipes it directly into bash for execution.
# NOTE(review): the script is fetched over plain http and executed without
# inspection — consider switching to https once confirmed the host serves it.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

## Start Spark NLP session

In [None]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp

# Start Spark NLP; `spark` is the session object the later cells use
# (e.g. for `spark.createDataFrame`).
spark = sparknlp.start()

In [None]:
# Echo the object returned by `sparknlp.start()` so the notebook displays it.
spark

## Regex Tokenizer annotator

In [None]:
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType
from sparknlp.base import *
from sparknlp.annotator import *

# Sample input containing numbering, a bracketed date, a price and punctuation.
content = "1. T1-T2 DATE**[12/24/13] $1.99 () (10/12), ph+ 90%"

# Split on runs of whitespace, and at the zero-width positions immediately
# before or after any of the characters  - . : ; * + , $ & % [ ]
pattern = "\\s+|(?=[-.:;*+,$&%\\[\\]])|(?<=[-.:;*+,$&%\\[\\]])"

# Build a one-row DataFrame of strings and rename its "value" column to "text".
df = (
    spark.createDataFrame([content], StringType())
    .withColumnRenamed("value", "text")
)
df.show()

# Stage 1: "text" column -> "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: "document" -> "sentence".
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: "sentence" -> "regexToken", tokenizing with the pattern above.
regexTokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("regexToken")
    .setPattern(pattern)
    .setPositionalMask(False)
)

# Chain the three stages in order.
docPatternRemoverPipeline = Pipeline(
    stages=[documentAssembler, sentenceDetector, regexTokenizer]
)

# Fit and apply the pipeline on the same DataFrame, then print up to
# 10 rows without truncating cell contents.
ds = docPatternRemoverPipeline.fit(df).transform(df)
ds.show(n=10, truncate=False)

+--------------------+
|                text|
+--------------------+
|1. T1-T2 DATE**[1...|
+--------------------+

+---------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------