In [None]:
!sudo apt-get install openjdk-8-jdk
!java -version
!pip install --ignore-installed -q pyspark==2.4.4
!pip install spark-nlp
!pip install streamlit
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -qq ngrok-stable-linux-amd64.zip

In [1]:
import pandas as pd
import numpy as np
import os
# Set JAVA_HOME before the Spark session is started. The path is presumably where the
# setup cell's `openjdk-8-jdk` package lands on Debian/Ubuntu — confirm on other hosts.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# The wildcard imports supply the annotator/assembler classes used when the pipeline is built.
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [2]:
# Start the Spark session through Spark NLP's helper; `spark` is used below to create the DataFrames.
spark = sparknlp.start()

In [None]:
# Demo of the following annotators:
#   SentenceDetector
#   Tokenizer
#   Normalizer
#   Stemmer
#   Lemmatizer
#   StopWordsCleaner (stop-word removal)


In [3]:
# Example input for the demo: a single short news paragraph.
# Each list element becomes one row of the `text` column fed to the pipeline.
text_list = [
    "The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S. Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said. Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought. GM also makes a few thousand in North American plants for European export."
]

In [None]:
#getting lemma files
!wget https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt

In [6]:
# Build one Spark NLP pipeline that runs every demonstrated annotator.
# Each stage after the tokenizer reads the raw `token` column, so the
# normalized / stop-word / stem / lemma outputs are independent of one another.

# Turns the raw `text` column into the `document` column the annotators consume.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# NOTE(review): the tokenizer reads `document`, not `sentences`; within this
# pipeline no later stage consumes the sentence split.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Raw string: `\w`, `\d` and `\s` are regex escapes, not Python string escapes.
# The pattern value is unchanged; this only avoids the invalid-escape warning.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r"[^\w\d\s]"])
)

stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("removed_stopwords")
    .setCaseSensitive(False)
)

stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

# The dictionary file is the one fetched by the wget cell; it must be present
# in the working directory before this cell runs.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary(
        "./AntBNC_lemmas_ver_001.txt",
        value_delimiter="\t",
        key_delimiter="->",
    )
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        stemmer,
        lemmatizer,
    ]
)

# Fit on a one-row frame holding an empty string; presumably no stage needs to
# learn from the input text itself, so this only materialises the pipeline model.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# One row per entry of `text_list`, in a single `text` column.
df = spark.createDataFrame(pd.DataFrame({"text": text_list}))
result = pipelineModel.transform(df)


In [7]:
# One row per detected sentence. Explode the array directly: zipping a single
# array only wraps each element in a one-field struct that must then be unwrapped.
(
    result
    .select(F.explode("sentences.result").alias("sentences"))
    .show(truncate=False)
)


+----------------------------------------------------------------------------------------------------------------+
|sentences                                                                                                       |
+----------------------------------------------------------------------------------------------------------------+
|The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S.                           |
|Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said.|
|Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought.     |
|GM also makes a few thousand in North American plants for European export.                                      |
+----------------------------------------------------------------------------------------------------------------+



In [8]:
# One row per token. Explode the array directly: zipping a single array only
# wraps each element in a one-field struct that must then be unwrapped.
(
    result
    .select(F.explode("token.result").alias("tokens"))
    .show(truncate=False)
)

+--------+
|tokens  |
+--------+
|The     |
|Geneva  |
|Motor   |
|Show    |
|,       |
|the     |
|first   |
|major   |
|car     |
|show    |
|of      |
|the     |
|year    |
|,       |
|opens   |
|tomorrow|
|with    |
|U.S     |
|.       |
|Car     |
+--------+
only showing top 20 rows



In [9]:
# Normalized tokens: lowercased, with characters matching the cleanup pattern
# (punctuation) removed. Exploded directly; a single-array arrays_zip adds nothing.
(
    result
    .select(F.explode("normalized.result").alias("normalized_tokens"))
    .show(truncate=False)
)

+-----------------+
|normalized_tokens|
+-----------------+
|the              |
|geneva           |
|motor            |
|show             |
|the              |
|first            |
|major            |
|car              |
|show             |
|of               |
|the              |
|year             |
|opens            |
|tomorrow         |
|with             |
|us               |
|car              |
|makers           |
|hoping           |
|to               |
+-----------------+
only showing top 20 rows



In [10]:
#stemmed tokens
# One row per stemmed token. Exploded directly; a single-array arrays_zip adds nothing.
(
    result
    .select(F.explode("stem.result").alias("token_stems"))
    .show(truncate=False)
)

+-----------+
|token_stems|
+-----------+
|the        |
|geneva     |
|motor      |
|show       |
|,          |
|the        |
|first      |
|major      |
|car        |
|show       |
|of         |
|the        |
|year       |
|,          |
|open       |
|tomorrow   |
|with       |
|u.         |
|.          |
|car        |
+-----------+
only showing top 20 rows



In [11]:
# Tokens that remain after stop-word removal, one per row.
# Exploded directly; a single-array arrays_zip adds nothing.
(
    result
    .select(F.explode("removed_stopwords.result").alias("removed_stopwords"))
    .show(truncate=False)
)

+-----------------+
|removed_stopwords|
+-----------------+
|Geneva           |
|Motor            |
|Show             |
|,                |
|first            |
|major            |
|car              |
|show             |
|year             |
|,                |
|opens            |
|tomorrow         |
|U.S              |
|.                |
|Car              |
|makers           |
|hoping           |
|make             |
|new              |
|inroads          |
+-----------------+
only showing top 20 rows



In [12]:
# Lemmatizer output, one lemma per row.
# Exploded directly; a single-array arrays_zip adds nothing.
(
    result
    .select(F.explode("lemma.result").alias("lemma"))
    .show(truncate=False)
)

+--------+
|lemma   |
+--------+
|The     |
|Geneva  |
|Motor   |
|Show    |
|,       |
|the     |
|first   |
|major   |
|car     |
|show    |
|of      |
|the     |
|year    |
|,       |
|open    |
|tomorrow|
|with    |
|U.S     |
|.       |
|Car     |
+--------+
only showing top 20 rows

