

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb)


# **Extract part-of-speech tags and perform dependency parsing on text**

## 1. Colab Setup

In [1]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash
# !bash colab.sh
# -p is for pyspark
# -s is for spark-nlp
# !bash colab.sh -p 3.1.1 -s 3.0.1
# by default they are set to the latest

openjdk version "11.0.10" 2021-01-19
OpenJDK Runtime Environment (build 11.0.10+9-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.10+9-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
setup Colab for PySpark 3.1.1 and Spark NLP 3.0.0
[K     |████████████████████████████████| 212.3MB 74kB/s 
[K     |████████████████████████████████| 143kB 56.4MB/s 
[K     |████████████████████████████████| 204kB 54.8MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [3]:
# Start the Spark session through Spark NLP's helper; later cells use `spark`
# to create the input DataFrames.
spark = sparknlp.start()

## 3. Select the DL model

In [4]:

# Name of the pretrained typed dependency parser model selected for this demo.
MODEL_NAME = "dependency_typed_conllu"

## 4. Sample examples

In [5]:
# Example sentences; one of them is picked by `index` further down.
text_list = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility.",
]


## 5. Define Spark NLP pipeline

In [6]:
# Wrap the raw "text" column into Spark NLP "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Part-of-speech tagger (pretrained "pos_anc" English model).
pos = (
    PerceptronModel.pretrained("pos_anc", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
)

# Untyped dependency parser: emits the head token for every token.
dep_parser = (
    DependencyParserModel.pretrained("dependency_conllu", "en")
    .setInputCols(["document", "pos", "token"])
    .setOutputCol("dependency")
)

# Typed dependency parser: labels each dependency relation.
# Uses MODEL_NAME so that the model chosen in the "Select the DL model" cell
# is the one actually loaded, instead of a hard-coded duplicate of the name.
typed_dep_parser = (
    TypedDependencyParserModel.pretrained(MODEL_NAME, "en")
    .setInputCols(["token", "pos", "dependency"])
    .setOutputCol("dependency_type")
)

# Stage order matters: each annotator consumes columns produced by earlier ones.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        pos,
        dep_parser,
        typed_dep_parser,
    ]
)



pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]
dependency_typed_conllu download started this may take some time.
Approximate size to download 2.3 MB
[OK!]


## 6. Select the example to test

In [7]:
# Position in `text_list` of the sentence to analyse.
index = 0

## 7. Run the pipeline on selected example

In [8]:
# Fit on a placeholder frame holding a single empty "text" row to obtain a
# PipelineModel; the models in the pipeline are loaded pretrained.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Build the one-row input frame directly from the Python string. Going through
# a pandas DataFrame is unnecessary here and makes the cell depend on the
# pandas <-> PySpark conversion path, which is fragile across version combos.
df = spark.createDataFrame([[text_list[index]]]).toDF("text")
result = pipelineModel.transform(df)


## 8. Visualize results

In [12]:

# (annotation field, output column name) pairs, in display order.
field_aliases = [
    ("token.result", "chunk"),
    ("token.begin", "begin"),
    ("token.end", "end"),
    ("pos.result", "pos"),
    ("dependency.result", "dependency"),
    ("dependency.metadata", "dependency_start"),
    ("dependency_type.result", "dependency_type"),
]

# Zip the parallel annotation arrays and explode them into one row per element.
zipped = F.arrays_zip(*(field for field, _ in field_aliases))
per_token = result.select(F.explode(zipped).alias("cols"))

# The zipped struct is addressed by position ('0', '1', ...); give each
# position its readable column name and print the table untruncated.
per_token.select(
    *(
        F.expr(f"cols['{position}']").alias(alias)
        for position, (_, alias) in enumerate(field_aliases)
    )
).show(truncate=False)


+-------+-----+---+---+----------+----------------------------------------------+---------------+
|chunk  |begin|end|pos|dependency|dependency_start                              |dependency_type|
+-------+-----+---+---+----------+----------------------------------------------+---------------+
|John   |0    |3  |NNP|knows     |{head -> 9, head.begin -> 28, head.end -> 32} |nsubj          |
|Snow   |5    |8  |NNP|man       |{head -> 6, head.begin -> 20, head.end -> 22} |flat           |
|is     |10   |11 |VBZ|man       |{head -> 6, head.begin -> 20, head.end -> 22} |nsubj          |
|a      |13   |13 |DT |man       |{head -> 6, head.begin -> 20, head.end -> 22} |nsubj          |
|good   |15   |18 |JJ |man       |{head -> 6, head.begin -> 20, head.end -> 22} |amod           |
|man    |20   |22 |NN |John      |{head -> 1, head.begin -> 0, head.end -> 3}   |flat           |
|.      |23   |23 |.  |knows     |{head -> 9, head.begin -> 28, head.end -> 32} |punct          |
|He     |25   |26 |P