

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb)


# **Extract part-of-speech tags and perform dependency parsing on text**

## 0. Colab Setup

In [None]:
!sudo apt-get install openjdk-8-jdk
!java -version
!pip install --ignore-installed -q pyspark==2.4.4
!pip install spark-nlp

In [None]:
# All notebook imports live in this cell.
import pandas as pd
import numpy as np
import os
# Point JAVA_HOME at the Java 8 JDK path so it is set before the Spark session is started.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Star imports supply the annotator classes used in the pipeline cell
# (DocumentAssembler, Tokenizer, PerceptronModel, the dependency parsers).
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
#import svgwrite

## 1. Start Spark Session

In [None]:
# Start the session through Spark NLP's helper; `spark` is used by the later
# cells to build the input DataFrames.
spark = sparknlp.start()

## 2. Select the DL model

In [None]:

# Name of the pretrained typed dependency parser model selected for this demo.
MODEL_NAME = "dependency_typed_conllu"

## 3. Sample examples

In [None]:
# Example sentences; one of them is picked by `index` further down.
text_list = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility.",
]


## 4. Define Spark NLP pipeline

In [None]:
# Turn the raw "text" column into "document" annotations.
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into tokens.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Pretrained part-of-speech tagger; its output feeds both dependency parsers.
pos = PerceptronModel.pretrained("pos_anc", 'en')\
        .setInputCols("document", "token")\
        .setOutputCol("pos")

# Pretrained (untyped) dependency parser.
dep_parser = DependencyParserModel.pretrained('dependency_conllu')\
        .setInputCols(["document", "pos", "token"])\
        .setOutputCol("dependency")

# Pretrained typed dependency parser, loaded by the name chosen in the
# "Select the DL model" cell so that selection actually takes effect.
typed_dep_parser = TypedDependencyParserModel.pretrained(MODEL_NAME)\
        .setInputCols(["token", "pos", "dependency"])\
        .setOutputCol("dependency_type")

# Stages run in order; each annotator's input columns are produced by an
# earlier stage. The former `chunker` entry was removed: no chunker annotator
# is defined anywhere in this notebook, so listing it broke this cell.
nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          pos,
          dep_parser,
          typed_dep_parser
      ])



## 5. Select the example to test

In [None]:
# Position in `text_list` of the sentence to run through the pipeline.
index = 0

## 6. Run the pipeline on selected example

In [None]:
# Fit the pipeline on a one-row frame holding an empty string; this yields
# the PipelineModel used for the transform below.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Build a single-row Spark DataFrame from the selected example and annotate it.
selected_text = text_list[index]
selected_pdf = pd.DataFrame({"text": [selected_text]})
df = spark.createDataFrame(selected_pdf)
result = pipelineModel.transform(df)


## 7. Visualize results

In [None]:

# Annotation fields to zip together element-wise, one entry per output column.
zipped_fields = [
    'token.result',
    'token.begin',
    'token.end',
    'pos.result',
    'dependency.result',
    'dependency.metadata',
    'dependency_type.result',
]

# Output column names, in the same order as `zipped_fields`.
output_names = [
    "chunk",
    "begin",
    "end",
    "pos",
    "dependency",
    "dependency_start",
    "dependency_type",
]

# Explode the zipped arrays so that every element becomes its own row.
exploded_rows = result.select(
    F.explode(F.arrays_zip(*zipped_fields)).alias("cols")
)

# Pick each zipped field by its position and give it a readable name,
# then collect to pandas; as the cell's last expression it is displayed.
exploded_rows.select(
    *[
        F.expr("cols['{}']".format(position)).alias(column_name)
        for position, column_name in enumerate(output_names)
    ]
).toPandas()
