

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb)


# **Extract part-of-speech tags and perform dependency parsing on a text**

## 1. Colab Setup

In [1]:
# Install java
!apt-get update -qq
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!java -version

# Install pyspark
!pip install --ignore-installed -q pyspark==2.4.4

# Install Sparknlp
!pip install --ignore-installed spark-nlp

openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
[K     |████████████████████████████████| 215.7MB 61kB/s 
[K     |████████████████████████████████| 204kB 40.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting spark-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/b5/a2/5c2e18a65784442ded6f6c58af175ca4d99649337de569fac55b04d7ed8e/spark_nlp-2.5.5-py2.py3-none-any.whl (124kB)
[K     |████████████████████████████████| 133kB 2.6MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-2.5.5


In [2]:
# pandas is used later to build the single-row input frame for the pipeline.
import pandas as pd
# NOTE(review): numpy is not referenced in the cells visible here.
import numpy as np
import os
# Point the JVM lookup at the Java 8 JDK installed by the setup cell and put
# its bin directory at the front of PATH.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# NOTE(review): json and the two sklearn imports are not referenced in the
# cells visible here.
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
# Spark ML pipeline container and SQL helpers (F is used in the results cell).
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Wildcard imports supply the annotator and base classes used in the pipeline
# cell (DocumentAssembler, Tokenizer, PerceptronModel, ...). Their relative
# order is kept as-is because later star imports can shadow earlier names.
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
# NOTE(review): PretrainedPipeline is not referenced in the cells visible here.
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [3]:
# Obtain the Spark session through Spark NLP's helper; `spark` is used below
# to create the DataFrames that are fed to the pipeline.
spark = sparknlp.start()

## 3. Select the DL model

In [4]:

# Name of the pretrained typed-dependency parser model demonstrated in this notebook.
MODEL_NAME='dependency_typed_conllu'

## 4. Sample examples

In [5]:
# Example sentences for the demo; one of them is picked by position
# (see `index`) and run through the pipeline.
text_list = [
    "John Snow is a good man. He knows a lot about science.",
    "In what country is the WTO headquartered?",
    "I was wearing my dark blue shirt and tie.",
    "The Geneva Motor Show is the most popular car show of the year.",
    "Bill Gates and Steve Jobs had periods of civility.",
]


## 5. Define Spark NLP pipeline

In [7]:
# Wrap the raw "text" column into a "document" annotation column.
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into tokens.
tokenizer = Tokenizer()\
    .setInputCols(["document"])\
    .setOutputCol("token")

# Part-of-speech tagger: pretrained English "pos_anc" model.
pos = PerceptronModel.pretrained("pos_anc", 'en')\
    .setInputCols("document", "token")\
    .setOutputCol("pos")

# Untyped dependency parser; consumes documents, POS tags and tokens.
dep_parser = DependencyParserModel.pretrained('dependency_conllu')\
    .setInputCols(["document", "pos", "token"])\
    .setOutputCol("dependency")

# Typed dependency parser. The model name comes from MODEL_NAME (set in the
# model-selection cell) so that changing the selection there actually changes
# the model loaded here, instead of a duplicated hard-coded string.
typed_dep_parser = TypedDependencyParserModel.pretrained(MODEL_NAME)\
    .setInputCols(["token", "pos", "dependency"])\
    .setOutputCol("dependency_type")

# Stage order matters: each stage reads columns produced by earlier ones.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        pos,
        dep_parser,
        typed_dep_parser,
    ])



pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.6 MB
[OK!]
dependency_typed_conllu download started this may take some time.
Approximate size to download 257.4 KB
[OK!]


## 6. Select the example to test

In [8]:
# Position in text_list of the example to run through the pipeline (0 = first entry).
index=0

## 7. Run the pipeline on selected example

In [9]:
# Fit the pipeline on a placeholder frame holding a single empty string
# (presumably sufficient because the model stages are loaded pretrained).
placeholder_rows = [['']]
empty_df = spark.createDataFrame(placeholder_rows).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Build a one-row frame from the selected example and annotate it.
sample_pdf = pd.DataFrame({"text": [text_list[index]]})
df = spark.createDataFrame(sample_pdf)
result = pipelineModel.transform(df)


## 8. Visualize results

In [10]:

# Build one row per token: zip the parallel annotation arrays element-wise,
# explode the zipped array into rows, then give each positional field a
# readable column name.
# NOTE(review): the positional field names '0'..'6' are assumed to be what
# arrays_zip produces under the pyspark version pinned in the setup cell -
# confirm before upgrading Spark.
# "dependency_start" carries the whole dependency metadata map (head index and
# head begin/end offsets), not just a start offset.
# The trailing .toPandas() is what actually runs the job and renders a table;
# without it the cell only yields a lazy Spark DataFrame.
result.select(
    F.explode(
        F.arrays_zip(
            'token.result',
            'token.begin',
            'token.end',
            'pos.result',
            'dependency.result',
            'dependency.metadata',
            'dependency_type.result',
        )
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("chunk"),
    F.expr("cols['1']").alias("begin"),
    F.expr("cols['2']").alias("end"),
    F.expr("cols['3']").alias("pos"),
    F.expr("cols['4']").alias("dependency"),
    F.expr("cols['5']").alias("dependency_start"),
    F.expr("cols['6']").alias("dependency_type"),
).toPandas()


Unnamed: 0,chunk,begin,end,pos,dependency,dependency_start,dependency_type
0,John,0,3,NNP,knows,"{'head': '9', 'head.end': '32', 'head.begin': ...",nsubj
1,Snow,5,8,NNP,man,"{'head': '6', 'head.end': '22', 'head.begin': ...",flat
2,is,10,11,VBZ,man,"{'head': '6', 'head.end': '22', 'head.begin': ...",nsubj
3,a,13,13,DT,man,"{'head': '6', 'head.end': '22', 'head.begin': ...",nsubj
4,good,15,18,JJ,man,"{'head': '6', 'head.end': '22', 'head.begin': ...",amod
5,man,20,22,NN,John,"{'head': '1', 'head.end': '3', 'head.begin': '0'}",flat
6,.,23,23,.,knows,"{'head': '9', 'head.end': '32', 'head.begin': ...",punct
7,He,25,26,PRP,knows,"{'head': '9', 'head.end': '32', 'head.begin': ...",nsubj
8,knows,28,32,VBZ,ROOT,"{'head': '0', 'head.end': '-1', 'head.begin': ...",root
9,a,34,34,DT,lot,"{'head': '11', 'head.end': '38', 'head.begin':...",nsubj
