

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Graph_RE.ipynb)


In [None]:
import json
import os

from google.colab import files

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

In [2]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [3]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

spark

Spark NLP Version : 3.3.4
Spark NLP_JSL Version : 3.3.4


# Graph Extraction (Workshop)

In [4]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentence = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

ner_tagger = NerDLModel.pretrained() \
    .setInputCols(["document", "token", "embeddings"]) \
    .setOutputCol("ner")   

ner_chunker = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")

posTagger = PerceptronModel.pretrained() \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("pos")

dependencyParser = DependencyParserModel.pretrained() \
    .setInputCols(["sentence", "pos", "token"]) \
    .setOutputCol("dependency")

typedDependencyParser = TypedDependencyParserModel.pretrained() \
    .setInputCols(["dependency", "pos", "token"]) \
    .setOutputCol("dependency_type")

graph_extraction = GraphExtraction()\
    .setInputCols(["sentence", "token", "ner"]) \
    .setOutputCol("graph") \
    .setMergeEntities(True)\
    .setExplodeEntities(True)\
#    .setEntityTypes(["DRUG-DOSAGE"])\
#    .setRelationshipTypes(["prefer-LOC"])

all_pipeline = Pipeline().setStages([
    documentAssembler,
    sentence,
    tokenizer,
    embeddings,
    ner_tagger,
    ner_chunker,
    posTagger,
    dependencyParser,
    typedDependencyParser,
    graph_extraction
])

empty_data = spark.createDataFrame([[""]]).toDF("text")

all_model = all_pipeline.fit(empty_data)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]
dependency_typed_conllu download started this may take some time.
Approximate size to download 2.3 MB
[OK!]


In [5]:
def get_graph_result(result):
    result_pd = result.select('id', 'graph').toPandas()
    result_pd = result_pd.explode('graph')

    result_pd.graph = result_pd.graph.apply(lambda x : {"graph" : x[3], 
                                                        "entities": x[4]['entities'],
                                                        "right_path" : x[4]['right_path'].split(","),
                                                        "left_path" : x[4]['left_path'].split(",")} 
                                                       )

    df_graph = pd.DataFrame()
    df_graph['graph'] = result_pd['graph'].apply(lambda x:x['graph'])
    df_graph['relation'] = result_pd['graph'].apply(lambda x:x['entities'])
    df_graph['entity1'] = result_pd['graph'].apply(lambda x:x['entities'].split(",")[-1])
    df_graph['chunk1'] = result_pd['graph'].apply(lambda x:x['left_path'][-1])
    df_graph['chunk2'] = result_pd['graph'].apply(lambda x:x['right_path'][-1])
    df_graph['right_path'] = result_pd['graph'].apply(lambda x:x['right_path'])
    df_graph['left_path'] = result_pd['graph'].apply(lambda x:x['left_path'])
    df_graph['length_left_path'] = result_pd['graph'].apply(lambda x:len(x['left_path']))
    df_graph['entity2'] = result_pd['graph'].apply(lambda x:x['entities'].split(",")[0])

    
    #display(result_pd.head())
    #display(df_graph.head())

    return df_graph.reset_index()

In [6]:
import pandas as pd

text = ["John Snow was born in England. John Snow lives in New York.",
        "Peter lives in New York. Peter works at Microsoft",
        "Mary was born in Paris. Mary works in Google",
        "Google is based on US",
        "Mike works in Washington.",
        "Washington is the capital of US",
        "England is located in Europe",
        "Paris is the capital of France",
        "US is located in North America"
       ]

ids =  list(range(1, len(text)+1))

df = spark.createDataFrame(pd.DataFrame({"id": ids, "text" : text }))

In [7]:
results = all_model.transform(df)
# results.select('graph').show(truncate=False, vertical = True)

In [8]:
# results.show()

In [10]:
results_pd = get_graph_result(results)
results_pd

Unnamed: 0,index,graph,relation,entity1,chunk1,chunk2,right_path,left_path,length_left_path,entity2
0,0,born,"PER,LOC",LOC,John Snow,England,"[born, flat, England]","[born, flat, John Snow]",3,PER
1,0,lives,"PER,LOC",LOC,John Snow,New York,"[lives, flat, New York]","[lives, flat, John Snow]",3,PER
2,1,lives,"PER,LOC",LOC,Peter,New York,"[lives, flat, New York]","[lives, flat, Peter]",3,PER
3,1,works,"PER,ORG",ORG,Peter,Microsoft,"[works, nsubj, Microsoft]","[works, nsubj, Peter]",3,PER
4,2,born,"PER,LOC",LOC,Mary,Paris,"[born, flat, Paris]","[born, flat, Mary]",3,PER
5,2,works,"PER,ORG",ORG,Mary,Google,"[works, nsubj, Google]","[works, nsubj, Mary]",3,PER
6,3,based,"ORG,LOC",LOC,Google,US,"[based, flat, US]","[based, flat, Google]",3,ORG
7,4,works,"PER,LOC",LOC,Mike,Washington,"[works, nsubj, Washington]","[works, nsubj, Mike]",3,PER
8,5,capital,"LOC,LOC",LOC,Washington,US,"[capital, flat, US]","[capital, flat, Washington]",3,LOC
9,6,located,"LOC,LOC",LOC,England,Europe,"[located, flat, Europe]","[located, flat, England]",3,LOC


In [11]:
x = results_pd.right_path[0]

def ac(x):
    liste = []
    for a in range(0, len(x)-2, 2):
        liste.append((x[a], x[a+1], x[a+2]))
    return liste

ac(x)

[('born', 'flat', 'England')]

In [12]:
rel_df = pd.DataFrame()
rel_df["n1"] = results_pd.chunk1
rel_df["relation"] = results_pd.right_path.apply(ac).explode().apply(lambda x : x[0])
rel_df["relation_type"] = results_pd.right_path.apply(ac).explode().apply(lambda x : x[1])
rel_df["n2"] = results_pd.right_path.apply(ac).explode().apply(lambda x : x[2])

In [13]:
rel_df

Unnamed: 0,n1,relation,relation_type,n2
0,John Snow,born,flat,England
1,John Snow,lives,flat,New York
2,Peter,lives,flat,New York
3,Peter,works,nsubj,Microsoft
4,Mary,born,flat,Paris
5,Mary,works,nsubj,Google
6,Google,based,flat,US
7,Mike,works,nsubj,Washington
8,Washington,capital,flat,US
9,England,located,flat,Europe


In [14]:
rel_df.to_csv('relation_df.csv', index=False)

In [17]:
import pandas as pd

rel_df = pd.read_csv('relation_df.csv')

In [18]:
rel_df

Unnamed: 0,n1,relation,relation_type,n2
0,John Snow,born,flat,England
1,John Snow,lives,flat,New York
2,Peter,lives,flat,New York
3,Peter,works,nsubj,Microsoft
4,Mary,born,flat,Paris
5,Mary,works,nsubj,Google
6,Google,based,flat,US
7,Mike,works,nsubj,Washington
8,Washington,capital,flat,US
9,England,located,flat,Europe
