In [1]:
import json
import os
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pandas as pd

import sparknlp
from sparknlp.base import *
from sparknlp.annotator  import *

#spark = sparknlp.start(spark32=True) #If we are using spark = 3.2.0 and above
spark = sparknlp.start()

#print("Spark NLP version: ", sparknlp.version())
#print("Apache Spark version: ", spark.version)

spark

#userStory = str(input())
# userStory = 'I would like to visualize profit achieved last year and also the loss incurred, What is the targeted revenue and profit for the next year.'
userStory = 'Show me the revenue across different region, different verticals and across different deparment.'

sample_data = spark.createDataFrame([[userStory]]).toDF("text")

def get_ann_model():
    document = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

    sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')

    token = Tokenizer()\
        .setInputCols(['sentence'])\
        .setOutputCol('token')

    glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
        .setInputCols(["document", "token"])\
        .setOutputCol("embeddings")
  
# load trained model
    loaded_ner_model = NerDLModel.load("Vwb_NER_glove_e5_b32")\
        .setInputCols(["sentence", "token", "embeddings"])\
        .setOutputCol("ner")

    converter = NerConverter()\
        .setInputCols(["document", "token", "ner"])\
        .setOutputCol("ner_span")

    ner_prediction_pipeline = Pipeline(stages = [
        document,
        sentence,
        token,
        glove_embeddings,
        loaded_ner_model,
        converter
    ])

    empty_data = spark.createDataFrame([['']]).toDF("text")

    prediction_model = ner_prediction_pipeline.fit(empty_data)\

    preds = prediction_model.transform(sample_data)
     
    preds.select(F.explode(F.arrays_zip(preds.ner_span.result,preds.ner_span.metadata)).alias("entities")) \
      .select(F.expr("entities['0']").alias("chunk"),
              F.expr("entities['1'].entity").alias("entity")).show(truncate=False)
    
    pdf = preds.select(F.explode(F.arrays_zip(preds.ner_span.result,preds.ner_span.metadata)).alias("entities")) \
      .select(F.expr("entities['0']").alias("chunk"),
              F.expr("entities['1'].entity").alias("entity")).toPandas()
    
    #pdf = pd.DataFrame([{"chunk":"profit","entity":"Goal"},{"chunk":"last year","entity":"Tmln"},{"chunk":"targeted revenue","entity":"Goal"},{"chunk":"profit","entity":"Goal"},{"chunk":"next year","entity":"Tmln"}])
    parameter = [{'Categorical Dimension' : [],
             'Goal Measure' : [],
             'Measure' : [],
             'Timeline Dimension' : []}]
    for i in range(len(pdf)):
        if pdf.loc[i, "entity"] == 'cat':
            parameter[0]['Categorical Dimension'].append(pdf.loc[i, "chunk"])
        if pdf.loc[i, "entity"] == 'Goal':
            parameter[0]['Goal Measure'].append(pdf.loc[i, "chunk"])
        if pdf.loc[i, "entity"] == 'Msr':
            parameter[0]['Measure'].append(pdf.loc[i, "chunk"])
        if pdf.loc[i, "entity"] == 'Tmln':
            parameter[0]['Timeline Dimension'].append(pdf.loc[i, "chunk"])
    print('\n')
    print('parameter[0]',parameter[0])
    
    print('\n pdf->')
    print(pdf)
    
    jason = pdf.to_json(orient = 'records')
    
    print('\n')
    print(jason)
    
    print('\n')
    print ("Spark NLP NER Entities are created")
    
    return parameter[0]


model = get_ann_model()

#get_ann_model()

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
+---------+------+
|chunk    |entity|
+---------+------+
|revenue  |Msr   |
|region   |Cat   |
|verticals|Cat   |
|deparment|Cat   |
+---------+------+



parameter[0] {'Categorical Dimension': [], 'Goal Measure': [], 'Measure': ['revenue'], 'Timeline Dimension': []}

 pdf->
       chunk entity
0    revenue    Msr
1     region    Cat
2  verticals    Cat
3  deparment    Cat


[{"chunk":"revenue","entity":"Msr"},{"chunk":"region","entity":"Cat"},{"chunk":"verticals","entity":"Cat"},{"chunk":"deparment","entity":"Cat"}]


Spark NLP NER Entities are created


In [7]:
print(model)
print('\n')

{'Categorical Dimension': [], 'Goal Measure': ['revenue'], 'Measure': [], 'Timeline Dimension': []}


