

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_HINDI_ENGLISH.ipynb)






# **Detect Entities in Hindi and English Language Texts**

## 1. Colab Setup

In [None]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.1

## 2. Start Spark Session

In [None]:
import sparknlp
# Start Spark with Spark NLP. The returned session is kept in `spark`;
# it is used further down to build the input DataFrame.
spark = sparknlp.start()

In [None]:
import json
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 3. Define Spark NLP pipeline

In [None]:
# Stage 1: wrap the raw 'text' column into a 'document' annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2: split each document into 'sentence' annotations.
sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

# Stage 3: split each sentence into 'token' annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Stage 4: pretrained Hindi/English BERT token classifier; it reads the
# sentences and tokens and writes one tag per token into 'ner'.
# The pretrained model is fetched on first use.
tokenClassifier_loaded = (
    BertForTokenClassification.pretrained("bert_token_classifier_hi_en_ner", "hi")
    .setInputCols(["sentence", 'token'])
    .setOutputCol("ner")
)

# Stage 5: turn the token-level tags into entity chunks in 'ner_chunk'.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in the order in which their inputs become available.
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    tokenClassifier_loaded,
    ner_converter,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)

# Sample input: a single Hindi sentence with English glosses in parentheses.
text_list = [
    """वॉरेन एडवर्ड बफेट (Warren Buffet) (अगस्त 30 (August 30), 1930 को ओमाहा (Omaha), नेब्रास्का (Nebraska) में पैदा हुए) एक अमेरिकी निवेशक (investor), व्यवसायी और परोपकारी (philanthropist) व्यक्तित्व हैं।"""
]

# One row per text, in a single string column named 'text'.
df = spark.createDataFrame(text_list, StringType()).toDF("text")

# Fit the pipeline on the data, then run the fitted model over the same rows.
pipeline_model = nlp_pipeline.fit(df)
result = pipeline_model.transform(df)

bert_token_classifier_hi_en_ner download started this may take some time.
Approximate size to download 634.9 MB
[OK!]


## 4. Visualize results

In [None]:
# Pair each chunk's text with its metadata map, then explode so that every
# detected entity ends up on its own row under the struct column 'cols'.
zipped_chunks = F.arrays_zip(result.ner_chunk.result,
                             result.ner_chunk.metadata)
exploded_chunks = result.select(F.explode(zipped_chunks).alias("cols"))

# Field '0' of the struct is the chunk text; field '1' is the metadata map,
# from which the 'entity' key is taken as the label.
chunk_text = F.expr("cols['0']").alias("chunk")
chunk_label = F.expr("cols['1']['entity']").alias("ner_label")

exploded_chunks.select(chunk_text, chunk_label).show(truncate=False)


+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|वॉरेन एडवर्ड बफेट|PERSON   |
|Warren Buffet    |PERSON   |
|ओमाहा            |PLACE    |
|Omaha            |PLACE    |
|नेब्रास्का       |PLACE    |
|Nebraska         |PLACE    |
+-----------------+---------+

