![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Installs

In [None]:
# install PySpark
# %pip install -q pyspark==3.3.1 spark-nlp==5.1.0

In [None]:
# %pip install spacy[transformers]

# Initializing Spark

In [None]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline

# Print the installed Spark NLP library version.
print("Spark NLP version", sparknlp.version())

# Echo the session object as the cell output. `spark` is never assigned in this
# notebook -- presumably it is injected by the Databricks runtime; confirm.
spark

In [None]:
# Shell escape: print the metadata of the installed spaCy package.
!pip show spacy

In [None]:
# Read the current value of the Arrow flag for PySpark before changing it.
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

In [None]:
# Switch the Arrow flag off, then read it back to confirm the new value.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

# Get Data

In [None]:
# Shell escape: download the news-category training CSV into the current
# working directory (-q suppresses wget's progress output).
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/open-source-nlp/data/news_category_train.csv

In [None]:
# Copy the downloaded CSV from the driver's local filesystem to the DBFS root
# so Spark can read it by path. `dbutils` is not defined in this notebook --
# presumably the Databricks utilities object; confirm.
dbutils.fs.cp("file:/databricks/driver/news_category_train.csv", "dbfs:/") 

In [None]:
# Read the CSV and name its two columns "category" and "text".
# NOTE(review): no header/quote options are passed to read.csv; if the file
# begins with a header row, that row is loaded as data -- confirm against the CSV.
spark_df = spark.read.csv('/news_category_train.csv').toDF("category",'text')
# Round-trip the data through Parquet, then reload it split into 10000 partitions.
# NOTE(review): the Parquet location is a hard-coded user workspace path.
spark_df.write.mode("overwrite").parquet("/Users/halil@johnsnowlabs.com/temp/news")
spark_df= spark.read.parquet("/Users/halil@johnsnowlabs.com/temp/news").repartition(10000)
# Print the first 5 rows without truncating long column values.
spark_df.show(5,False)

In [None]:
# Number of rows in the benchmark dataset.
spark_df.count()

# Spark NLP

In [None]:
from sparknlp.base import *
from sparknlp.annotator import *

NER extraction

In [None]:
# Stage 1: wrap the raw "text" column into the "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: pretrained 'glove_100d' word embeddings over the tokens.
glove_embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d')
    .setInputCols(["document", 'token'])
    .setOutputCol("embeddings")
)

# Stage 4: pretrained "ner_dl" tagger consuming documents, tokens and embeddings.
public_ner = (
    NerDLModel.pretrained("ner_dl", 'en')
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        glove_embeddings,
        public_ner,
    ]
)

# Fit on a one-row placeholder DataFrame holding a single empty string.
# Presumably sufficient because the stages are pretrained or need no training
# data -- confirm if a trainable stage is ever added.
fitted_pipeline = pipeline.fit(
    spark.createDataFrame([[""]]).toDF("text")
)


In [None]:
%%time
# Timed cell: apply the fitted Spark NLP NER pipeline to the news DataFrame.
spark_result = fitted_pipeline.transform(spark_df)

# collect() pulls every row of the 'ner' column back to the driver; this is
# the action whose wall time the %%time magic reports.
spark_result.select('ner').collect() 

RoBERTa sentence embeddings

In [None]:
# Stage 1: wrap the raw "text" column into the "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained "sent_roberta_base" sentence embeddings over each document.
embeddings = (
    RoBertaSentenceEmbeddings.pretrained("sent_roberta_base", "en")
    .setInputCols("document")
    .setOutputCol("embeddings")
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        embeddings,
    ]
)

# Fit on a one-row placeholder DataFrame holding a single empty string.
fitted_pipeline = pipeline.fit(
    spark.createDataFrame([[""]]).toDF("text")
)

In [None]:
%%time

# Timed cell: apply the fitted sentence-embedding pipeline to the news DataFrame.
result= fitted_pipeline.transform(spark_df)
# Select only the embedding vectors out of the annotation structs and pull
# them to the driver; this collect() is the action being timed.
result.select("embeddings.embeddings").collect() 

# Define UDF Functions

In [None]:
import spacy
from pyspark.sql.functions import pandas_udf
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Row

from pyspark.sql.types import ArrayType, FloatType, StringType

In [None]:
# Shell escape: download the small English spaCy model used by the tokenizer
# and NER UDFs below.
! python -m spacy download en_core_web_sm

In [None]:
# Load the small English spaCy pipeline, excluding the listed components;
# the UDF below only reads token texts.
nlp_token = spacy.load("en_core_web_sm", exclude=["ner","tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

# Pandas UDF: one batch of texts in, one list of token strings per text out.
@pandas_udf(ArrayType(StringType()))
def tokenize_with_spacy(text_series):
    """Tokenize every text of a pandas batch with spaCy.

    Args:
        text_series: pandas Series of strings; null entries are allowed.

    Returns:
        A pandas Series of the same length whose elements are lists of token
        strings, or None where the input text was null.
    """
    missing = text_series.isna()
    # Feed only the non-null texts to spaCy, batched through pipe() instead of
    # one nlp() call per row; pipe() yields docs in input order.
    docs = iter(nlp_token.pipe(text_series[~missing]))
    tokenized_text = [
        None if is_missing else [token.text for token in next(docs)]
        for is_missing in missing
    ]
    # Explicit object dtype keeps the list elements intact, also for an empty batch.
    return pd.Series(tokenized_text, dtype=object)

In [None]:
# Load the small English spaCy pipeline, keeping "ner" and excluding the
# other listed components.
nlp_ner = spacy.load("en_core_web_sm", exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

# Pandas UDF: one batch of texts in, one list of "text:::label" strings per text out.
@pandas_udf(ArrayType(StringType()))
def ner_with_spacy(text_series):
    """Extract named entities from every text of a pandas batch with spaCy.

    Args:
        text_series: pandas Series of strings; null entries are allowed.

    Returns:
        A pandas Series of the same length whose elements are lists of
        "<entity text>:::<entity label>" strings, or None where the input
        text was null.
    """
    missing = text_series.isna()
    # Feed only the non-null texts to spaCy, batched through pipe() instead of
    # one nlp() call per row; pipe() yields docs in input order.
    docs = iter(nlp_ner.pipe(text_series[~missing]))
    entities_list = [
        None
        if is_missing
        else [f"{ent.text}:::{ent.label_}" for ent in next(docs).ents]
        for is_missing in missing
    ]
    # Explicit object dtype keeps the list elements intact, also for an empty batch.
    return pd.Series(entities_list, dtype=object)

In [None]:
# Shell escape: download the transformer-based English spaCy model used by
# the embeddings UDF below.
!python -m spacy download en_core_web_trf

In [None]:
# Load the transformer-based English spaCy pipeline.
# NOTE(review): no components are excluded here although the UDF below only
# reads `doc._.trf_data` -- confirm whether the remaining components are needed.
nlp_embeddings = spacy.load("en_core_web_trf")

# Pandas UDF: one batch of texts in, one float vector per text out.
@pandas_udf(ArrayType(FloatType()))
def embeddings_with_spacy(text_series):
    """Compute one transformer embedding vector per text of a pandas batch.

    Args:
        text_series: pandas Series of strings; null entries are allowed.

    Returns:
        A pandas Series of the same length whose elements are the first row
        of the last tensor in `doc._.trf_data.tensors`, or None where the
        input text was null.
    """
    missing = text_series.isna()
    # Feed only the non-null texts to spaCy, batched through pipe() instead of
    # one nlp() call per row; pipe() yields docs in input order.
    docs = iter(nlp_embeddings.pipe(text_series[~missing]))
    # NOTE(review): tensors[-1][0] is presumably the pooled output of the
    # first transformer span -- confirm against the installed
    # spacy-transformers version.
    embeddings_list = [
        None if is_missing else next(docs)._.trf_data.tensors[-1][0]
        for is_missing in missing
    ]
    # Explicit object dtype keeps one vector per element, also for an empty batch.
    return pd.Series(embeddings_list, dtype=object)

# Spacy with Arrow Enabled

Clear all variables, then rerun the "Initializing Spark", "Get Data", and "Define UDF Functions" sections.

In [None]:
# Switch the Arrow flag on for the benchmark cells that follow.
# NOTE(review): per the PySpark docs this flag governs Arrow use in
# toPandas() / createDataFrame(pandas_df); the timed cells below call
# collect() on a pandas_udf result -- confirm the flag actually changes them.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
# Read the flag back to confirm the value before timing.
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

NER extraction

In [None]:
%%time

# Timed cell: add a "named_entities" column computed by the spaCy NER pandas UDF.
result = spark_df.withColumn("named_entities", ner_with_spacy(spark_df["text"]))


# collect() pulls every row, with all columns, back to the driver; this is the
# action being timed. (The Spark NLP NER cell collects only its 'ner' column.)
result.collect() 

Get embeddings

In [None]:
%%time

# Timed cell: add an "embeddings" column computed by the spaCy embeddings pandas UDF.
result = spark_df.withColumn("embeddings", embeddings_with_spacy(spark_df["text"]))


# Pull only the "embeddings" column back to the driver; this collect() is the
# action being timed.
result.select("embeddings").collect() 

# Spacy with Arrow Disabled

Clear all variables, then rerun the "Initializing Spark", "Get Data", and "Define UDF Functions" sections.

In [None]:
# Read the current value of the Arrow flag before switching it off.
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

In [None]:
# Switch the Arrow flag off for the benchmark cells that follow.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

In [None]:
# Read the flag back to confirm the value before timing.
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

NER extraction

In [None]:
%%time

# Timed cell: add a "named_entities" column computed by the spaCy NER pandas UDF.
result = spark_df.withColumn("named_entities", ner_with_spacy(spark_df["text"]))


# collect() pulls every row, with all columns, back to the driver; this is the
# action being timed.
result.collect() 

Get embeddings

In [None]:
%%time

# Apply the UDF to your Spark DataFrame
result = spark_df.withColumn("embeddings", embeddings_with_spacy(spark_df["text"]))


result.collect() 