# Lemmatize PATSTAT using Spark NLP

In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pathlib import Path

## 1. Read patents and concatenate the `appln_title` and `appln_abstract` fields

In [7]:
%%time

# Read the application id plus the two free-text columns from the PATSTAT
# applications table and spread the rows over many partitions.
patents = (
    spark.sql("SELECT appln_id, appln_title, appln_abstract FROM parquet.`/export/ml4ds/IntelComp/Datalake/PATSTAT/2021_Autumn/patstat_appln.parquet`")
    .repartition(numPartitions=20000)
)
# For development purposes only: uncomment to work on a small random sample
#patents = patents.sample(fraction=0.0001)

# Build the text to lemmatize ("<title>. <abstract>") and discard the source columns
patents = patents.withColumn(
    "rawtext", F.concat_ws('. ', "appln_title", "appln_abstract")
).drop("appln_title", "appln_abstract")

n_patents_loaded = patents.count()
print('Number of patents before language filtering:', n_patents_loaded)

22/03/22 13:49:40 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
[Stage 17:>                                                         (0 + 1) / 1]

Number of patents before language filtering: 114690034
CPU times: user 17.1 ms, sys: 27.8 ms, total: 44.9 ms
Wall time: 14.8 s


                                                                                

## 2. Filter out patents whose text is not in English

In [8]:
%%time

# Language-detection pipeline: wrap the raw text in a Spark NLP document
# annotation and tag it with a pretrained language detector.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("rawtext")
    .setOutputCol("document")
)

languageDetector = (
    LanguageDetectorDL.pretrained()
    .setInputCols("document")
    .setOutputCol("language")
)

pipeline = Pipeline(stages=[documentAssembler, languageDetector])

# Annotate every patent with its detected language
patents = pipeline.fit(patents).transform(patents)

# Keep the rows whose first detected language is English, then discard the
# language annotation column.
is_english = F.col("language.result")[0] == "en"
patents = patents.filter(is_english).drop("language")

print('Number of patents in English:', patents.count())

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[OK!]


22/03/22 13:50:26 ERROR scheduler.TaskSchedulerImpl: Lost executor 45 on node66.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/03/22 13:50:26 WARN scheduler.TaskSetManager: Lost task 182.0 in stage 18.0 (TID 61776) (node66.cluster.tsc.uc3m.es executor 45): ExecutorLostFailure (executor 45 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/03/22 13:50:26 WARN scheduler.TaskSetManager: Lost task 95.0 in stage 18.0 (TID 61752) (node66.cluster.tsc.uc3m.es executor 45): ExecutorLostFailure (executor 45 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/03/22 13:50:26 WARN scheduler.TaskSetManager: Lost task



Number of patents in English: 76348837
CPU times: user 925 ms, sys: 172 ms, total: 1.1 s
Wall time: 1h 29min 20s


[Stage 20:>                                                         (0 + 1) / 1]                                                                                

## 3. Define and Run Lemmatization Pipeline

   - We work on the documents created in Section 2
   - Sentence detection and tokenization are applied to obtain tokens
   - Lemmatization is carried out
   - Stopwords are removed
   - Punctuation symbols are removed and tokens are lowercased
   - The result is converted back from Spark NLP annotations to string format

In [9]:
%%time 

# Lemmatization pipeline. It consumes the "document" annotation column that the
# language-detection step left in `patents`.

# Split each document into sentences
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split each sentence into tokens
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Map each token to its lemma with the default pretrained lemmatizer
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Remove stopwords from the lemmas
stopWords = (
    StopWordsCleaner()
    .setInputCols(["lemma"])
    .setOutputCol("cleanlemma")
)

# Lowercase and strip every character matched by the cleanup pattern.
# The pattern is a raw string so that \w, \d and \s reach the regex engine
# without Python flagging them as invalid escape sequences; the string value
# is unchanged.
normalizer = (
    Normalizer()
    .setInputCols(["cleanlemma"])
    .setOutputCol("normalizedlemma")
    .setLowercase(True)
    .setCleanupPatterns([r"""[^\w\d\s]"""])
)

# Convert the Spark NLP annotations back to plain values; the result is read
# below from the "finished_normalizedlemma" column.
finisher = Finisher().setInputCols(['normalizedlemma'])

pipeline = Pipeline().setStages([
    sentenceDetector,
    tokenizer,
    lemmatizer,
    stopWords,
    normalizer,
    finisher,
])

# Apply the pipeline
patents = pipeline.fit(patents).transform(patents)

# Join the lemmas into one space-separated string.
# The built-in concat_ws replaces the former Python UDF (' '.join(list(x))):
# it avoids shipping every row to a Python worker and does not raise when the
# array is null.
patents = (
    patents.withColumn("lemmas", F.concat_ws(" ", F.col("finished_normalizedlemma")))
    .drop("rawtext")
    .drop("finished_normalizedlemma")
)

# Uncomment to inspect the result for a few patents
#patents.show(n=10, truncate=120, vertical=True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ / ]Download done! Loading the resource.




[ \ ]



[OK!]
CPU times: user 127 ms, sys: 14.9 ms, total: 142 ms
Wall time: 14.4 s


## 4. Save a table with `appln_id` and `lemmas` to HDFS

In [10]:
%%time

# Persist the lemmatized table to HDFS, replacing any previous version
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/PATSTAT/2021_Autumn")
output_path = (dir_parquet / "patstat_appln_NLP.parquet").as_posix()

(
    patents.coalesce(1000)
    .write.mode("overwrite")
    .parquet(output_path)
)

22/03/22 15:20:42 ERROR scheduler.TaskSchedulerImpl: Lost executor 8 on node89.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/03/22 15:20:42 WARN scheduler.TaskSetManager: Lost task 99.0 in stage 23.0 (TID 82163) (node89.cluster.tsc.uc3m.es executor 8): ExecutorLostFailure (executor 8 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/03/22 15:20:42 WARN scheduler.TaskSetManager: Lost task 91.0 in stage 23.0 (TID 82139) (node89.cluster.tsc.uc3m.es executor 8): ExecutorLostFailure (executor 8 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/03/22 15:20:42 WARN scheduler.TaskSetManager: Lost task 164.0

                                                                                

CPU times: user 1.61 s, sys: 385 ms, total: 2 s
Wall time: 1h 59min 33s


## 5. Optional: Check that the generated table looks OK

In [12]:
%%time

# Sanity check: reload the table that was just written, count its rows and
# display the first few records.
patents = spark.sql("SELECT * FROM parquet.`/export/ml4ds/IntelComp/Datalake/PATSTAT/2021_Autumn/patstat_appln_NLP.parquet`")
n_lemmatized = patents.count()
print('Number of lemmatized patents:', n_lemmatized)
patents.show(n=10, truncate=120, vertical=True)

22/03/22 18:00:34 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of lemmatized patents: 76348837
-RECORD 0----------------------------------------------------------------------------------------------------------------------------
 appln_id | 496467995                                                                                                                
 lemmas   | medicinal material sun room utility model disclose medicinal material sun room include room body glass cover equip ot... 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------
 appln_id | 4882879                                                                                                                  
 lemmas   | high expression locus vector based ferritin heavy chain gene locus high expression locus vector base part ferritin he... 
-RECORD 2----------------------------------------------------------------------------------------------------------------------------
 appln_id | 7630854    