# Lemmatize NIH project abstracts using Spark NLP

In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pathlib import Path
import pandas as pd

## 1. Read projects and abstracts. Concatenate text fields

**Important:** We use Pandas to parse the files because PySpark does not seem to parse the NIH CSV files correctly when some fields contain additional '"' characters. Pandas appears to handle them correctly.

We will also do some basic cleanup of the text fields. This is done directly in Pandas, although it may be better to do it in Spark.

Result is saved in a CSV file in the local folder

In [2]:
# Load the two NIH exports (projects first, then abstracts) with pandas.
# low_memory=False makes pandas read each file in a single pass instead of
# in chunks, which avoids mixed-dtype inference between chunks.
NIHprojects, NIHabstracts = (
    pd.read_csv(csv_url, low_memory=False)
    for csv_url in (
        'file:///export/data_ml4ds/IntelComp/Datasets/nih/csv/20220119/projects.csv',
        'file:///export/data_ml4ds/IntelComp/Datasets/nih/csv/20220119/abstracts.csv',
    )
)

In [3]:
def _clean_text(text, boilerplate=()):
    """Return a cleaned copy of a text Series.

    Missing values become empty strings, every phrase in ``boilerplate`` is
    removed (in the given order), double quotes are dropped and runs of
    whitespace are collapsed to a single space.

    ``Series.str.replace`` is used on purpose: ``Series.replace("x", "")``
    only replaces cells whose *entire* value equals ``"x"``, so it cannot
    strip a header or a quote character embedded in a longer text.
    """
    # fillna before casting, otherwise NaN would turn into the literal "nan"
    text = text.fillna("").astype(str)
    for phrase in boilerplate:
        text = text.str.replace(phrase, "", regex=False)
    text = text.str.replace('"', "", regex=False)
    # split()/join collapses any whitespace run (including newlines/tabs)
    return text.str.split().str.join(" ")


# Projects: keep only the id and the text columns; fillna returns a new frame
NIHprojects = NIHprojects[["APPLICATION_ID", "PROJECT_TITLE", "PROJECT_TERMS", "PHR"]].fillna("")
# Terms are ';'-separated without spaces; add one so tokenization works later
NIHprojects["PROJECT_TERMS"] = (
    NIHprojects["PROJECT_TERMS"].astype(str).str.replace(";", "; ", regex=False)
)
NIHprojects["PHR"] = _clean_text(NIHprojects["PHR"], ["PROJECT NARRATIVE "])

# Abstracts: copy the slice so the column assignment does not write to a view
NIHabstracts = NIHabstracts[["APPLICATION_ID", "ABSTRACT_TEXT"]].copy()
# The longer header must be removed first; it contains the shorter one
NIHabstracts["ABSTRACT_TEXT"] = _clean_text(
    NIHabstracts["ABSTRACT_TEXT"],
    ["PROJECT SUMMARY/ABSTRACT", "PROJECT SUMMARY"],
)

In [4]:
# Inner join: keep only the projects that have a matching abstract row
NIHprojects = pd.merge(
    NIHprojects,
    NIHabstracts,
    on="APPLICATION_ID",
    how="inner",
)

In [5]:
# Concatenate title, abstract, public health relevance (PHR) and terms into a
# single "rawtext" column, separated by ". ", and keep only id + rawtext.
NIHprojects = NIHprojects.assign(
    rawtext=lambda frame: (
        frame["PROJECT_TITLE"] + ". "
        + frame["ABSTRACT_TEXT"] + ". "
        + frame["PHR"] + ". "
        + frame["PROJECT_TERMS"]
    )
)[["APPLICATION_ID", "rawtext"]]

In [6]:
# Persist the concatenated text to a temporary CSV in the working directory
# (without the pandas index) so that it can be loaded from Spark afterwards.
NIHprojects.to_csv(path_or_buf="./NIHrawtext.csv", index=False)

## 2. Filter out abstracts that are not in English

In [19]:
%%time
# Read back the temporary CSV written by the pandas step. The URI is derived
# from the same relative path used for writing (and for the final cleanup)
# instead of a hard-coded absolute path in one user's home directory.
# NOTE(review): assumes the working directory is on a filesystem that the
# Spark executors can also reach through file:// - confirm for your cluster.
rawtext_uri = Path("./NIHrawtext.csv").resolve().as_uri()

# pandas escapes an embedded quote by doubling it (""), whereas Spark's CSV
# reader expects a backslash by default; escape='"' makes both agree.
NIHraw = spark.read.csv(rawtext_uri, header=True, quote='"', escape='"')

# Spread the rows over many partitions so the NLP stages run in parallel
NIHraw = NIHraw.repartition(2000)
print("Number of projects before language filtering:", NIHraw.count())

22/04/03 19:57:59 ERROR scheduler.AsyncEventQueue: Listener EventLoggingListener threw an exception
java.util.ConcurrentModificationException
	at java.util.Hashtable$Enumerator.next(Hashtable.java:1387)
	at scala.collection.convert.Wrappers$JPropertiesWrapper$$anon$6.next(Wrappers.scala:424)
	at scala.collection.convert.Wrappers$JPropertiesWrapper$$anon$6.next(Wrappers.scala:420)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.mutable.MapLike.toSeq(MapLike.scala:75)
	at scala.collection.mutable.MapLike.toSeq$(MapLike.scala:72)
	at scala.collection.mutable.AbstractMap.toSeq(Map.scala:82)
	at org.apache.spark.scheduler.EventLoggingListener.r

Number of projects before language filtering: 2278732
CPU times: user 36.7 ms, sys: 3.49 ms, total: 40.2 ms
Wall time: 30.6 s


                                                                                

In [20]:
%%time

# Language detection pipeline: wrap "rawtext" into a Spark NLP document and
# annotate it with the language predicted by the pretrained detector.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("rawtext")
    .setOutputCol("document")
)

languageDetector = (
    LanguageDetectorDL.pretrained()
    .setInputCols("document")
    .setOutputCol("language")
)

pipeline = Pipeline(stages=[documentAssembler, languageDetector])

# Run the pipeline, then keep only the rows whose first detected language is
# English; the "language" annotation column is not needed afterwards.
NIHraw = pipeline.fit(NIHraw).transform(NIHraw)
is_english = F.col("language.result").getItem(0) == "en"
NIHraw = NIHraw.filter(is_english).drop("language")

print('Number of projects in English:', NIHraw.count())

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[OK!]


22/04/03 19:59:13 ERROR scheduler.TaskSchedulerImpl: Lost executor 18 on node62.cluster.tsc.uc3m.es: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/04/03 19:59:13 WARN scheduler.TaskSetManager: Lost task 12.0 in stage 19.0 (TID 2281) (node62.cluster.tsc.uc3m.es executor 18): ExecutorLostFailure (executor 18 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/04/03 19:59:13 WARN scheduler.TaskSetManager: Lost task 2.0 in stage 19.0 (TID 2271) (node62.cluster.tsc.uc3m.es executor 18): ExecutorLostFailure (executor 18 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/04/03 19:59:13 WARN scheduler.TaskSetManager: Lost task 32.

Number of projects in English: 2231208
CPU times: user 203 ms, sys: 31.6 ms, total: 235 ms
Wall time: 16min 58s


                                                                                

## 3. Define and Run Lemmatization Pipeline

   - We work on documents created in Subsection 2
   - Sentence Detection and Tokenizer applied to detect tokens
   - Lemmatization is carried out
   - Stopwords are removed
   - Punctuation symbols are removed
   - Result is converted back from Spark NLP annotations to string format

In [22]:
%%time 

#Next, we carry out the lemmatization pipeline.
#Each stage reads the output column of the previous one, starting from the
#"document" column that the language detection step left in NIHraw.

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

stopWords = StopWordsCleaner() \
    .setInputCols(["lemma"]) \
    .setOutputCol("cleanlemma")

#Lowercase every lemma and strip any character that is not a word
#character, a digit or whitespace (i.e. punctuation symbols)
normalizer = Normalizer() \
    .setInputCols(["cleanlemma"]) \
    .setOutputCol("normalizedlemma") \
    .setLowercase(True) \
    .setCleanupPatterns(["""[^\w\d\s]"""])

#Converts the annotations back to plain values; the result is written to
#the column "finished_normalizedlemma"
finisher = Finisher() \
     .setInputCols(['normalizedlemma'])

pipeline = Pipeline() \
    .setStages([
      sentenceDetector,
      tokenizer,
      lemmatizer,
      stopWords,
      normalizer,
      finisher
])

#We apply pipeline and recover lemmas as string
NIHraw = pipeline.fit(NIHraw).transform(NIHraw)

#Join the array of lemmas with the built-in array_join instead of a Python
#UDF: it runs inside the JVM (no per-row serialization to Python) and does
#not fail on null arrays
NIHraw = (
    NIHraw.withColumn("lemmas", F.array_join(F.col("finished_normalizedlemma"), " "))
    .drop("rawtext")
    .drop("finished_normalizedlemma")
)

#Show results of validation for n papers
NIHraw.show(n=10, truncate=120, vertical=True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ / ]Download done! Loading the resource.




[ \ ]

                                                                                

[OK!]


[Stage 25:>                                                         (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------
 APPLICATION_ID | 6930018                                                                                                                  
 lemmas         | il17 mediated muc gene expression airway epithelium description provide applicant mucus hypersecretion persistent air... 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------
 APPLICATION_ID | 6878069                                                                                                                  
 lemmas         | maturation cognitiverelated brain potentials description provide applicant axiomatic developmental increase performan... 
-RECORD 2----------------------------------------------------------------------------------------------------------------------------------
 APPLICATION_ID | 69

                                                                                

## 4. Save a table with `APPLICATION_ID` and `lemmas` to HDFS

In [23]:
%%time

# Write the lemmatized table as parquet, replacing any previous output.
# coalesce(1000) caps the number of partitions (and hence of output files).
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/NIH/20220119")
parquet_target = (dir_parquet / "NIH_NLP.parquet").as_posix()

NIHraw.coalesce(1000).write.mode("overwrite").parquet(parquet_target)

2022-04-03 20:50:47,263:9854(0x7f7d74add640):ZOO_ERROR@handle_socket_error_msg@1782: Socket [10.0.12.18:2181] zk retcode=-4, errno=112(Host is down): failed while receiving a server response
I0403 20:50:47.264180 10638 group.cpp:452] Lost connection to ZooKeeper, attempting to reconnect ...
2022-04-03 20:50:47,270:9854(0x7f7d74add640):ZOO_INFO@check_events@1764: initiated connection to server [10.0.12.60:2181]
2022-04-03 20:50:47,313:9854(0x7f7d74add640):ZOO_ERROR@handle_socket_error_msg@1782: Socket [10.0.12.60:2181] zk retcode=-4, errno=112(Host is down): failed while receiving a server response
2022-04-03 20:50:47,313:9854(0x7f7d74add640):ZOO_INFO@check_events@1764: initiated connection to server [10.0.12.76:2181]
2022-04-03 20:50:47,314:9854(0x7f7d74add640):ZOO_ERROR@handle_socket_error_msg@1782: Socket [10.0.12.76:2181] zk retcode=-4, errno=112(Host is down): failed while receiving a server response
2022-04-03 20:50:47,314:9854(0x7f7d74add640):ZOO_INFO@check_events@1764: initiated

CPU times: user 830 ms, sys: 220 ms, total: 1.05 s
Wall time: 30min 39s


## 5. Delete temporary file

In [24]:
import os

# Temporary CSV that was created to hand the pandas data over to Spark
file = "./NIHrawtext.csv"

# isfile() is False for a missing path, so it also covers the existence check
if not os.path.isfile(file):
    print("file not found")
else:
    os.remove(file)
    print("file deleted")

file deleted
