# Lemmatize NIH project abstracts using Spark NLP

In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pathlib import Path
import pandas as pd

## 1. Read projects and abstracts. Concatenate text fields

**Important:** We will use Pandas to parse the files, because pySpark does not seem to process NIH CSV files correctly when there are additional '"' in some of the files. Pandas seems to do it OK.

We will also do some basic cleanup of the text fields. This is directly done in Pandas, it may be better to do it in Spark

Result is saved in a CSV file in the local folder

In [2]:
NIHprojects = pd.read_csv('file:///export/data_ml4ds/IntelComp/Datasets/nih/csv/20220119/projects.csv', low_memory=False)
NIHabstracts = pd.read_csv('file:///export/data_ml4ds/IntelComp/Datasets/nih/csv/20220119/abstracts.csv', low_memory=False)

In [3]:
NIHprojects = NIHprojects[["APPLICATION_ID", "PROJECT_TITLE", "PROJECT_TERMS", "PHR"]]
NIHprojects = NIHprojects.fillna("")
NIHprojects["PROJECT_TERMS"] = NIHprojects["PROJECT_TERMS"].apply(lambda x: x.replace(";", "; "))
NIHprojects["PHR"] = (NIHprojects["PHR"]
                          .apply(str)
                          .replace("PROJECT NARRATIVE ", "")
                          .replace('"', '')
                          .apply(lambda x: " ".join(x.split()))
                     )
NIHabstracts = NIHabstracts[["APPLICATION_ID", "ABSTRACT_TEXT"]]
NIHabstracts["ABSTRACT_TEXT"] = (NIHabstracts["ABSTRACT_TEXT"]
                                    .apply(str)
                                    .replace("PROJECT SUMMARY/ABSTRACT", "")
                                    .replace("PROJECT SUMMARY", "")
                                    .replace('"', '')
                                    .apply(lambda x: " ".join(x.split()))
                                )

In [4]:
NIHprojects = NIHprojects.merge(NIHabstracts, how="inner", on="APPLICATION_ID")

In [5]:
NIHprojects["rawtext"] = NIHprojects["PROJECT_TITLE"] + ". " + \
                         NIHprojects["ABSTRACT_TEXT"] + ". " + \
                         NIHprojects["PHR"] + ". " + \
                         NIHprojects["PROJECT_TERMS"]
NIHprojects = NIHprojects[["APPLICATION_ID", "rawtext"]]

In [6]:
#Dataframe will be saved to temporary file
NIHprojects.to_csv("./NIHrawtext.csv", index=False)

In [7]:
pwd

'/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/aux/lemmatization'

## 2. Filter abstracts that are not in English Language

In [8]:
%%time
NIHraw = spark.read.csv("file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/aux/lemmatization/NIHrawtext.csv", header=True)
NIHraw = NIHraw.repartition(2000)
print("Number of projects before language filtering:", NIHraw.count())
#NIHraw.show(n=10, truncate=120, vertical=True)



Number of projects before language filtering: 2278732
CPU times: user 44.2 ms, sys: 12.5 ms, total: 56.7 ms
Wall time: 42 s


                                                                                

In [9]:
%%time

#Pipeline for language detection
documentAssembler = DocumentAssembler() \
    .setInputCol("rawtext") \
    .setOutputCol("document")

languageDetector = LanguageDetectorDL.pretrained() \
    .setInputCols("document") \
    .setOutputCol("language")

pipeline = Pipeline() \
    .setStages([
      documentAssembler,
      languageDetector
    ])

#Apply language detection pipeline
NIHraw = pipeline.fit(NIHraw).transform(NIHraw)
NIHraw = (
    NIHraw.filter(F.col("language.result")[0]=="en")
    .drop("language")
)

print('Number of projects in English:', NIHraw.count())

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ | ]ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
Download done! Loading the resource.
[ / ]



[ — ]



[ \ ]



[ | ]

2022-05-12 12:51:00.039582: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]




Number of projects in English: 2231208
CPU times: user 176 ms, sys: 31.8 ms, total: 208 ms
Wall time: 3min 43s


                                                                                

## 3. Define and Run Lemmatization Pipeline

   - We work on documents created in Subsection 2
   - Sentence Detection and Tokenizer applied to detect tokens
   - Lemmatization is carried out
   - Stopwords are applied
   - Punctuation symbols are removed
   - Result is converted back from Spark NLP annotations to string format

In [10]:
%%time 

#Next, we carry out the lemmatization pipeline

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

stopWords = StopWordsCleaner() \
    .setInputCols(["lemma"]) \
    .setOutputCol("cleanlemma")

normalizer = Normalizer() \
    .setInputCols(["cleanlemma"]) \
    .setOutputCol("normalizedlemma") \
    .setLowercase(True) \
    .setCleanupPatterns(["""[^\w\d\s]"""])

finisher = Finisher() \
     .setInputCols(['normalizedlemma'])

pipeline = Pipeline() \
    .setStages([
      sentenceDetector,
      tokenizer,
      lemmatizer,
      stopWords,
      normalizer,
      finisher
])

#We apply pipeline and recover lemmas as string
NIHraw = pipeline.fit(NIHraw).transform(NIHraw)

udf_back2str = F.udf(lambda x:' '.join(list(x)), StringType() )
NIHraw = (
    NIHraw.withColumn("lemmas",udf_back2str(F.col("finished_normalizedlemma")))
    .drop("finished_normalizedlemma")
)

#Show results of validation for n papers
NIHraw.show(n=10, truncate=120, vertical=True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[ / ]



[ — ]

                                                                                

[OK!]


[Stage 13:>                                                         (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------
 APPLICATION_ID | 8685183                                                                                                                  
 rawtext        | Targeting Endoplasmic Reticulum Stress Response for Cancer Therapy. DESCRIPTION (provided by applicant): The overall ... 
 lemmas         | targeting endoplasmic reticulum stress response cancer therapy description provide applicant overall goal proposal te... 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------
 APPLICATION_ID | 8839547                                                                                                                  
 rawtext        | Bifunctional antibodies with targeted CNS delivery against West Nile Virus. PROJECT SUMMARY/ABSTRACT West Nile Virus ... 
 lemmas         | bi

                                                                                

## 4. Save a table with `APPLICATION_ID` and `lemmas` to HDFS

In [11]:
%%time

#Save calculated lemmas to HDFS
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/NIH/20220119")

NIHraw.coalesce(1000).write.parquet(
    dir_parquet.joinpath(f"projects_NLP.parquet").as_posix(),
    mode="overwrite",
)

                                                                                

CPU times: user 458 ms, sys: 102 ms, total: 560 ms
Wall time: 6min 52s


## 5. Delete temporary file

In [12]:
import os 

file = "./NIHrawtext.csv"

if(os.path.exists(file) and os.path.isfile(file)):
    os.remove(file)
    print("file deleted")
else:
    print("file not found")

file deleted
