# Lemmatize CORDIS projects using Spark NLP

In [2]:
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pathlib import Path

## 1. Read project information and concatenate the `title` and `objective` fields

In [3]:
%%time

# Loading papers table text fields, and concatenating them for lemmatization
projects = spark.sql("SELECT projectID, title, objective FROM parquet.`/export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects.parquet`")
#projects = spark.read.parquet('/export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects.parquet')
projects = projects.repartition(numPartitions=1000)

#Concatenate text fields to lemmatize
projects = (
    projects.withColumn("rawtext",F.concat_ws('. ', "title", "objective"))
    .select("projectID","rawtext")
)

print('Number of projects before language filtering:', projects.count())

22/05/12 12:35:24 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException

Number of projects before language filtering: 61163
CPU times: user 41.5 ms, sys: 6.88 ms, total: 48.4 ms
Wall time: 21.2 s


[Stage 4:>                                                          (0 + 1) / 1]                                                                                

## 2. Filter project abstracts that are not in English Language

In [4]:
%%time

#Pipeline for language detection
documentAssembler = DocumentAssembler() \
    .setInputCol("rawtext") \
    .setOutputCol("document")

languageDetector = LanguageDetectorDL.pretrained() \
    .setInputCols("document") \
    .setOutputCol("language")

pipeline = Pipeline() \
    .setStages([
      documentAssembler,
      languageDetector
    ])

#Apply language detection pipeline
projects = pipeline.fit(projects).transform(projects)
projects = (
    projects.filter(F.col("language.result")[0]=="en")
    .drop("language")
)

print('Number of projects in English:', projects.count())

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ | ]ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ / ]Download done! Loading the resource.
[ — ]



[ \ ]

                                                                                

[ | ]



[ / ]

2022-05-12 12:36:25.522788: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]




Number of projects in English: 60626
CPU times: user 74.1 ms, sys: 18.1 ms, total: 92.2 ms
Wall time: 36.8 s




## 3. Define and Run Lemmatization Pipeline

   - We work on documents created in Subsection 2
   - Sentence Detection and Tokenizer applied to detect tokens
   - Lemmatization is carried out
   - Stopwords are applied
   - Punctuation symbols are removed
   - Result is converted back from Spark NLP annotations to string format

In [5]:
%%time 

#Next, we carry out the lemmatization pipeline

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

stopWords = StopWordsCleaner() \
    .setInputCols(["lemma"]) \
    .setOutputCol("cleanlemma")

normalizer = Normalizer() \
    .setInputCols(["cleanlemma"]) \
    .setOutputCol("normalizedlemma") \
    .setLowercase(True) \
    .setCleanupPatterns(["""[^\w\d\s]"""])

finisher = Finisher() \
     .setInputCols(['normalizedlemma'])

pipeline = Pipeline() \
    .setStages([
#      documentAssembler,
      sentenceDetector,
      tokenizer,
      lemmatizer,
      stopWords,
      normalizer,
      finisher
])

#We apply pipeline and recover lemmas as string
projects = pipeline.fit(projects).transform(projects)

udf_back2str = F.udf(lambda x:' '.join(list(x)), StringType() )
projects = (
    projects.withColumn("lemmas",udf_back2str(F.col("finished_normalizedlemma")))
    .drop("finished_normalizedlemma")
)

#Show results of validation for n papers
projects.show(n=10, truncate=120, vertical=True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.




[ — ]

                                                                                

[OK!]


[Stage 14:>                                                         (0 + 1) / 1]

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------
 projectID | 963941                                                                                                                   
 rawtext   | Acquiring assembly skills by robot learning. Present-day industrial robots are made for the purpose of repeating seve... 
 lemmas    | acquiring assembly skill robot learn presentday industrial robot make purpose repeat several task thousands time them... 
-RECORD 1-----------------------------------------------------------------------------------------------------------------------------
 projectID | 268351                                                                                                                   
 rawtext   | Manycore Application Development and Modeling Environment. The multicore revolution (parallel computing on a chip) is... 
 lemmas    | manycore application development modeling 

                                                                                

## 4. Save a table with `projectID`, `rawtext` and `lemmas` to HDFS

In [6]:
%%time

#Save calculated lemmas to HDFS
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet")

projects.coalesce(40).write.parquet(
    dir_parquet.joinpath(f"projects_NLP.parquet").as_posix(),
    mode="overwrite",
)

                                                                                

CPU times: user 55.4 ms, sys: 2.32 ms, total: 57.8 ms
Wall time: 33 s


## 5. Optional: Check that the generated table looks OK

In [21]:
%%time

#Test that the saved table is correct
projects = spark.read.parquet('/export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects_NLP.parquet')
print('Number of lemmatized projects:', projects.count())
projects.show(n=10, truncate=120, vertical=True)



Number of lemmatized projects: 61163
-RECORD 0-----------------------------------------------------------------------------------------------------------------------------
 projectID | 689074                                                                                                                   
 lemmas    | holistic optimisation ship design operation life cycle maritime product typically associate large investment seldom b... 
-RECORD 1-----------------------------------------------------------------------------------------------------------------------------
 projectID | 251681                                                                                                                   
 lemmas    | development nitric oxide releasing stent treatment coronary artery disease coronary artery disease one lead cause dea... 
-RECORD 2-----------------------------------------------------------------------------------------------------------------------------
 projectID | 75605



In [22]:
%%time

projects = spark.read.parquet('/export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects.parquet')
lemmas = spark.read.parquet('/export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects_NLP.parquet')

projects_lemmas = (projects.join(lemmas, projects.projectID ==  lemmas., "right")
                      .drop(lemmas.id)
                )

print('Number of projects in joint table:', projects_lemmas.count())
projects_lemmas.show(n=10, truncate=120, vertical=True)

AttributeError: 'DataFrame' object has no attribute 'id'

In [18]:
%%time

#Using SQL Expression
papers_lemmas = papers_lemmas.filter(F.array_contains("fieldsOfStudy", 'Computer Science'))
print('Number of papers in Computer Science:', papers_lemmas.count())
papers_lemmas.show(n=10, truncate=120, vertical=True)

                                                                                

Number of papers in Computer Science: 13654631


[Stage 70:>                                                         (0 + 1) / 1]

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------
 id            | 00004ddfe8089303589fb12cddc05fefc7a0bd96                                                                                 
 fieldsOfStudy | [Computer Science]                                                                                                       
 lemmas        | using static total causal ordering protocols achieve ordered view synchrony view synchronous communication vsc servic... 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------
 id            | 000051c2d8eff18654e5eaf3e636c02028ef96bb                                                                                 
 fieldsOfStudy | [Computer Science]                                                                                                       
 lemmas        | author thi

                                                                                