# Lemmatize Semantic Scholar papers using Spark NLP

In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pathlib import Path

## 1. Read papers and concatenate the `title` and `paperAbstract` fields

In [2]:
%%time

# Loading papers table text fields, and concatenating them for lemmatization
S2papers = spark.sql("SELECT id, title, paperAbstract FROM parquet.`/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers.parquet`")
S2papers = S2papers.repartition(numPartitions=20000)
##For development purposes only
#S2papers = S2papers.sample(fraction=0.0001)

#Concatenate text fields to lemmatize
S2papers = (
    S2papers.withColumn("rawtext",F.concat_ws('. ', "title", "paperAbstract"))
    .drop("title")
    .drop("paperAbstract")
)

print('Number of papers before language filtering:', S2papers.count())

22/05/11 19:50:00 WARN conf.HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/05/11 19:50:00 WARN conf.HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/05/11 19:50:02 WARN metastore.ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
22/05/11 19:50:02 WARN metastore.ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore jarenas@192.168.148.225
22/05/11 19:50:03 WARN metastore.ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
22/05/11 19:50:03 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
[Stage 3:>                                                          (0 + 1) / 1]

Number of papers before language filtering: 204457855
CPU times: user 145 ms, sys: 24.9 ms, total: 170 ms
Wall time: 1min 1s



                                                                                

## 2. Filter out papers that are not in English

In [3]:
%%time

#Pipeline for language detection
documentAssembler = DocumentAssembler() \
    .setInputCol("rawtext") \
    .setOutputCol("document")

languageDetector = LanguageDetectorDL.pretrained() \
    .setInputCols("document") \
    .setOutputCol("language")

pipeline = Pipeline() \
    .setStages([
      documentAssembler,
      languageDetector
    ])

#Apply language detection pipeline
S2papers = pipeline.fit(S2papers).transform(S2papers)
S2papers = (
    S2papers.filter(F.col("language.result")[0]=="en")
    .drop("language")
)

print('Number of papers in English:', S2papers.count())

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ | ]ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ / ]Download done! Loading the resource.
[ — ]




[ \ ]



[ / ]

2022-05-11 19:51:24.343330: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


[Stage 9:>                                                          (0 + 1) / 1]

Number of papers in English: 150077543
CPU times: user 2.13 s, sys: 368 ms, total: 2.5 s
Wall time: 3h 10min 26s



                                                                                

## 3. Define and Run Lemmatization Pipeline

   - We work on the documents created in Section 2
   - Sentence detection and tokenization are applied to split the text into tokens
   - Lemmatization is carried out
   - Stopwords are removed
   - Tokens are lowercased and punctuation symbols are removed
   - The result is converted back from Spark NLP annotations to string format

In [4]:
%%time 

#Next, we carry out the lemmatization pipeline

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

stopWords = StopWordsCleaner() \
    .setInputCols(["lemma"]) \
    .setOutputCol("cleanlemma")

normalizer = Normalizer() \
    .setInputCols(["cleanlemma"]) \
    .setOutputCol("normalizedlemma") \
    .setLowercase(True) \
    .setCleanupPatterns(["""[^\w\d\s]"""])

finisher = Finisher() \
     .setInputCols(['normalizedlemma'])

pipeline = Pipeline() \
    .setStages([
      sentenceDetector,
      tokenizer,
      lemmatizer,
      stopWords,
      normalizer,
      finisher
])

#We apply pipeline and recover lemmas as string
S2papers = pipeline.fit(S2papers).transform(S2papers)

udf_back2str = F.udf(lambda x:' '.join(list(x)), StringType() )
S2papers = (
    S2papers.withColumn("lemmas",udf_back2str(F.col("finished_normalizedlemma")))
    .drop("finished_normalizedlemma")
)

#Show results of validation for n papers
#S2papers.show(n=10, truncate=120, vertical=True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.




[ — ]



                                                                                

[OK!]
CPU times: user 101 ms, sys: 20 ms, total: 121 ms
Wall time: 9.62 s


## 4. Save a table with `id`, `rawtext` and `lemmas` to HDFS

In [5]:
%%time

#Save calculated lemmas to HDFS
dir_parquet = Path("/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201")

S2papers.coalesce(1000).write.parquet(
    dir_parquet.joinpath(f"papers_NLP.parquet").as_posix(),
    mode="overwrite",
)

                                                                                

CPU times: user 3.11 s, sys: 546 ms, total: 3.66 s
Wall time: 4h 6min 27s


## 5. Optional: Check that the generated table looks OK

In [6]:
%%time

#Test that the saved table is correct
S2papers = spark.sql("SELECT * FROM parquet.`/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers_NLP.parquet`")
print('Number of lemmatized papers:', S2papers.count())
S2papers.show(n=10, truncate=120, vertical=True)

22/05/12 09:34:49 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of lemmatized papers: 150077544
-RECORD 0---------------------------------------------------------------------------------------------------------------------------
 id      | cdb32d6345b0b4243d98b2317e76bdf23eddb77c                                                                                 
 rawtext | ANALYZING FACTORS AFFECTING ON THE ENERGY PRODUCTIVITY IN IRAN’S AGRICULTURAL SECTOR.                                    
 lemmas  | analyzing factors affecting energy productivity irans agricultural sector                                                
-RECORD 1---------------------------------------------------------------------------------------------------------------------------
 id      | 6bf0a5640aacbfc75ec2b9940d2121ac348972bd                                                                                 
 rawtext | 2456 Diamagnetic susceptibility of C2H9NaO5.                                                                             
 lemmas  | 2456 diamagnetic su

In [7]:
%%time

papers = spark.sql("SELECT id, fieldsOfStudy FROM parquet.`/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers.parquet`")
lemmas = spark.sql("SELECT id, rawtext, lemmas FROM parquet.`/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers_NLP.parquet`")

papers_lemmas = (papers.join(lemmas, papers.id ==  lemmas.id, "right")
                      .drop(lemmas.id)
                )

print('Number of papers in joint table:', papers_lemmas.count())
papers_lemmas.show(n=10, truncate=120, vertical=True)

22/05/12 09:51:01 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
22/05/12 09:51:01 WARN metastore.ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of papers in joint table: 150077686


[Stage 26:>                                                         (0 + 1) / 1]

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------
 id            | 000007d87901458ae6f88092ab0ac01388b11fcf                                                                                 
 fieldsOfStudy | []                                                                                                                       
 rawtext       | The Philippines. What people in other countries call vernacular architecture we call folk architecture here, but most... 
 lemmas        | philippines people country call vernacular architecture call folk architecture mostly identify rural bahay kubo liter... 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------
 id            | 00001c5efb21112a47918810af4281e3922803b8                                                                                 
 fieldsOfStudy | []        


                                                                                

In [8]:
%%time

#Using SQL Expression
papers_lemmas = papers_lemmas.filter(F.array_contains("fieldsOfStudy", 'Computer Science'))
print('Number of papers in Computer Science:', papers_lemmas.count())
papers_lemmas.show(n=10, truncate=120, vertical=True)

                                                                                

Number of papers in Computer Science: 13654632


[Stage 33:>                                                         (0 + 1) / 1]

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------
 id            | 00004ddfe8089303589fb12cddc05fefc7a0bd96                                                                                 
 fieldsOfStudy | [Computer Science]                                                                                                       
 rawtext       | Using Static Total Causal Ordering Protocols to Achieve Ordered View Synchrony. A View Synchronous Communication (VSC... 
 lemmas        | using static total causal ordering protocols achieve ordered view synchrony view synchronous communication vsc servic... 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------
 id            | 000051c2d8eff18654e5eaf3e636c02028ef96bb                                                                                 
 fieldsOfStudy | [Computer 


                                                                                