In [1]:
sc.version

'3.1.1'

In [2]:
from pathlib import Path
from pyspark.sql.types import StringType, ArrayType
import pyspark.sql.functions as F
import requests

### Auxiliary functions

In [3]:
# Aux functions
def normalize(text):
    if isinstance(text, str):
        text = " ".join(text.split())
    return text


take_id = F.udf(lambda x: normalize(x[0] if len(x) > 0 else None), StringType())
take_authors_ids = F.udf(
    lambda x: [normalize(el[0] if len(el) > 0 else None) for el in x],
    ArrayType(StringType()),
)
norm_string = F.udf(normalize, StringType())

def get_pdf(pdf_list):
    pdf_list = [pdf for pdf in pdf_list if pdf.endswith(".pdf")]
    if len(pdf_list) > 0:
        return pdf_list[0]
    else:
        return None

get_first_pdf = F.udf(get_pdf, StringType())

### Define directories

In [6]:
dir_data = "/export/ml4ds/IntelComp/Datalake/SemanticScholar/prueba/"
dir_out = Path("/export/ml4ds/IntelComp/Datalake/SemanticScholar/out_prueba/")

### Read data files

In [10]:
df = spark.read.json(dir_data)
print(df.count())



330179


                                                                                

### Save authors

In [13]:
df_authors = df.select(F.explode("authors").alias("authors"))
df_authors = (
    df_authors.select("authors.ids", "authors.name")
    .withColumn("ids", take_id("ids"))
    .withColumn("name", norm_string("name"))
    .withColumnRenamed("ids", "id")
    .drop_duplicates(subset=["id"])
    .dropna(subset=["id"])
)
df_authors.write.parquet(
    dir_out.joinpath("parquet/authors.parquet").as_posix(), mode="overwrite"
)


                                                                                

### Save papers

In [9]:
columns = [
    "id",
    "title",
    "paperAbstract",
    "s2Url",
    "pdfUrls",
    "year",
    "sources",
    "doi",
    "doiUrl",
    "pmid",
    "magId",
    "fieldsOfStudy",
    "journalName",
    "journalPages",
    "journalVolume",
    "venue",
    "inCitations",
    "outCitations",
    "authors",
]
df_papers = df.select(columns)
df_papers = df_papers.withColumn("authors", take_authors_ids("authors.ids"))
for c in columns:
    if df.select(c).dtypes[0][1] == "string":
        df_papers = df_papers.withColumn(c, norm_string(c))

df_papers.write.parquet(
    dir_out.joinpath("parquet/papers.parquet").as_posix(),
    mode="overwrite",
)


                                                                                

### Download PDFs

In [24]:
pdf_urls = (
    df.select(["id", "pdfUrls"])
    .withColumn("pdfUrls", get_first_pdf("pdfUrls"))
    .filter(F.length("pdfUrls") > 0)
)
# pdf_urls.show(5, truncate=False)

def download_pdf(x):
    r = requests.get(x["pdfUrls"], stream=True)
    
    with dir_out.joinpath(f"pdfs/{x['id']}.pdf").open("wb") as f:
        f.write(r.content)

pdf_urls.foreach(download_pdf)


22/02/07 12:25:47 WARN scheduler.TaskSetManager: Lost task 5.0 in stage 30.0 (TID 1195) (node28.cluster.tsc.uc3m.es executor 8): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S25/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/8/runs/740c24e6-7d9a-4b8c-95ca-647346ffa302/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S25/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/8/runs/740c24e6-7d9a-4b8c-95ca-647346ffa302/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 594, in process
    out_iter = func(split_index, iterator)
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspark/rdd.py", line 2916, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspa

22/02/07 12:25:48 WARN scheduler.TaskSetManager: Lost task 4.0 in stage 30.0 (TID 1197) (node21.cluster.tsc.uc3m.es executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S12/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/0/runs/4826b956-2534-4ab6-b7b9-8feef6fa4923/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S12/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/0/runs/4826b956-2534-4ab6-b7b9-8feef6fa4923/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 594, in process
    out_iter = func(split_index, iterator)
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspark/rdd.py", line 2916, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspa

22/02/07 12:25:51 WARN scheduler.TaskSetManager: Lost task 9.0 in stage 30.0 (TID 1210) (node66.cluster.tsc.uc3m.es executor 4): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S53/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/4/runs/42483942-b133-4dca-a4f3-b0a06215ef30/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S53/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/4/runs/42483942-b133-4dca-a4f3-b0a06215ef30/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 594, in process
    out_iter = func(split_index, iterator)
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspark/rdd.py", line 2916, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspa

22/02/07 12:25:51 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 30.0 (TID 1204) (node28.cluster.tsc.uc3m.es executor 8): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S25/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/8/runs/740c24e6-7d9a-4b8c-95ca-647346ffa302/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S25/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0074/executors/8/runs/740c24e6-7d9a-4b8c-95ca-647346ffa302/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 594, in process
    out_iter = func(split_index, iterator)
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspark/rdd.py", line 2916, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/opt/spark-3.1.1-bin-2.8.3/python/pyspa

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 30.0 failed 4 times, most recent failure: Lost task 5.3 in stage 30.0 (TID 1215) (node28.cluster.tsc.uc3m.es executor 8): java.io.IOException: invalid code lengths set
	at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.inflateBytesDirect(Native Method)
	at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.decompress(ZlibDecompressor.java:225)
	at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111)
	at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
	at java.io.InputStream.read(InputStream.java:101)
	at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:218)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:176)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:193)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:37)
	at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.hasNext(HadoopFileLinesReader.scala:69)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1158)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1174)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:397)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:232)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: invalid code lengths set
	at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.inflateBytesDirect(Native Method)
	at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.decompress(ZlibDecompressor.java:225)
	at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111)
	at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
	at java.io.InputStream.read(InputStream.java:101)
	at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:218)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:176)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:193)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:37)
	at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.hasNext(HadoopFileLinesReader.scala:69)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1158)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1174)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:397)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:232)


22/02/07 12:25:51 WARN scheduler.TaskSetManager: Lost task 3.0 in stage 30.0 (TID 1207) (node28.cluster.tsc.uc3m.es executor 8): TaskKilled (Stage cancelled)
22/02/07 12:25:52 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 30.0 (TID 1205) (node28.cluster.tsc.uc3m.es executor 8): TaskKilled (Stage cancelled)
22/02/07 12:25:53 WARN scheduler.TaskSetManager: Lost task 2.0 in stage 30.0 (TID 1206) (node28.cluster.tsc.uc3m.es executor 8): TaskKilled (Stage cancelled)
22/02/07 12:25:53 WARN scheduler.TaskSetManager: Lost task 4.3 in stage 30.0 (TID 1214) (node21.cluster.tsc.uc3m.es executor 0): TaskKilled (Stage cancelled)
22/02/07 12:25:53 WARN scheduler.TaskSetManager: Lost task 6.3 in stage 30.0 (TID 1212) (node21.cluster.tsc.uc3m.es executor 0): TaskKilled (Stage cancelled)


In [25]:
download_pdf({'id': 'ea54ec0ab5fa16d842d07806191a5c8e260228e0', 'pdfUrls': 'http://familyrepository.lppkn.gov.my/265/1/Financing_Old_Age_(Mukul_G._Asher).pdf'})

FileNotFoundError: [Errno 2] No such file or directory: '/export/ml4ds/IntelComp/Datalake/SemanticScholar/out_prueba/pdfs/ea54ec0ab5fa16d842d07806191a5c8e260228e0.pdf'

In [21]:
pdf_urls.take(2)

                                                                                

[Row(id='ea54ec0ab5fa16d842d07806191a5c8e260228e0', pdfUrls='http://familyrepository.lppkn.gov.my/265/1/Financing_Old_Age_(Mukul_G._Asher).pdf'),
 Row(id='e56659d0b007a08f449dd39a59d188638b90723f', pdfUrls='https://www.iema.net/assets/uploads/EIA%20Articles/CBA%20Green%20Infrastructure%20and%20EIA.pdf')]