In [1]:
sc.version

'3.1.1'

### Imports

In [2]:
from configparser import ConfigParser
from pathlib import Path
import pyspark.sql.functions as F
import requests
from pyspark.sql.types import ArrayType, StringType

### Define directories

In [3]:
# Define directories
# 
# Relevant directories are read from the config file:
# dir_data:    full path to hdfs directory where the raw data .gz files are stored
# dir_parquet: full path to hdfs directory where the parquet tables will be stored
# version:     Version of Semantic Scholar that is being processed 
#              for information purposes only

cf = ConfigParser()
cf.read("../config.cf")

dir_data = Path(cf.get("data", "dir_data"))
dir_parquet = Path(cf.get("data", "dir_parquet"))
version = cf.get("data", "version")

# Create output directories if they do not exist
# !hadoop dfs ...
# !hadoop dfs -put 20220201 /export/ml4ds/IntelComp/Datalake/SemanticScholar/

### Auxiliary functions

In [4]:
def normalize(text):
    """
    Removes extra spaces in text
    """
    if isinstance(text, str):
        text = " ".join(text.split())
    return text


def get_pdf(pdf_list):
    """
    Gets the first valid pdf url for a paper
    """
    pdf_list = [pdf for pdf in pdf_list if pdf.endswith(".pdf")]
    if len(pdf_list) > 0:
        return pdf_list[0]
    else:
        return None


#
# Create user defined functions to apply in dataframes
#

# Obtain ID from author
take_id = F.udf(lambda x: normalize(x[0] if len(x) > 0 else None), StringType())

# For each paper get all authors
take_authors_ids = F.udf(
    lambda x: [normalize(el[0] if len(el) > 0 else None) for el in x],
    ArrayType(StringType()),
)

# Remove extra spaces
norm_string = F.udf(normalize, StringType())

# Get first valid pdf url
get_first_pdf = F.udf(get_pdf, StringType())


### Read data files

In [5]:
%%time

# Read data files
#
# Create a spark df with all the papers in all json files

df = spark.read.json(dir_data.as_posix())

22/02/09 10:39:44 WARN datasources.SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

CPU times: user 955 ms, sys: 128 ms, total: 1.08 s
Wall time: 9min 1s


### Create papers dataframe and save as parquet file

In [14]:
%%time

# Create papers dataframe and save as parquet file
#
# Papers table will be created keeping only a subset of desired columns
# It is then stored in disk as a parquet file

# Columns to save
columns = [
    "id",
    "title",
    "paperAbstract",
    "s2Url",
    "pdfUrls",
    "year",
    "sources",
    "doi",
    "doiUrl",
    "pmid",
    "magId",
    "fieldsOfStudy",
    "journalName",
    "journalPages",
    "journalVolume",
    "venue",
]
# Select papers info
df_papers = df.select(columns)

# Clean info
for c in columns:
    if df.select(c).dtypes[0][1] == "string":
        df_papers = df_papers.withColumn(c, norm_string(c))

# Save dataframe as parquet
df_papers.write.parquet(
    dir_parquet.joinpath("papers.parquet").as_posix(),
    mode="overwrite",
)

print('Number of papers in S2 version ' + version + ':', df_papers.count())

                                                                                

CPU times: user 899 ms, sys: 291 ms, total: 1.19 s
Wall time: 18min 33s


### Create authors dataframe and save as parquet file

In [13]:
%%time

# Create authors dataframe and save as parquet file
#
# Authors table will be created from all authors listed in every paper
# - Duplicates will be removed keeping only one row for each author id
# - Authors with empty ids will also be removed from dataframe

# Select only the authors
df_authors = df.select(F.explode("authors").alias("authors"))

# Convert dataframe into two columns (id, author name)
df_authors = (
    df_authors.select("authors.ids", "authors.name")
    .withColumn("ids", take_id("ids"))
    .withColumn("name", norm_string("name"))
    .withColumnRenamed("ids", "id")
    .drop_duplicates(subset=["id"])
    .dropna(subset=["id"])
)

# Save dataframe as parquet
df_authors.write.parquet(
    dir_parquet.joinpath("authors.parquet").as_posix(), mode="overwrite"
)

print('Number of authors in S2 version ' + version + ':', df_authors.count())



Number of authors in S2 version 20220201: 76753643
CPU times: user 2.51 s, sys: 664 ms, total: 3.17 s
Wall time: 31min 11s


                                                                                

### Create citations dataframe and save as parquet file

In [15]:
%%time

# Create citations dataframe and save as parquet file
#
# We create a row paper_source_id -> paper_destination_id
# by exploding all citations of all papers in the version

# Select paper-authors info
df_citations = df.select(["id", "outCitations"])
df_citations = (
    df_citations.withColumn("outCitations", F.explode("outCitations"))
    .withColumnRenamed("id", "source")
    .withColumnRenamed("outCitations", "dest")
)

# Save dataframe as parquet
df_citations.write.parquet(
    dir_parquet.joinpath("citations.parquet").as_posix(),
    mode="overwrite",
)

print('Number of citations in S2 version ' + version + ':', df_citations.count())



Number of citations in S2 version 20220201: 2004466999
CPU times: user 1.57 s, sys: 257 ms, total: 1.82 s
Wall time: 27min 10s


                                                                                

### Create paper_author dataframe and save as parquet file

In [18]:
%%time

# Create paper_author dataframe and save as parquet file
#
# We create a row paper_id -> author_id
# by exploding all authors of all papers in the version

# Select paper-authors info
df_paperAuthor = df.select(["id", "authors"])
df_paperAuthor = (
    df_paperAuthor.withColumn("authors", F.explode(take_authors_ids("authors.ids")))
    .withColumnRenamed("id", "paper_id")
    .withColumnRenamed("authors", "author_id")
    .dropna(subset=["author_id"])
)

# Save dataframe as parquet
df_paperAuthor.write.parquet(
    dir_parquet.joinpath("paper_author.parquet").as_posix(),
    mode="overwrite",
)

print('Number of authorships in S2 version ' + version + ':', df_paperAuthor.count())

22/02/09 09:22:00 WARN scheduler.TaskSetManager: Lost task 2000.0 in stage 17.0 (TID 28622) (node47.cluster.tsc.uc3m.es executor 7): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apach

Py4JJavaError: An error occurred while calling o431.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:874)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2000 in stage 17.0 failed 4 times, most recent failure: Lost task 2000.3 in stage 17.0 (TID 28625) (node94.cluster.tsc.uc3m.es executor 9): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_14079/581591478.py", line 30, in <lambda>
TypeError: 'NoneType' object is not iterable

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:84)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:67)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1158)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1174)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1212)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:397)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:232)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:200)
	... 33 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/export/workdir/mesos/work/slaves/e24e86b9-cf25-4fa7-8335-c4e89b35b268-S27/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/9/runs/84a4a4ef-58f5-4894-99c4-033093e583f1/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_14079/581591478.py", line 30, in <lambda>
TypeError: 'NoneType' object is not iterable

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:84)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:67)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1158)
	at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1174)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1212)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1215)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:397)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1996)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:232)


In [19]:
df_paperAuthor.count()

22/02/09 09:37:37 WARN scheduler.TaskSetManager: Lost task 2000.0 in stage 18.0 (TID 30626) (node79.cluster.tsc.uc3m.es executor 3): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S68/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/3/runs/ab751468-0efc-47f9-9c06-75656596c26f/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S68/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/3/runs/ab751468-0efc-47f9-9c06-75656596c26f/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S68/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/3/runs/ab751468-0efc-47f9-9c06-7

22/02/09 09:37:42 WARN scheduler.TaskSetManager: Lost task 2000.2 in stage 18.0 (TID 30628) (node85.cluster.tsc.uc3m.es executor 8): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S66/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/8/runs/643b5215-210d-4086-8630-5919fef9715a/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S66/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/8/runs/643b5215-210d-4086-8630-5919fef9715a/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S66/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/8/runs/643b5215-210d-4086-8630-5

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/export/workdir/mesos/work/slaves/398d8fdd-ddd7-4a42-a8ea-a162fc6bbcf5-S73/frameworks/eee63918-d2dc-408c-9403-2e3d0577668d-0080/executors/1/runs/07390edb-77d5-42dd-bbeb-809d349e2edf/spark-3.1.1-bin-2.8.3/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_14079/581591478.py", line 30, in <lambda>
TypeError: 'NoneType' object is not iterable


22/02/09 09:37:44 WARN scheduler.TaskSetManager: Lost task 1997.0 in stage 18.0 (TID 30623) (node42.cluster.tsc.uc3m.es executor 6): TaskKilled (Stage cancelled)


### Download PDFs

In [42]:
## Download PDFs
#
# We download PDFs for all papers with valid a valid pdfUrl
# This option is not activated by default, since the number
# of papers to download would be huge

paper_download = 0

if paper_download:
    pdf_urls = (
        df.select(["id", "pdfUrls"])
        .withColumn("pdfUrls", get_first_pdf("pdfUrls"))
        .filter(F.length("pdfUrls") > 0)
    )
    # pdf_urls.show(5, truncate=False)


    def download_pdf(x):
        
        try:
            # If path exists do nothing
            
            else: 
            r = requests.get(x["pdfUrls"], stream=True)
            with Path("/export/data_ml4ds/IntelComp/Datasets/semanticscholar/rawdata/pdfs/" + x['id'] + ".pdf").open("wb") as f:
                f.write(r.content)
        except:
            pass


    pdf_urls.foreach(download_pdf)


22/02/09 10:31:26 WARN scheduler.TaskSetManager: Lost task 29.0 in stage 29.0 (TID 31224) (node85.cluster.tsc.uc3m.es executor 8): java.io.IOException: incorrect header check
	at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.inflateBytesDirect(Native Method)
	at org.apache.hadoop.io.compress.zlib.ZlibDecompressor.decompress(ZlibDecompressor.java:225)
	at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111)
	at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
	at java.io.InputStream.read(InputStream.java:101)
	at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:218)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:176)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.skipUtfByteOrderMark(LineRecordReader.java:151)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.j

KeyboardInterrupt: 

Hacer un Script que saque las siguientes estadísticas:
(df = spark.sql("SELECT * FROM papers.parquet WHERE")

   - Contar entradas de las tablas de papers, authors, citations, paperAuthors
   - Cuántos papers para cada FieldOfScience
   - Cuántos papers con pmid válido / magid válido
   - Cuántos dois únicos hay en el dataset
   - Cuántos papers tienen un Abstract de más de 256 chars