In [1]:
# Display the Spark version of the active context. `sc` is not created in the
# visible cells, so it is presumably the SparkContext injected by the kernel
# -- TODO confirm.
sc.version

'3.1.1'

In [37]:
from pathlib import Path
from pyspark.sql.types import StringType, ArrayType
import pyspark.sql.functions as F
import requests

### Auxiliary functions

In [93]:
# Aux functions
def normalize(text):
    """Collapse every run of whitespace in a string to a single space.

    Leading and trailing whitespace is removed as a side effect of the
    split/join. Anything that is not a `str` (for example `None`) is handed
    back untouched.
    """
    if not isinstance(text, str):
        return text
    tokens = text.split()
    return " ".join(tokens)


def _first_or_none(values):
    """Return the whitespace-normalized first element of `values`.

    Returns None when `values` is None or empty, so a null array column does
    not raise inside the UDF.
    """
    return normalize(values[0]) if values else None


# Spark UDF: first entry of an array column (used on an author's `ids`),
# whitespace-normalized; null or empty arrays map to null.
take_id = F.udf(_first_or_none, StringType())

# Spark UDF: for an array of arrays (used on `authors.ids`), the first entry of
# each inner array. A null outer array maps to null; a null or empty inner
# array contributes a null element.
take_authors_ids = F.udf(
    lambda x: None if x is None else [_first_or_none(el) for el in x],
    ArrayType(StringType()),
)

# Spark UDF: whitespace normalization of a string column (nulls pass through).
norm_string = F.udf(normalize, StringType())


def get_pdf(pdf_list):
    """Return the first URL in `pdf_list` that ends with ".pdf", or None.

    Args:
        pdf_list: Iterable of URL strings. May be None or empty (a null
            column value reaches a Python UDF as None). Entries that are not
            strings are skipped instead of raising.

    Returns:
        The first entry ending in ".pdf" (case-sensitive), or None when there
        is no such entry.
    """
    if not pdf_list:
        return None
    # next() stops at the first match instead of building the full filtered list.
    return next(
        (pdf for pdf in pdf_list if isinstance(pdf, str) and pdf.endswith(".pdf")),
        None,
    )


# Spark UDF wrapper around `get_pdf`, producing a string column.
get_first_pdf = F.udf(f=get_pdf, returnType=StringType())


### Define directories

In [101]:
# Input location passed to spark.read.json (kept as a plain string).
dir_data = "/export/ml4ds/semanticSC/test/"
# Root output directory; the parquet/ and pdfs/ outputs are written beneath it.
dir_out = Path("/export/ml4ds/semanticSC/")

### Read data files

In [46]:
# Load the JSON data found at dir_data into a DataFrame. `spark` is not created
# in the visible cells, so it is presumably the kernel-provided SparkSession
# -- TODO confirm.
df = spark.read.json(dir_data)
# Show the number of loaded rows as a quick sanity check.
df.count()

                                                                                

65981

### Save authors

In [14]:
# One row per entry of each paper's `authors` array.
authors_exploded = df.select(F.explode("authors").alias("authors"))

# Keep a single normalized id and the normalized name per author, then
# de-duplicate on the id and discard authors without one.
df_authors = (
    authors_exploded.select(
        take_id("authors.ids").alias("id"),
        norm_string("authors.name").alias("name"),
    )
    .drop_duplicates(subset=["id"])
    .dropna(subset=["id"])
)

authors_parquet = dir_out.joinpath("parquet/authors.parquet").as_posix()
df_authors.write.parquet(authors_parquet, mode="overwrite")


                                                                                

### Save papers

In [62]:
# Columns of the raw data that are kept in the papers table.
columns = [
    "id",
    "title",
    "paperAbstract",
    "s2Url",
    "pdfUrls",
    "year",
    "sources",
    "doi",
    "doiUrl",
    "pmid",
    "magId",
    "fieldsOfStudy",
    "journalName",
    "journalPages",
    "journalVolume",
    "venue",
    "inCitations",
    "outCitations",
    "authors",
]

# Look the source column types up once, instead of building a one-column
# DataFrame per column just to read its dtype.
source_dtypes = dict(df.dtypes)
string_columns = [c for c in columns if source_dtypes[c] == "string"]

# Replace the array of author structs by the array of their first ids.
df_papers = df.select(columns).withColumn(
    "authors", take_authors_ids("authors.ids")
)
# Whitespace-normalize every column that is a plain string in the source data.
for c in string_columns:
    df_papers = df_papers.withColumn(c, norm_string(c))

df_papers.write.parquet(
    dir_out.joinpath("parquet/papers.parquet").as_posix(),
    mode="overwrite",
)


### Download PDFs

In [100]:
# For every paper keep its id and the first URL ending in ".pdf"; papers
# without such a URL get a null there and are removed by the length filter.
first_pdf_url = get_first_pdf("pdfUrls").alias("pdfUrls")
pdf_urls = df.select("id", first_pdf_url).filter(F.length("pdfUrls") > 0)
# pdf_urls.show(5, truncate=False)

def download_pdf(x, timeout=30):
    """Download the PDF referenced by one row to pdfs/<id>.pdf under dir_out.

    A failed download (connection error, timeout, HTTP error status, or a
    file-system error) is logged and skipped, so one bad URL does not abort
    the whole job.

    Args:
        x: Row-like object exposing "id" and "pdfUrls" by key.
        timeout: Seconds passed to `requests.get` as its timeout. Without it
            a silent server can block the call forever.

    Returns:
        None.
    """
    import logging  # local import keeps this cell self-contained

    target = dir_out.joinpath(f"pdfs/{x['id']}.pdf")
    # Download to a sibling ".part" file first so an interrupted transfer
    # never leaves a truncated file under the final name.
    partial = target.with_name(target.name + ".part")
    try:
        # The pdfs/ directory is not created by any other visible cell.
        target.parent.mkdir(parents=True, exist_ok=True)
        # The context manager releases the connection even on errors.
        with requests.get(x["pdfUrls"], stream=True, timeout=timeout) as r:
            # Do not store HTML error pages as if they were PDFs.
            r.raise_for_status()
            with partial.open("wb") as f:
                # Stream in chunks rather than loading the body into memory.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        partial.replace(target)
    except (requests.RequestException, OSError) as err:
        logging.getLogger("download_pdf").warning(
            "Skipping %s (%s): %s", x["id"], x["pdfUrls"], err
        )
        if partial.exists():
            partial.unlink()


# Runs download_pdf once per row of pdf_urls.
pdf_urls.foreach(download_pdf)


+----------------------------------------+-----------------------------------------------------------------------------------------------------------------+
|id                                      |pdfUrls                                                                                                          |
+----------------------------------------+-----------------------------------------------------------------------------------------------------------------+
|e15212675135c612e335436e7a3f96b3b1c3e5b0|https://researchmgt.monash.edu/ws/portalfiles/portal/283970231/283970065_oa.pdf                                  |
|2c04fba10f6ccbd1d0d990862e4b6ae60e2c92c1|https://static-content.springer.com/esm/art:10.1186%2Fs13148-015-0077-1/MediaObjects/13148_2015_77_MOESM6_ESM.pdf|
|9e93efb038e37a50e0cd846a48eabd8a61382a74|http://circ.ahajournals.org/content/circulationaha/90/6/3070.full.pdf                                            |
|7f3155dbc72404cd86bc070ac172de5e918f6251|http://www.dmi.u