In [0]:
%sh
pip install spacy
pip install habanero
python -m spacy download en_core_web_sm

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-142d7af9-eafc-4bac-95b1-1e0a2d13776d/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-142d7af9-eafc-4bac-95b1-1e0a2d13776d/bin/python -m pip install --upgrade pip' command.
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-142d7af9-eafc-4bac-95b1-1e0a2d13776d/bin/python -m pip install --upgrade pip' command.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [0]:
dbutils.library.restartPython()

In [0]:
from pyspark.sql.types import StringType, IntegerType, ArrayType
import time
import pyspark.sql.functions as F
from pyspark.sql import Window
import requests
import json
import urllib.parse
import spacy
from habanero import Crossref

#### Database Schema

<img src="https://i.ibb.co/hRqLR8d/Schema.png" alt="Schema" border="0">

In [0]:
# use spark.streams.active to loop over all active streams
# remember to stop streams if not working on them anymore

for stream in spark.streams.active:
    stream.stop()

In [0]:
raw_df_schema = "_id: STRING, abstract: STRING, authors: ARRAY<STRUCT<_id: STRING, bio: STRING, email: STRING, gid: STRING, name: STRING, name_zh: STRING, oid: STRING, oid_zh: STRING, orcid: STRING, org: STRING, org_zh: STRING, orgid: STRING, orgs: ARRAY<STRING>, orgs_zh: ARRAY<STRING>, sid: STRING>>, doi: STRING, fos: ARRAY<STRING>, isbn: STRING, issn: STRING, issue: STRING, keywords: ARRAY<STRING>, lang: STRING, n_citation: STRING, page_end: STRING, page_start: STRING, pdf: STRING, references: ARRAY<STRING>, title: STING, url: ARRAY<STRING>, venue: STRUCT<_id: STRING, issn: STRING, name: STRING, name_d: STRING, name_s: STRING, online_issn: STRING, publisher: STRING, raw: STRING, raw_zh: STRING, sid: STRING, src: STRING, t: STRING, type: STRING>, volume: STRING, year: STRING"

raw_df_stream = (spark.readStream
                 .option("maxFilesPerTrigger", 1)
                 .format("delta")
                 .load("/user/hive/warehouse/scientific_publications"))
#display(raw_df_stream)

In [0]:
running_count_df = raw_df_stream.agg(F.count("*"))

display(running_count_df)

count(1)
250000


In [0]:
#Filter out titles that have less than 2 words
filtered_df = raw_df_stream.withColumn("title_word_count", F.size(F.split("title", " "))).filter("title_word_count > 1")
# filtered_df = filtered_df.filter(F.col("doi").isNull())
# filtered_df = filtered_df.filter("n_citation > 3")
#display(filtered_df)

## Authors

In [0]:
def get_author_from_dblp(title, rank):
    URL = "http://dblp.org/search/publ/api?q=" + title.replace(" ", "+") + "&format=json"
    try:
        r = requests.get(url = URL)
        data = r.json()
        return data['result']['hits']['hit'][0]['info']['authors']['author'][rank-1]['text']
    except:
        return None
      
get_author_from_dblp_UDF = udf(get_author_from_dblp)

In [0]:
authors = (filtered_df.limit(100).select("doi", "authors", "title", F.posexplode(F.col("authors")).alias("rank", "authors_exp"))
            .withColumn("rank", F.col("rank") + 1)
            .select("authors_exp.*","*")
            .select("doi", "rank", "name", "title")
            .withColumn("name", F.initcap(F.col("name")))
            )

authors2 = authors.filter(F.col("name").rlike(r"^\p{L}\.?(-\p{L}\.?)?\s.+"))
"""authors_rdd = authors2.rdd.map(lambda x: (x[0], x[1], x[2], x[3], get_author_from_dblp(x[3], x[1])))
authors2 = authors_rdd.toDF(["doi", "rank", "name", "title", "dblp_name"])"""

authors2 = (authors2
            .withColumn("dblp_name", get_author_from_dblp_UDF(F.col("title"), F.col("rank")))
            .withColumn("dblp_name", F.when(F.col("dblp_name") == "null", F.col("name")).otherwise(F.col("dblp_name")))
            .withColumn("dblp_name", F.regexp_extract(F.col("dblp_name"), r"^(\D+)(\s\d\d\d\d)?$", 1)))

authors_raw = (authors.withColumn("dblp_name", F.col("name"))
           .union(authors2)
           .withColumn("last_name_raw", F.regexp_extract(F.col("name"), r"^.+\s(\S+)(\sJr\.)?$", 1))
           .withColumn("dblp_name", F.when(F.col("dblp_name") == "null", F.col("name")).otherwise(F.col("dblp_name")))
           .withColumn("dblp_name", F.regexp_extract(F.col("dblp_name"), r"^(\D+)(\s\d\d\d\d)?$", 1))
           .withColumn("dblp_last_name", F.regexp_extract(F.col("dblp_name"), r"^.+\s(\S+)(\sJr\.)?$", 1))
           .withColumn("name", F.when(F.col("last_name_raw") == F.col("dblp_last_name"), 
                                      F.col("dblp_name")).otherwise(F.col("name")))
           .withColumn("first_name", F.regexp_extract(F.col("name"), r"^(\S+)\s.+$", 1))
           .withColumn("last_name", F.regexp_extract(F.col("name"), r"^.+\s(\S+)$", 1))
           .withColumn("middle_name", F.regexp_extract(F.col("name"), r"^\S+\s(\S+)\s\S+$", 1))
          )

authors = (authors_raw.select("first_name", "last_name", "middle_name")
            .dropDuplicates()
            .withColumn("author_id", F.expr("uuid()"))
          )

authors = (authors_raw.join(authors, ["first_name", "last_name", "middle_name"])
           .select("author_id", "first_name", "last_name", "middle_name", "title", F.col("name").alias("author"), "rank")
           .dropDuplicates()
          )

display(authors)

authors_clean = authors.select("author_id", "first_name", "last_name", "middle_name")
#authors_clean.write.format("delta").mode("overwrite").saveAsTable("authors")

author_id,first_name,last_name,middle_name,title,author,rank
4150a4d1-dc34-45cf-b379-65a81ead7264,K.,Subramanian,G.,Array P Systems and t.Communication,K. G. Subramanian,1
1d313f43-e00a-4c3a-96a5-7dbb47fbe5d9,Ye,Zhang,,A 1.7mW quadrature bandpass ΔΣ ADC with 1MHz BW and 60dB DR at 1MHz IF,Ye Zhang,5
f5bfcb98-45e0-4c18-b49e-efc31c27e437,J.,Kneen,,Real time calculation of GPS flight simulated data using ARM microcontroller.,J. Kneen,1
3b98f6f9-ddd2-40dc-9805-21291c185cc9,Lei,Xi,,A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization,Lei Xi,2
0d13219e-0528-449d-9be8-4185d48f7f0d,Dana,Kulić,,Pre-collision safety strategies for human-robot interaction,Dana Kulić,1
7099772a-14d8-4b25-a06f-f07e250a208f,Marwan,Awad,,Machine Translation Errors: English and Iraqi Arabic,Marwan Awad,6
b2ed331d-061b-45a4-8a1d-608bd8a1d079,Yifan,Wang,,A 1.7mW quadrature bandpass ΔΣ ADC with 1MHz BW and 60dB DR at 1MHz IF,Yifan Wang,3
462e21b8-481c-4f5d-9e68-e60a2b8541a1,Gilles,Dequen,,Automatic parallel SAT solving using MTSS.,Gilles Dequen,3
cda31779-2f1d-4a2b-9dcc-3642904073da,Stephan,Olariu,,A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks,Stephan Olariu,2
4923c889-318c-4d4b-a67c-6a927e9a386a,Pengcheng,Shi,,Meshfree implementation of individualized active cardiac dynamics,Pengcheng Shi,5


## Organizations

In [0]:
def get_organization(name, country):
    try:
        URL = "https://api.ror.org/organizations?query=" + name.replace(" ", "+")
        r = requests.get(url = URL)
        data = r.json()
        if data['number_of_results'] == 0:
            return "No results"
        for i in range(data['number_of_results']):
            found_country = data['items'][i]["country"]["country_name"]
            if country == "" or country == found_country:
                return data['items'][i]["name"]+";"+data['items'][i]["addresses"][0]["city"]+";"+found_country
    except:
        return "No results"
      
get_organization_UDF = udf(get_organization)

In [0]:
countries = "(,|\s)(Afghanistan|Albania|Algeria|Andorra|Angola|Antigua and Barbuda|Argentina|Armenia|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bhutan|Bolivia|Bosnia and Herzegovina|Botswana|Brazil|Brunei|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Central African Republic|Chad|Chile|China|Colombia|Comoros|Democratic Republic of the Congo|Republic of the Congo|Costa Rica|Cote d'Ivoire|Croatia|Cuba|Cyprus|Czech Republic|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Fiji|Finland|France|Gabon|Gambia|Georgia|Germany|Ghana|Greece|Grenada|Guatemala|Guinea|Guinea-Bissau|Guyana|Haiti|Honduras|Hungary|Iceland|India|Indonesia|Iran|Iraq|Ireland|Israel|Italy|Jamaica|Japan|Jordan|Kazakhstan|Kenya|Kiribati|Kosovo|Kuwait|Kyrgyzstan|Laos|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|North Macedonia|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Mauritania|Mauritius|Mexico|Micronesia|Moldova|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Zealand|Nicaragua|Niger|Nigeria|North Korea|Norway|Oman|Pakistan|Palau|Palestine|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Poland|Portugal|Qatar|Romania|Russia|Rwanda|Saint Kitts and Nevis|Saint Lucia|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Korea|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Swaziland|Sweden|Switzerland|Syria|Taiwan|Tajikistan|Tanzania|Thailand|Timor-Leste|Togo|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Tuvalu|Uganda|Ukraine|United Arab Emirates|UAE|United Kingdom|UK|United States of America|USA|United States|US|Uruguay|Uzbekistan|Vanuatu|Vatican City|Venezuela|Vietnam|Yemen|Zambia|Zimbabwe)(,|\s|$)"


organization = (filtered_df.limit(100).select("authors", "title", F.explode(F.col("authors")).alias("authors_exp"))
               .select("authors_exp.*","*")
               .withColumn("author", F.col("name"))
               .select("author", "org")
               .filter(F.col("org").isNotNull())
               .dropDuplicates()
                # ., +, *, ?, ^, $, (, ), [, ], {, }, |, \
               .withColumn("strip_org", F.regexp_replace(F.col("org"), r'[\+-=#&\|><!\(\)\{\}\[\]\^"~\*\?:\\/]', " "))
               .withColumn("county", F.regexp_extract(F.col("org"), countries, 2))
               .withColumn("county", F.regexp_replace("county", "United States of America|USA|US", "United States"))
               .withColumn("county", F.regexp_replace("county", "UK", "United Kingdom"))
               .withColumn("county", F.regexp_replace("county", "UAE", "United Arab Emirates"))
               )

"""organization_rdd = organization.rdd.map(lambda x: (x[0], x[1], x[2], x[3], get_organization(x[2], x[3])))
organization_raw = (organization_rdd.toDF(["author", "org", "strip_org", "country", "api_org"])"""
organization_raw = (organization
                    .withColumn("api_org", get_organization_UDF(F.col("strip_org"), F.col("county")))
                    .withColumn("api_name", F.regexp_extract(F.col("api_org"), r"^(.+);.+;.+$", 1))
                    .withColumn("match", F.expr(r"regexp_extract(org, concat('(', api_name,')'), 0)"))
                    .withColumn("name", F.when(F.col("match") == "", F.col("org"))
                               .otherwise(F.col("api_name")))
                    .withColumn("city", F.when(F.col("match") == "", "")
                               .otherwise(F.regexp_extract(F.col("api_org"), r"^.+;(.+);.+$", 1)))
                    .withColumn("country", F.when(F.col("match") == "", F.col("county"))
                               .otherwise(F.regexp_extract(F.col("api_org"), r"^.+;.+;(.+)$", 1)))
                   )


organization = (organization_raw.select("name", "city", "country")
                .dropDuplicates()
                .withColumn("org_id", F.expr("uuid()"))
               )

organization = (organization_raw.join(organization, ["name", "city", "country"])
                .select("org_id", "name", "city", "country", "author")
               )

display(organization)

organization_clean = organization.select("org_id", "name", "city", "country")
#organization_clean.write.format("delta").mode("overwrite").saveAsTable("organization")

## Publications

In [0]:
publication_raw = filtered_df.limit(100).select("_id", "title", "volume", "n_citation", "doi", "url")

display(publication_raw)

In [0]:
def check_return_data(check_type, data, cur):
    if data is not None and check_type in data:
        return data[check_type]
    else:
        return cur
    
def update_df(df, doi_list, data_list, data_str, is_int=False):
    data_dict = dict(zip(doi_list, data_list))
    update_data = udf(lambda x: data_dict[x], IntegerType() if is_int else StringType())
    return df.withColumn(data_str, update_data(F.col('doi')))

def get_publication_data(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    url_list = df.select(F.col("url")).rdd.flatMap(lambda x: x).collect()
    title_list = df.select(F.col("title")).rdd.flatMap(lambda x: x).collect()
    citation_list = df.select(F.col("n_citation")).rdd.flatMap(lambda x: x).collect()
    volume_list = df.select(F.col("volume")).rdd.flatMap(lambda x: x).collect()
    new_volume_list = []
    n_citation_list = []
    series_list = []
    new_doi_list = []
    for i, doi in enumerate(doi_list):
        try:
            if doi == "" or doi is None:
                if "doi" in url_list[i][0]:
                    doi_req = url_list[i][0].split("org/")[-1]
                else:
                    raise Exception
            else:
                doi_req = doi
            
            new_doi_list.append(doi_req)
            response = requests.get(f"https://api.crossref.org/works/{doi_req}")
            data = response.json()['message']
        except Exception:
            new_doi_list.append(None)
            data = None

        n_citation_list.append(int(check_return_data('is-referenced-by-count', data, citation_list[i])))
        new_volume_list.append(check_return_data('volume', data, volume_list[i]))
        temp = check_return_data('container-title', data, None)
        series_list.append(temp if temp == None else temp[0])

        time.sleep(0.05)
    
    df = update_df(df, doi_list, n_citation_list, 'n_citation')
    df = update_df(df, doi_list, new_volume_list, 'volume')
    df = update_df(df, doi_list, series_list, 'series')
    df = update_df(df, doi_list, new_doi_list, 'doi')
    
    return df

In [0]:
publications_raw = get_publication_data(publication_raw).filter("n_citation > 2")
# publication = get_publication_dataget_crossref_data(publication_raw).select("_id", "title", "volume", "series", "n_citation").filter("n_citation > 2")

publications = (publications_raw.select("title", "volume", "series", "n_citation")
            .dropDuplicates()
            .withColumn("publ_id", F.expr("uuid()"))
          )

publications = (publications_raw.join(publications, ["title", "volume", "series", "n_citation"])
           .select("publ_id", "title", "volume", "series", "n_citation")
           .dropDuplicates()
          )

display(publications)
publications.write.format("delta").mode("overwrite").saveAsTable("publications")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3750788705179393>:1[0m
[0;32m----> 1[0m publications_raw [38;5;241m=[39m get_publication_data(publication_raw)[38;5;241m.[39mfilter([38;5;124m"[39m[38;5;124mn_citation > 2[39m[38;5;124m"[39m)
[1;32m      2[0m [38;5;66;03m# publication = get_publication_dataget_crossref_data(publication_raw).select("_id", "title", "volume", "series", "n_citation").filter("n_citation > 2")[39;00m
[1;32m      4[0m publications [38;5;241m=[39m (publications_raw[38;5;241m.[39mselect([38;5;124m"[39m[38;5;124mtitle[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mvolume[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mseries[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mn_citation[39m[38;5;124m"[39m)
[1;32m      5[0m             [38;5;241m.[39mdropDuplicates()
[1;32m      6[

## Types

In [0]:
types_raw = filtered_df.limit(100).select("_id", "volume", "issue", "doi", "url", "venue", "title")

display(types_raw)

_id,volume,issue,doi,url,venue,title
53e9987db7602d97020b8240,23,1,10.1108/10650750710720757,List(http://dx.doi.org/10.1108/10650750710720757),"List(555036d97cea80f95415f809, null, null, Oclc Systems & Services, null, null, null, OCLC Systems & Services, null, null, null, null, 0)","Building partnerships among social science researchers, institution-based repositories and domain specific data archives"
53e9987db7602d97020b8243,,,10.1007/978-3-642-14496-7_7,List(http://dx.doi.org/10.1007/978-3-642-14496-7_7),"List(555037247cea80f95417608a, null, null, International Conference on Information Theoretic Security, null, null, null, ICITS, null, null, null, null, 0)",Efficient statistical asynchronous verifiable secret sharing with optimal resilience
53e9987db7602d97020b818a,96-A,12,,List(http://search.ieice.org/bin/summary.php?id=e96-a_12_2728),"List(555036cc7cea80f95415814b, null, null, IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences, null, null, null, IEICE Transactions, null, null, null, null, 0)",On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.
53e9987db7602d97020b8292,6,,,List(http://www.jmlr.org/proceedings/papers/v6/voortman10a.html),"List(53a728e520f7420be8bbc4bb, null, null, Neural Information Processing Systems, null, null, null, NIPS Causality: Objectives and Assessment, null, null, null, null, 0)",Learning Causal Models That Make Correct Manipulation Predictions
53e9987db7602d97020b818f,13,3,10.1007/s10776-005-0024-8,"List(http://dx.doi.org/10.1007/s10776-005-0024-8, https://link.springer.com/10.1007/s10776-005-0024-8)","List(555036c47cea80f954153a3f, null, null, International Journal of Wireless Information Networks, null, null, null, IJWIN, null, null, null, null, 0)",Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks
53e9987db7602d97020b8196,,,10.1109/HICSS.2003.1174852,"List(http://dx.doi.org/10.1109/HICSS.2003.1174852, http://computer.org/proceedings/hicss/1874/track9/187490302babs.htm, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1174852)","List(53a724b320f7420be8b37f4c, null, null, Hawaii International Conference on System Sciences, null, null, null, HICSS, null, null, null, null, 0)",A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks
53e9987db7602d97020b82ba,27,3,,"List(http://doi.acm.org/10.1145/182119.1096164, https://doi.org/10.1145/182119.1096164, http://doi.org/10.1145/182119.1096164, https://dblp.uni-trier.de/db/journals/sigir/sigir27.html#Can93)","List(53a72a9120f7420be8c056af, null, null, null, null, null, null, SIGIR Forum, null, null, null, null, 0)","Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)"
53e9987db7602d97020b82c6,5894,,10.1007/978-3-642-10406-0_3,"List(http://dx.doi.org/10.1007/978-3-642-10406-0_3, http://www.webofknowledge.com/)","List(53a72bd420f7420be8c3282b, null, null, null, null, null, null, NET-COOP, null, null, null, null, 0)",Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling
53e9987db7602d97020b81ba,11,8,10.1016/j.asoc.2011.03.025,"List(http://dx.doi.org/10.1016/j.asoc.2011.03.025, http://www.webofknowledge.com/)","List(555036b67cea80f95414b7c5, null, null, null, null, null, null, Appl. Soft Comput., null, null, null, null, 0)",A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization
53e9987db7602d97020b83e4,,,,List(),"List(53a7278720f7420be8b92c2a, null, null, International Conference on Lightning Protection, null, null, null, ICLP, null, null, null, null, 0)",Computation Trees and Transformations of Logic Programs


In [0]:
types_pre_api = types_raw.withColumn("type", F.when(F.col("venue.raw").contains("@"), "workshop")
                   .when(((F.col("volume").isNotNull()) & (F.col("volume") != "")) | ((F.col("issue").isNotNull()) & (F.col("issue") != "")), "journal-article")
                   .otherwise("conference paper"))

display(types_pre_api)

_id,volume,issue,doi,url,venue,title,type
53e9987db7602d97020b8240,23,1,10.1108/10650750710720757,List(http://dx.doi.org/10.1108/10650750710720757),"List(555036d97cea80f95415f809, null, null, Oclc Systems & Services, null, null, null, OCLC Systems & Services, null, null, null, null, 0)","Building partnerships among social science researchers, institution-based repositories and domain specific data archives",journal-article
53e9987db7602d97020b8243,,,10.1007/978-3-642-14496-7_7,List(http://dx.doi.org/10.1007/978-3-642-14496-7_7),"List(555037247cea80f95417608a, null, null, International Conference on Information Theoretic Security, null, null, null, ICITS, null, null, null, null, 0)",Efficient statistical asynchronous verifiable secret sharing with optimal resilience,conference paper
53e9987db7602d97020b818a,96-A,12,,List(http://search.ieice.org/bin/summary.php?id=e96-a_12_2728),"List(555036cc7cea80f95415814b, null, null, IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences, null, null, null, IEICE Transactions, null, null, null, null, 0)",On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.,journal-article
53e9987db7602d97020b8292,6,,,List(http://www.jmlr.org/proceedings/papers/v6/voortman10a.html),"List(53a728e520f7420be8bbc4bb, null, null, Neural Information Processing Systems, null, null, null, NIPS Causality: Objectives and Assessment, null, null, null, null, 0)",Learning Causal Models That Make Correct Manipulation Predictions,journal-article
53e9987db7602d97020b818f,13,3,10.1007/s10776-005-0024-8,"List(http://dx.doi.org/10.1007/s10776-005-0024-8, https://link.springer.com/10.1007/s10776-005-0024-8)","List(555036c47cea80f954153a3f, null, null, International Journal of Wireless Information Networks, null, null, null, IJWIN, null, null, null, null, 0)",Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks,journal-article
53e9987db7602d97020b8196,,,10.1109/HICSS.2003.1174852,"List(http://dx.doi.org/10.1109/HICSS.2003.1174852, http://computer.org/proceedings/hicss/1874/track9/187490302babs.htm, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1174852)","List(53a724b320f7420be8b37f4c, null, null, Hawaii International Conference on System Sciences, null, null, null, HICSS, null, null, null, null, 0)",A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks,conference paper
53e9987db7602d97020b82ba,27,3,,"List(http://doi.acm.org/10.1145/182119.1096164, https://doi.org/10.1145/182119.1096164, http://doi.org/10.1145/182119.1096164, https://dblp.uni-trier.de/db/journals/sigir/sigir27.html#Can93)","List(53a72a9120f7420be8c056af, null, null, null, null, null, null, SIGIR Forum, null, null, null, null, 0)","Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)",journal-article
53e9987db7602d97020b82c6,5894,,10.1007/978-3-642-10406-0_3,"List(http://dx.doi.org/10.1007/978-3-642-10406-0_3, http://www.webofknowledge.com/)","List(53a72bd420f7420be8c3282b, null, null, null, null, null, null, NET-COOP, null, null, null, null, 0)",Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling,journal-article
53e9987db7602d97020b81ba,11,8,10.1016/j.asoc.2011.03.025,"List(http://dx.doi.org/10.1016/j.asoc.2011.03.025, http://www.webofknowledge.com/)","List(555036b67cea80f95414b7c5, null, null, null, null, null, null, Appl. Soft Comput., null, null, null, null, 0)",A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization,journal-article
53e9987db7602d97020b83e4,,,,List(),"List(53a7278720f7420be8b92c2a, null, null, International Conference on Lightning Protection, null, null, null, ICLP, null, null, null, null, 0)",Computation Trees and Transformations of Logic Programs,conference paper


In [0]:
def check_return_data(check_type, data, cur):
    if data is not None and check_type in data:
        return data[check_type]
    else:
        return cur
    
def update_df(df, doi_list, data_list, data_str, is_int=False):
    new_df = spark.createDataFrame(zip(df.select("doi").rdd.flatMap(lambda x: x).collect(), data_list), ["doi", data_str])
    return df.join(new_df, on="doi", how="left").select(new_df["doi"], new_df[data_str], F.col("title"))

def get_type_data(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    url_list = df.select(F.col("url")).rdd.flatMap(lambda x: x).collect()
    type_list = df.select(F.col("type")).rdd.flatMap(lambda x: x).collect()
    new_type_list = []
    for i, doi in enumerate(doi_list):
        try:
            if doi == "" or doi is None:
                if "doi" in url_list[i][0]:
                    doi_req = url_list[i][0].split("org/")[-1]
                else:
                    raise Exception
            else:
                doi_req = doi
            
            response = requests.get(f"https://api.crossref.org/works/{doi_req}")
            data = response.json()['message']
        except Exception:
            data = None

        new_type_list.append(check_return_data('type', data, type_list[i]))

        time.sleep(0.05)

    df = update_df(df, doi_list, new_type_list, 'type')
    
    return df

In [0]:
types_raw = get_type_data(types_pre_api)

types = (types_raw.select("type")
            .dropDuplicates()
            .withColumn("type_id", F.expr("uuid()"))
          )

types = (types_raw.join(types, ["type"])
           .select("type_id", "type", "title")
           .dropDuplicates()
        )

display(types)

types_clean = types.select("type_id", "type")
types_clean.write.format("delta").mode("overwrite").saveAsTable("types")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3750788705179398>:1[0m
[0;32m----> 1[0m types_raw [38;5;241m=[39m get_type_data(types_pre_api)
[1;32m      3[0m types [38;5;241m=[39m (types_raw[38;5;241m.[39mselect([38;5;124m"[39m[38;5;124mtype[39m[38;5;124m"[39m)
[1;32m      4[0m             [38;5;241m.[39mdropDuplicates()
[1;32m      5[0m             [38;5;241m.[39mwithColumn([38;5;124m"[39m[38;5;124mtype_id[39m[38;5;124m"[39m, F[38;5;241m.[39mexpr([38;5;124m"[39m[38;5;124muuid()[39m[38;5;124m"[39m))
[1;32m      6[0m           )
[1;32m      8[0m types [38;5;241m=[39m (types_raw[38;5;241m.[39mjoin(types, [[38;5;124m"[39m[38;5;124mtype[39m[38;5;124m"[39m])
[1;32m      9[0m            [38;5;241m.[39mselect([38;5;124m"[39m[38;5;124mtype_id[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5

## Keywords

In [0]:
keywords_raw = filtered_df.limit(100).select("_id", "keywords", "doi", "title")

display(keywords_raw)

_id,keywords,doi,title
53e9987db7602d97020b8240,"List(design methodology, information flow, data storage, social science, preprint, digital library, life cycle, digital repository)",10.1108/10650750710720757,"Building partnerships among social science researchers, institution-based repositories and domain specific data archives"
53e9987db7602d97020b8243,"List(optimal resilience, efficient statistical asynchronous, statistical avss protocol, avss protocol, a-cast communication, verifiable secret sharing, a-casts o, private communication, multiplication gate, communication complexity, share l, statistical ampc, finite field, secret sharing)",10.1007/978-3-642-14496-7_7,Efficient statistical asynchronous verifiable secret sharing with optimal resilience
53e9987db7602d97020b818a,"List(mean squared error, orthogonal matching pursuit, compressed sensing)",,On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.
53e9987db7602d97020b8292,List(causal models),,Learning Causal Models That Make Correct Manipulation Predictions
53e9987db7602d97020b818f,"List(Packet radio networks, multimedia routing protocols, frequency-hop spread spectrum, ad hoc wireless networks)",10.1007/s10776-005-0024-8,Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks
53e9987db7602d97020b8196,"List(bandwidth allocation, mobile satellite communication, multimedia communication, network topology, quality of service, telecommunication congestion control, telecommunication network management, QoS, bandwidth utilization, call blocking probability, call dropping probability, handoff management, multimedia LEO satellite networks, multimedia connections, network topology, predictive bandwidth allocation, simulation, two-cell-lookahead call admission)",10.1109/HICSS.2003.1174852,A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks
53e9987db7602d97020b82ba,"List(information retrieval, data structure)",,"Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)"
53e9987db7602d97020b82c6,"List(traffic variation, constant bit rate flow, original contribution, performance evaluation, continuous distribution, quasi-stationary modelling, quasi-stationary approach, traffic performance, simple case, flow peak rate, multi-rate aspect, multi-rate streaming traffic, simulation, constant bit rate)",10.1007/978-3-642-10406-0_3,Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling
53e9987db7602d97020b81ba,"List(ant colony optimization, biomedical data, generalization ability, biomedicine community, biomedical classification, ensemble learning, existing technique, higher prediction performance, trained component classifier, novel ensemble algorithm, ensemble approach, rough set)",10.1016/j.asoc.2011.03.025,A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization
53e9987db7602d97020b83e4,List(),,Computation Trees and Transformations of Logic Programs


In [0]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def is_city_location_person(keyword):
    # Process the keyword using the spaCy NER model
    doc = nlp(keyword)
    
    # Check if any of the entities recognized by the model are of type GPE (city or location) or PERSON
    for ent in doc.ents:
        if ent.label_ in ['GPE', 'PERSON']:
            return True
    
    return False

def update_df(df, doi_list, data_list, data_str, is_int=False):
    data_dict = dict(zip(doi_list, data_list))
    update_data = udf(lambda x: data_dict[x], IntegerType() if is_int else StringType())
    return df.withColumn(data_str, update_data(F.col('doi')))

def check_keywords(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    keywords_list = df.select(F.col("keywords")).rdd.flatMap(lambda x: x).collect()
    new_keywords_list = []
    for i, keys in enumerate(keywords_list):
        temp = []
        for k in keys:
            if not is_city_location_person(k):
                temp.append(k)
        new_keywords_list.append(temp)
        
    df = update_df(df, doi_list, keywords_list, 'keywords')
    
    return df

In [0]:
keywords_raw = check_keywords(keywords_raw).filter(F.size(keywords_raw.keywords) > 0)


keywords = (keywords_raw.select("keywords")
            .dropDuplicates()
            .withColumn("keyw_id", F.expr("uuid()"))
          )

keywords = (keywords_raw.join(keywords, ["keywords"])
           .select("keyw_id", "keywords", "title")
           .dropDuplicates()
        )

display(keywords)

keywords_clean = keywords.select("keyw_id", "keywords")
keywords_clean.write.format("delta").mode("overwrite").saveAsTable("keywords")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3750788705179402>:1[0m
[0;32m----> 1[0m keywords_raw [38;5;241m=[39m check_keywords(keywords_raw)[38;5;241m.[39mfilter(F[38;5;241m.[39msize(keywords_raw[38;5;241m.[39mkeywords) [38;5;241m>[39m [38;5;241m0[39m)
[1;32m      4[0m keywords [38;5;241m=[39m (keywords_raw[38;5;241m.[39mselect([38;5;124m"[39m[38;5;124mkeywords[39m[38;5;124m"[39m)
[1;32m      5[0m             [38;5;241m.[39mdropDuplicates()
[1;32m      6[0m             [38;5;241m.[39mwithColumn([38;5;124m"[39m[38;5;124mkeyw_id[39m[38;5;124m"[39m, F[38;5;241m.[39mexpr([38;5;124m"[39m[38;5;124muuid()[39m[38;5;124m"[39m))
[1;32m      7[0m           )
[1;32m      9[0m keywords [38;5;241m=[39m (keywords_raw[38;5;241m.[39mjoin(keywords, [[38;5;124m"[39m[38;5;124mkeywords[39m[38;5;1

## Venue

In [0]:
# https://pypi.org/project/habanero/
from habanero import Crossref
cr = Crossref()

def getVenue(doi, venue):
    result = [venue['name_d'], None]
    try:
        query = cr.works(ids = doi)['message']['event']
        result[0] = query['name']
        result[1] = query['location']
    except:
        pass
    return result

getVenueUDF = udf(getVenue, ArrayType(StringType()))

In [0]:
venues_df = (filtered_df
            .limit(100)
            .select('title', "doi", 'venue')
            .withColumn("VENUE", getVenueUDF(F.col("doi"), F.col('venue')))
            .select('title',
                    F.col("VENUE")[0].alias("venue"),
                    F.col("VENUE")[1].alias("location")
                   )
             .dropna(subset="venue")
           )

venues = (venues_df.select("venue", "location")
            .dropDuplicates()
            .withColumn("venue_id", F.expr("uuid()"))
          )

venues = (venues_df.join(venues, ["venue", "location"])
           .select("venue_id", "venue", "location", "title")
           .dropDuplicates()
        )

display(venues)
venues_clean = venues.select("venue_id", "venue")
#venues_clean.write.format("delta").mode("overwrite").saveAsTable("venues")

venue_id,venue,location,title
4664d0f7-462b-4f54-9f68-91c484d8d824,IGARSS 2011 - 2011 IEEE International Geoscience and Remote Sensing Symposium,"Vancouver, BC, Canada",Automated micro-landform classification by combination of satellite images and SRTM DEM.
eea35599-2faa-4da0-964f-8bbc039798e8,ICMI '08: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERFACES,Chania Crete Greece,Manipulating trigonometric expressions encodedthrough electro-tactile signals
df6f57ec-394c-4f00-a781-4ede2901eddf,Sixth International Conference on Intelligent Systems Design and Applications],"Jian, China",Pipeline Damage and Leak Detection Based on Sound Spectrum LPCC and HMM
3c1220c3-df7a-4476-bdef-717e9a89d969,"2009 IEEE International Conference on Systems, Man and Cybernetics - SMC","San Antonio, TX, USA",Extracting spatial semantics in association rules for ocean image retrieval
5e6f4dda-45e6-4fb4-ac7a-14cb1c570138,Eleventh International Conference on Computer Communications and Networks,"Miami, FL, USA",Efficient broadcast with forward node set in clustered mobile ad hoc networks
1ec3dd47-0f2e-4092-a644-182cded2c05d,2007 4th IEEE International Symposium on Biomedical Imaging: From Nano to Macro,"Arlington, VA, USA",AN EFFECTIVE AND EFFICIENT TECHNIQUE FOR SEARCHING FOR SIMILAR BRAIN ACTIVATION PATTERNS
29b6639e-84c0-4256-aabc-f39c9c0f396c,2006 IEEE International Conference on Communications,Istanbul,Joint Bandwidth Allocation and Connection Admission Control for Polling Services in IEEE 802.16 Broadband Wireless Networks.
1814cd71-318e-4bb6-875f-f6c9aee2e3c9,GLOBECOM 2012 - 2012 IEEE Global Communications Conference,"Anaheim, CA, USA",TIS: A threshold incentive scheme for secure and reliable data forwarding in vehicular Delay Tolerant Networks
aaff608c-7340-42f3-b13d-9e1a2688bf00,the 13th conference,"Helsinki, Finland",Centering theory and the Italian pronominal system
6fb272c9-7cd5-4879-b527-8c9fd5112cca,Simulation (HPCS),"Leipzig, Germany",Automatic parallel SAT solving using MTSS.


## FieldOfStudy

In [0]:
def getFos(doi, fos):
    result = [] if not fos else fos
    query = []
    try:
        query = cr.works(ids = doi)['message']['subject']
    except:
        pass
    return [*result, *query]

getFosUDF = udf(getFos, ArrayType(StringType()))

In [0]:
fos_df = (filtered_df
          .limit(100)
          .select('title', "doi", 'fos')
          .withColumn("FOS", getFosUDF(F.col("doi"), F.col("fos")))
          .select('title', F.col('FOS').alias('fos'))
          .filter(F.size('fos') > 0)
           )

fos = (fos_df.select("fos")
            .dropDuplicates()
            .withColumn("fos_id", F.expr("uuid()"))
          )

fos = (fos_df.join(fos, ["fos"])
           .select("fos_id", "fos", "title")
           .dropDuplicates()
        )

display(fos)
fos_clean = fos.select("fos_id", "fos")
#fos_clean.write.format("delta").mode("overwrite").saveAsTable("fos")

fos_id,fos,title
a4a2d7d8-f56f-4581-bd7f-1ba8426c6e9a,"List(Information system, Facial recognition system, Data mining, Receiver operating characteristic, Authentication, Computer science, Fingerprint recognition, Word error rate, Fingerprint, Speech recognition, Biometrics)",An Empirical Study of Multi-mode Biometric Systems Using Face and Fingerprint
6c49e081-db37-41e1-8033-a6ba2b860ac5,"List(Aromaticity, Electron counting, Crystallography, Planarity testing, Computational chemistry, Boron, Atom, Chemistry, Antiaromaticity, Chemical bond, Delocalized electron, Computational Mathematics, General Chemistry)",Comprehensive analysis of chemical bonding in boron clusters
fded6754-9ba9-4344-99e3-a42957c5b6fe,"List(Resource management, Middleware, Metadata, Virtual machine, Computer science, Resource allocation, Software architecture, Operating system, Message passing, Distributed computing, Cloud computing)",Designing a Middleware API for Building Private IaaS Cloud Architectures
d4f27608-f5f8-428c-88f4-fa1b5cd73d04,"List(State observer, Observability, State vector, Algorithm design, Linear system, Control theory, Steady state, Observer (quantum physics), Mathematics)",Global hierarchical observer for linear systems with unknown inputs
5fada3ea-52c1-4370-97b4-ce8778d0d95c,"List(Mathematical optimization, Computer science, Theoretical computer science, Optimal allocation, Distributed database, Quadratic programming, Applied Mathematics, Computational Mathematics, Computer Networks and Communications, Software)",A Quadratic Programming Model for Optimal Data Distribution
75750147-57e2-440e-a7ce-79030d07334c,"List(Asynchronous communication, Gigabit, Computer science, Computer network, Quality of service, Fault tolerance, Ethernet, Local area network, Dynamic bandwidth allocation, Time division multiple access, Distributed computing, Computer Networks and Communications, Hardware and Architecture, Media Technology, Software)",Dynamic Resource Allocation for Multimedia Document Retrieval over High Speed LANs
061d6dce-46de-4a56-b6d0-b7c2042ec99f,"List(Mobile ad hoc network, Multimedia Broadcast Multicast Service, Broadcast domain, Atomic broadcast, Computer science, Computer network, Optimized Link State Routing Protocol, Wireless ad hoc network, Broadcast radiation, Distributed computing, Broadcast communication network)",Efficient broadcast with forward node set in clustered mobile ad hoc networks
8175cebe-c4f0-4f01-bfd2-5d63a68dff2c,"List(Ontology (information science), Ontology-based data integration, Ontology, Annotation, Information retrieval, Computer science)","From chaos to order: A generic, distributed, ontology based annotation system"
09a66c7c-9c95-4037-81e1-f92fad3c7c32,"List(Noise floor, Noise measurement, Noise (signal processing), Computer science, Control theory, Salt-and-pepper noise, Noise figure, Noise temperature, Effective input noise temperature, Gaussian noise)",An adaptive noise canceller with adaptive delay compensation for a distant noise source.
c16bbff6-be81-4621-86be-f46ba5845c12,"List(Pronoun, Computer science, Utterance, Phenomenon, Linguistics, Functional role)",Centering theory and the Italian pronominal system


## Date

In [0]:
def getDate(doi, year):
    formated = [None, None, None]
    try:
        query = cr.works(ids = doi)['message']['issued']['date-parts'][0]
        for i in [0, 1, 2]:
            formated[i] = query[i]
    except:
        pass
    if formated[0] == None:
        formated[0] = year 
    return formated

getDateUDF = udf(getDate, ArrayType(StringType()))

In [0]:
dates_df = (filtered_df
            .limit(100)
            .select('title', "year", "doi")
            .withColumn("Date", getDateUDF(F.col("doi"), F.col("year")))
            .select('title',
                    F.col("Date")[2].alias("day"),
                    F.col("Date")[1].alias("month"),
                    F.col("Date")[0].alias("year")
                   )
           )

dates = (dates_df.select([F.col("day").alias("day1"), F.col("month").alias("month1"), F.col("year").alias("year1")])
            .dropDuplicates()
            .withColumn("date_id", F.expr("uuid()"))
          )

dates = (dates_df.join(dates, ((dates_df.day.eqNullSafe(dates.day1)) & 
                               (dates_df.month.eqNullSafe(dates.month1)) &
                               (dates_df.year.eqNullSafe(dates.year1))))
           .select("date_id", "day", "month", "year", "title")
           .dropDuplicates()
        )

display(dates)

dates_clean = dates.select("date_id", "day", "month", "year")
#dates_clean.write.format("delta").mode("overwrite").saveAsTable("dates")

date_id,day,month,year,title
623f37fa-e75d-4323-ab82-55c27050517e,,,2010,Efficient statistical asynchronous verifiable secret sharing with optimal resilience
6b24ec2e-130a-4e7f-af6c-e3bc68f6ce1d,,,1990,Centering theory and the Italian pronominal system
623f37fa-e75d-4323-ab82-55c27050517e,,,2010,A Practical Subspace Approach To Landmarking.
0e7f5751-873e-4bfa-8d4b-80c96a729970,,,2009,Array P Systems and t.Communication
bada7408-7a58-465c-aacd-44e2ba49544a,,,2013,From republicans to teenagers --- group membership and search (GRUMPS)
b0ab1457-bd38-4927-8465-1047e3968bf3,,,1989,A new interactive protein sequence alignment program and comparison of its results with widely used algorithms.
0e7f5751-873e-4bfa-8d4b-80c96a729970,,,2009,A Study on Characteristics of Software Vendors in Japan: from Environmental Threats and Resource-Based View.
ccadf96e-f62a-4875-b4f9-cb099b29f30b,,,2002,Efficient broadcast with forward node set in clustered mobile ad hoc networks
b50d468a-e78f-491c-90a3-10612fc8d2ff,,9.0,2004,Makespan minimization subject to flowtime optimality on identical parallel machines
82fe669a-0350-4d17-b3a9-8821fadceb1b,,9.0,1988,Some nonstandard methods in combinatorial number theory


## Language

In [0]:
lang_df = filtered_df.select("title", "lang")

lang = (lang_df.select("lang")
            .dropDuplicates()
            .withColumn("lang_id", F.expr("uuid()"))
          )

lang = (lang_df.join(lang, ["lang"])
           .select("lang_id", "lang", "title")
           .dropDuplicates()
        )

display(lang)

lang_clean = lang.select("lang_id", "lang")
#lang_clean.write.format("delta").mode("overwrite").saveAsTable("lang")

## MAIN TABLE

In [0]:
main_table = (publications
              .join(authors, ["title"])
              .join(organization, ["author"], "left")
              .join(types, ["title"])
              .join(keywords, ["title"])
              .join(venues, ["title"])
              .join(fos, ["title"])
              .join(dates, ["title"])
              .join(lang, ["title"])
              .select("publ_id", "author_id", "org_id", "type_id", "keyw_id", "venue_id", "fos_id", "date_id", "lang_id", F.col("rank").alias("author_rank"))
              )
display(main_table)
main_table.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("DBLP_fact_table")

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-3750788705179409>:1[0m
[0;32m----> 1[0m main_table [38;5;241m=[39m (publications
[1;32m      2[0m               [38;5;241m.[39mjoin(authors, [[38;5;124m"[39m[38;5;124mtitle[39m[38;5;124m"[39m])
[1;32m      3[0m               [38;5;241m.[39mjoin(organization, [[38;5;124m"[39m[38;5;124mauthor[39m[38;5;124m"[39m], [38;5;124m"[39m[38;5;124mleft[39m[38;5;124m"[39m)
[1;32m      4[0m               [38;5;241m.[39mjoin(types, [[38;5;124m"[39m[38;5;124mtitle[39m[38;5;124m"[39m])
[1;32m      5[0m               [38;5;241m.[39mjoin(keywords, [[38;5;124m"[39m[38;5;124mtitle[39m[38;5;124m"[39m])
[1;32m      6[0m               [38;5;241m.[39mjoin(venues, [[38;5;124m"[39m[38;5;124mtitle[39m[38;5;124m"[39m])
[1;32m      7[0m               [38;5;241m