In [0]:
%sh
pip install spacy
pip install habanero
python -m spacy download en_core_web_sm

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-1772fb82-d923-4ab4-a162-d948c264c5a1/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-1772fb82-d923-4ab4-a162-d948c264c5a1/bin/python -m pip install --upgrade pip' command.
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-1772fb82-d923-4ab4-a162-d948c264c5a1/bin/python -m pip install --upgrade pip' command.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [0]:
dbutils.library.restartPython()

In [0]:
from pyspark.sql.types import StringType, IntegerType, ArrayType
import time
import pyspark.sql.functions as F
from pyspark.sql import Window
import requests
import json
import urllib.parse
import spacy
from habanero import Crossref

#### Database Schema

<img src="https://i.ibb.co/hRqLR8d/Schema.png" alt="Schema" border="0">

In [0]:
# use spark.streams.active to loop over all active streams
# remember to stop streams if not working on them anymore

for stream in spark.streams.active:
    stream.stop()

In [0]:
raw_df_schema = "_id: STRING, abstract: STRING, authors: ARRAY<STRUCT<_id: STRING, bio: STRING, email: STRING, gid: STRING, name: STRING, name_zh: STRING, oid: STRING, oid_zh: STRING, orcid: STRING, org: STRING, org_zh: STRING, orgid: STRING, orgs: ARRAY<STRING>, orgs_zh: ARRAY<STRING>, sid: STRING>>, doi: STRING, fos: ARRAY<STRING>, isbn: STRING, issn: STRING, issue: STRING, keywords: ARRAY<STRING>, lang: STRING, n_citation: STRING, page_end: STRING, page_start: STRING, pdf: STRING, references: ARRAY<STRING>, title: STING, url: ARRAY<STRING>, venue: STRUCT<_id: STRING, issn: STRING, name: STRING, name_d: STRING, name_s: STRING, online_issn: STRING, publisher: STRING, raw: STRING, raw_zh: STRING, sid: STRING, src: STRING, t: STRING, type: STRING>, volume: STRING, year: STRING"

raw_df_stream = (spark.readStream
                 .option("maxFilesPerTrigger", 1)
                 .format("delta")
                 .load("/user/hive/warehouse/scientific_publications"))
#display(raw_df_stream)

In [0]:
running_count_df = raw_df_stream.agg(F.count("*"))

display(running_count_df)

count(1)
250000


In [0]:
#Filter out titles that have less than 2 words
filtered_df = raw_df_stream.withColumn("title_word_count", F.size(F.split("title", " "))).filter("title_word_count > 1")
# filtered_df = filtered_df.filter(F.col("doi").isNull())
# filtered_df = filtered_df.filter("n_citation > 3")
#display(filtered_df)

## Authors

In [0]:
def get_author_from_dblp(title, rank):
    URL = "http://dblp.org/search/publ/api?q=" + title.replace(" ", "+") + "&format=json"
    try:
        r = requests.get(url = URL)
        data = r.json()
        return data['result']['hits']['hit'][0]['info']['authors']['author'][rank-1]['text']
    except:
        return None
      
get_author_from_dblp_UDF = udf(get_author_from_dblp)

In [0]:
authors = (filtered_df.limit(100).select("doi", "authors", "title", F.posexplode(F.col("authors")).alias("rank", "authors_exp"))
            .withColumn("rank", F.col("rank") + 1)
            .select("authors_exp.*","*")
            .select("doi", "rank", "name", "title")
            .withColumn("name", F.initcap(F.col("name")))
            )

authors2 = authors.filter(F.col("name").rlike(r"^\p{L}\.?(-\p{L}\.?)?\s.+"))
"""authors_rdd = authors2.rdd.map(lambda x: (x[0], x[1], x[2], x[3], get_author_from_dblp(x[3], x[1])))
authors2 = authors_rdd.toDF(["doi", "rank", "name", "title", "dblp_name"])"""

authors2 = (authors2
            .withColumn("dblp_name", get_author_from_dblp_UDF(F.col("title"), F.col("rank")))
            .withColumn("dblp_name", F.when(F.col("dblp_name") == "null", F.col("name")).otherwise(F.col("dblp_name")))
            .withColumn("dblp_name", F.regexp_extract(F.col("dblp_name"), r"^(\D+)(\s\d\d\d\d)?$", 1)))

authors_raw = (authors.withColumn("dblp_name", F.col("name"))
           .union(authors2)
           .withColumn("last_name_raw", F.regexp_extract(F.col("name"), r"^.+\s(\S+)(\sJr\.)?$", 1))
           .withColumn("dblp_name", F.when(F.col("dblp_name") == "null", F.col("name")).otherwise(F.col("dblp_name")))
           .withColumn("dblp_name", F.regexp_extract(F.col("dblp_name"), r"^(\D+)(\s\d\d\d\d)?$", 1))
           .withColumn("dblp_last_name", F.regexp_extract(F.col("dblp_name"), r"^.+\s(\S+)(\sJr\.)?$", 1))
           .withColumn("name", F.when(F.col("last_name_raw") == F.col("dblp_last_name"), 
                                      F.col("dblp_name")).otherwise(F.col("name")))
           .withColumn("first_name", F.regexp_extract(F.col("name"), r"^(\S+)\s.+$", 1))
           .withColumn("last_name", F.regexp_extract(F.col("name"), r"^.+\s(\S+)$", 1))
           .withColumn("middle_name", F.regexp_extract(F.col("name"), r"^\S+\s(\S+)\s\S+$", 1))
          )

authors = (authors_raw.select("first_name", "last_name", "middle_name")
            .dropDuplicates()
            .withColumn("author_id", F.expr("uuid()"))
          )

authors = (authors_raw.join(authors, ["first_name", "last_name", "middle_name"])
           .select("author_id", "first_name", "last_name", "middle_name", "title", F.col("name").alias("author"), "rank")
           .dropDuplicates()
          )

display(authors)

authors_clean = authors.select("author_id", "first_name", "last_name", "middle_name")
#authors_clean.write.format("delta").mode("overwrite").saveAsTable("authors")

author_id,first_name,last_name,middle_name,title,author,rank
bb4562a9-7fc4-41d8-8e87-0f7af2d0bc60,Gordon,Roberts,W.,A DC current measurement circuit for on-chip applications,Gordon W. Roberts,2
79196f76-ce00-4328-9f98-c299cec54269,,,,Mobile Adaptation with Multiple Representation Approach as Educational Pedagogy,Kinshuk,1
72a2477e-cccd-4798-bdd5-e4a9233f8df7,Jean,Nganou,B.,MV-algebras derived from ideals in BL-algebras,Jean B. Nganou,2
7c95ce26-a584-43ee-8785-c03d43cb340c,Petia,Todorova,,A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks,Petia Todorova,1
5490a15e-50d1-4700-86fb-70aa1058186f,Wu-yuin,Hwang,,A Study of Listening Diversity and Speaking for English Learning with Mobile Device Supports,Wu-yuin Hwang,1
06c1a2ff-416e-425a-a8d1-4fd4db4d20f4,Mustapha,Lalam,,Evaluating vehicular radio connectivity with environment-based metrics.,Mustapha Lalam,4
263e12bd-0281-491e-bbef-d0e7ee17e6dd,C.,Rangan,Pandu,Efficient statistical asynchronous verifiable secret sharing with optimal resilience,C. Pandu Rangan,3
db89f809-3bae-494c-a2dc-731b5258a6e2,John,Choi,D.,Performance analysis of RAKE receivers for ultra-wideband communications with PPM and OOK in multipath channels.,John D. Choi,1
00814a6c-7bf5-449d-b7ca-bb4210d61e3a,Henk,Vandecasteele,,Query transformations for improving the efficiency of ilp systems,Henk Vandecasteele,8
c36195bd-7928-4e97-973a-38d8b9ce6885,Hongyun,He,,An ideal run mode for mass transit based on ADS.,Hongyun He,4


## Organizations

In [0]:
def get_organization(name, country):
    try:
        URL = "https://api.ror.org/organizations?query=" + name.replace(" ", "+")
        r = requests.get(url = URL)
        data = r.json()
        if data['number_of_results'] == 0:
            return "No results"
        for i in range(data['number_of_results']):
            found_country = data['items'][i]["country"]["country_name"]
            if country == "" or country == found_country:
                return data['items'][i]["name"]+";"+data['items'][i]["addresses"][0]["city"]+";"+found_country
    except:
        return "No results"
      
get_organization_UDF = udf(get_organization)

In [0]:
countries = "(,|\s)(Afghanistan|Albania|Algeria|Andorra|Angola|Antigua and Barbuda|Argentina|Armenia|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bhutan|Bolivia|Bosnia and Herzegovina|Botswana|Brazil|Brunei|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Central African Republic|Chad|Chile|China|Colombia|Comoros|Democratic Republic of the Congo|Republic of the Congo|Costa Rica|Cote d'Ivoire|Croatia|Cuba|Cyprus|Czech Republic|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Fiji|Finland|France|Gabon|Gambia|Georgia|Germany|Ghana|Greece|Grenada|Guatemala|Guinea|Guinea-Bissau|Guyana|Haiti|Honduras|Hungary|Iceland|India|Indonesia|Iran|Iraq|Ireland|Israel|Italy|Jamaica|Japan|Jordan|Kazakhstan|Kenya|Kiribati|Kosovo|Kuwait|Kyrgyzstan|Laos|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|North Macedonia|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Mauritania|Mauritius|Mexico|Micronesia|Moldova|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Zealand|Nicaragua|Niger|Nigeria|North Korea|Norway|Oman|Pakistan|Palau|Palestine|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Poland|Portugal|Qatar|Romania|Russia|Rwanda|Saint Kitts and Nevis|Saint Lucia|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Korea|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Swaziland|Sweden|Switzerland|Syria|Taiwan|Tajikistan|Tanzania|Thailand|Timor-Leste|Togo|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Tuvalu|Uganda|Ukraine|United Arab Emirates|UAE|United Kingdom|UK|United States of America|USA|United States|US|Uruguay|Uzbekistan|Vanuatu|Vatican City|Venezuela|Vietnam|Yemen|Zambia|Zimbabwe)(,|\s|$)"


organization = (filtered_df.limit(100).select("authors", "title", F.explode(F.col("authors")).alias("authors_exp"))
               .select("authors_exp.*","*")
               .withColumn("author", F.col("name"))
               .select("author", "org")
               .filter(F.col("org").isNotNull())
               .dropDuplicates()
                # ., +, *, ?, ^, $, (, ), [, ], {, }, |, \
               .withColumn("strip_org", F.regexp_replace(F.col("org"), r'[\+-=#&\|><!\(\)\{\}\[\]\^"~\*\?:\\/]', " "))
               .withColumn("county", F.regexp_extract(F.col("org"), countries, 2))
               .withColumn("county", F.regexp_replace("county", "United States of America|USA|US", "United States"))
               .withColumn("county", F.regexp_replace("county", "UK", "United Kingdom"))
               .withColumn("county", F.regexp_replace("county", "UAE", "United Arab Emirates"))
               )

"""organization_rdd = organization.rdd.map(lambda x: (x[0], x[1], x[2], x[3], get_organization(x[2], x[3])))
organization_raw = (organization_rdd.toDF(["author", "org", "strip_org", "country", "api_org"])"""
organization_raw = (organization
                    .withColumn("api_org", get_organization_UDF(F.col("strip_org"), F.col("county")))
                    .withColumn("api_name", F.regexp_extract(F.col("api_org"), r"^(.+);.+;.+$", 1))
                    .withColumn("match", F.expr(r"regexp_extract(org, concat('(', api_name,')'), 0)"))
                    .withColumn("name", F.when(F.col("match") == "", F.col("org"))
                               .otherwise(F.col("api_name")))
                    .withColumn("city", F.when(F.col("match") == "", "")
                               .otherwise(F.regexp_extract(F.col("api_org"), r"^.+;(.+);.+$", 1)))
                    .withColumn("country", F.when(F.col("match") == "", F.col("county"))
                               .otherwise(F.regexp_extract(F.col("api_org"), r"^.+;.+;(.+)$", 1)))
                   )


organization = (organization_raw.select("name", "city", "country")
                .dropDuplicates()
                .withColumn("org_id", F.expr("uuid()"))
               )

organization = (organization_raw.join(organization, ["name", "city", "country"])
                .select("org_id", "name", "city", "country", "author")
               )

display(organization)

organization_clean = organization.select("org_id", "name", "city", "country")
#organization_clean.write.format("delta").mode("overwrite").saveAsTable("organization")

org_id,name,city,country,author
13f0a96f-1327-4203-9f22-4a59be12ebda,Universität Stuttgart,,,Gabriel Dermler
dc30d8da-73de-47c1-b47c-6ea130f6a134,National Taipei University of Technology,Taipei,Taiwan,Li-Jen Kao
db093e78-94dd-40d6-8d4c-9ad07b07c7f8,"Graduate of Network Learning Technology, National Central University, Taiwan",,Taiwan,Wu-Yuin Hwang
db093e78-94dd-40d6-8d4c-9ad07b07c7f8,"Graduate of Network Learning Technology, National Central University, Taiwan",,Taiwan,Sheng-Yi Wu
1c71f8a2-d29f-4590-bb71-c6cca0c542a6,"Orange Labs, CORE/TPN Laboratory, France Telecom, Issy-les-Moulineaux Cedex 9, France 92794",,France,Philippe Olivier
9a6670a8-4b1a-429e-80a4-9355f18ba03f,"LITA, Universite Paul Verlaine-Metz, Ile du Saulcy, 57045 Metz Cedex, France",,France,Maurice Margenstern
a077c2d0-db04-4a5b-b2fa-9158f38a96b7,,,,Ali Amouri
a077c2d0-db04-4a5b-b2fa-9158f38a96b7,,,,Hakim Mohellebi
a077c2d0-db04-4a5b-b2fa-9158f38a96b7,,,,Abderrahmane Kheddar
a077c2d0-db04-4a5b-b2fa-9158f38a96b7,,,,Hichem Arioui


## Publications

In [0]:
publication_raw = filtered_df.limit(100).select("_id", "title", "volume", "n_citation", "doi", "url")

display(publication_raw)

_id,title,volume,n_citation,doi,url
53e9987db7602d97020b8240,"Building partnerships among social science researchers, institution-based repositories and domain specific data archives",23,62,10.1108/10650750710720757,List(http://dx.doi.org/10.1108/10650750710720757)
53e9987db7602d97020b8243,Efficient statistical asynchronous verifiable secret sharing with optimal resilience,,14,10.1007/978-3-642-14496-7_7,List(http://dx.doi.org/10.1007/978-3-642-14496-7_7)
53e9987db7602d97020b818a,On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.,96-A,9,,List(http://search.ieice.org/bin/summary.php?id=e96-a_12_2728)
53e9987db7602d97020b8292,Learning Causal Models That Make Correct Manipulation Predictions,6,3,,List(http://www.jmlr.org/proceedings/papers/v6/voortman10a.html)
53e9987db7602d97020b818f,Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks,13,3,10.1007/s10776-005-0024-8,"List(http://dx.doi.org/10.1007/s10776-005-0024-8, https://link.springer.com/10.1007/s10776-005-0024-8)"
53e9987db7602d97020b8196,A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks,,18,10.1109/HICSS.2003.1174852,"List(http://dx.doi.org/10.1109/HICSS.2003.1174852, http://computer.org/proceedings/hicss/1874/track9/187490302babs.htm, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1174852)"
53e9987db7602d97020b82ba,"Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)",27,0,,"List(http://doi.acm.org/10.1145/182119.1096164, https://doi.org/10.1145/182119.1096164, http://doi.org/10.1145/182119.1096164, https://dblp.uni-trier.de/db/journals/sigir/sigir27.html#Can93)"
53e9987db7602d97020b82c6,Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling,5894,0,10.1007/978-3-642-10406-0_3,"List(http://dx.doi.org/10.1007/978-3-642-10406-0_3, http://www.webofknowledge.com/)"
53e9987db7602d97020b81ba,A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization,11,3,10.1016/j.asoc.2011.03.025,"List(http://dx.doi.org/10.1016/j.asoc.2011.03.025, http://www.webofknowledge.com/)"
53e9987db7602d97020b83e4,Computation Trees and Transformations of Logic Programs,,0,,List()


In [0]:
def check_return_data(check_type, data, cur):
    if data is not None and check_type in data:
        return data[check_type]
    else:
        return cur
    
def update_df(df, doi_list, data_list, data_str, is_int=False):
    data_dict = dict(zip(doi_list, data_list))
    update_data = udf(lambda x: data_dict[x], IntegerType() if is_int else StringType())
    return df.withColumn(data_str, update_data(F.col('doi')))

def get_publication_data(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    url_list = df.select(F.col("url")).rdd.flatMap(lambda x: x).collect()
    title_list = df.select(F.col("title")).rdd.flatMap(lambda x: x).collect()
    citation_list = df.select(F.col("n_citation")).rdd.flatMap(lambda x: x).collect()
    volume_list = df.select(F.col("volume")).rdd.flatMap(lambda x: x).collect()
    new_volume_list = []
    n_citation_list = []
    series_list = []
    new_doi_list = []
    for i, doi in enumerate(doi_list):
        try:
            if doi == "" or doi is None:
                if "doi" in url_list[i][0]:
                    doi_req = url_list[i][0].split("org/")[-1]
                else:
                    raise Exception
            else:
                doi_req = doi
            
            new_doi_list.append(doi_req)
            response = requests.get(f"https://api.crossref.org/works/{doi_req}")
            data = response.json()['message']
        except Exception:
            new_doi_list.append(None)
            data = None

        n_citation_list.append(int(check_return_data('is-referenced-by-count', data, citation_list[i])))
        new_volume_list.append(check_return_data('volume', data, volume_list[i]))
        temp = check_return_data('container-title', data, None)
        series_list.append(temp if temp == None else temp[0])

        time.sleep(0.05)
    
    df = update_df(df, doi_list, n_citation_list, 'n_citation')
    df = update_df(df, doi_list, new_volume_list, 'volume')
    df = update_df(df, doi_list, series_list, 'series')
    df = update_df(df, doi_list, new_doi_list, 'doi')
    
    return df

In [0]:
publications_raw = get_publication_data(publication_raw).filter("n_citation > 2")
# publication = get_publication_dataget_crossref_data(publication_raw).select("_id", "title", "volume", "series", "n_citation").filter("n_citation > 2")

publications = (publications_raw.select("title", "volume", "series", "n_citation")
            .dropDuplicates()
            .withColumn("publ_id", F.expr("uuid()"))
          )

publications = (publications_raw.join(publications, ["title", "volume", "series", "n_citation"])
           .select("publ_id", "title", "volume", "series", "n_citation")
           .dropDuplicates()
          )

display(publications)
publications.write.format("delta").mode("overwrite").saveAsTable("publications")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-4451806548808403>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mpublications_raw[0m [0;34m=[0m [0mget_publication_data[0m[0;34m([0m[0mpublication_raw[0m[0;34m)[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0;34m"n_citation > 2"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0;31m# publication = get_publication_dataget_crossref_data(publication_raw).select("_id", "title", "volume", "series", "n_citation").filter("n_citation > 2")[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0;34m[0m[0m
[1;32m      4[0m publications = (publications_raw.select("title", "volume", "series", "n_citation")
[1;32m      5[0m             [0;34m.[0m[0mdropDuplicates[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m<command-4451806548808402>

## Types

In [0]:
types_raw = filtered_df.limit(100).select("_id", "volume", "issue", "doi", "url", "venue", "title")

display(types_raw)

_id,volume,issue,doi,url,venue,title
53e9987db7602d97020b8240,23,1,10.1108/10650750710720757,List(http://dx.doi.org/10.1108/10650750710720757),"List(555036d97cea80f95415f809, null, null, Oclc Systems & Services, null, null, null, OCLC Systems & Services, null, null, null, null, 0)","Building partnerships among social science researchers, institution-based repositories and domain specific data archives"
53e9987db7602d97020b8243,,,10.1007/978-3-642-14496-7_7,List(http://dx.doi.org/10.1007/978-3-642-14496-7_7),"List(555037247cea80f95417608a, null, null, International Conference on Information Theoretic Security, null, null, null, ICITS, null, null, null, null, 0)",Efficient statistical asynchronous verifiable secret sharing with optimal resilience
53e9987db7602d97020b818a,96-A,12,,List(http://search.ieice.org/bin/summary.php?id=e96-a_12_2728),"List(555036cc7cea80f95415814b, null, null, IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences, null, null, null, IEICE Transactions, null, null, null, null, 0)",On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.
53e9987db7602d97020b8292,6,,,List(http://www.jmlr.org/proceedings/papers/v6/voortman10a.html),"List(53a728e520f7420be8bbc4bb, null, null, Neural Information Processing Systems, null, null, null, NIPS Causality: Objectives and Assessment, null, null, null, null, 0)",Learning Causal Models That Make Correct Manipulation Predictions
53e9987db7602d97020b818f,13,3,10.1007/s10776-005-0024-8,"List(http://dx.doi.org/10.1007/s10776-005-0024-8, https://link.springer.com/10.1007/s10776-005-0024-8)","List(555036c47cea80f954153a3f, null, null, International Journal of Wireless Information Networks, null, null, null, IJWIN, null, null, null, null, 0)",Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks
53e9987db7602d97020b8196,,,10.1109/HICSS.2003.1174852,"List(http://dx.doi.org/10.1109/HICSS.2003.1174852, http://computer.org/proceedings/hicss/1874/track9/187490302babs.htm, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1174852)","List(53a724b320f7420be8b37f4c, null, null, Hawaii International Conference on System Sciences, null, null, null, HICSS, null, null, null, null, 0)",A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks
53e9987db7602d97020b82ba,27,3,,"List(http://doi.acm.org/10.1145/182119.1096164, https://doi.org/10.1145/182119.1096164, http://doi.org/10.1145/182119.1096164, https://dblp.uni-trier.de/db/journals/sigir/sigir27.html#Can93)","List(53a72a9120f7420be8c056af, null, null, null, null, null, null, SIGIR Forum, null, null, null, null, 0)","Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)"
53e9987db7602d97020b82c6,5894,,10.1007/978-3-642-10406-0_3,"List(http://dx.doi.org/10.1007/978-3-642-10406-0_3, http://www.webofknowledge.com/)","List(53a72bd420f7420be8c3282b, null, null, null, null, null, null, NET-COOP, null, null, null, null, 0)",Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling
53e9987db7602d97020b81ba,11,8,10.1016/j.asoc.2011.03.025,"List(http://dx.doi.org/10.1016/j.asoc.2011.03.025, http://www.webofknowledge.com/)","List(555036b67cea80f95414b7c5, null, null, null, null, null, null, Appl. Soft Comput., null, null, null, null, 0)",A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization
53e9987db7602d97020b83e4,,,,List(),"List(53a7278720f7420be8b92c2a, null, null, International Conference on Lightning Protection, null, null, null, ICLP, null, null, null, null, 0)",Computation Trees and Transformations of Logic Programs


In [0]:
types_pre_api = types_raw.withColumn("type", F.when(F.col("venue.raw").contains("@"), "workshop")
                   .when(((F.col("volume").isNotNull()) & (F.col("volume") != "")) | ((F.col("issue").isNotNull()) & (F.col("issue") != "")), "journal-article")
                   .otherwise("conference paper"))

display(types_pre_api)

_id,volume,issue,doi,url,venue,title,type
53e9987db7602d97020b8240,23,1,10.1108/10650750710720757,List(http://dx.doi.org/10.1108/10650750710720757),"List(555036d97cea80f95415f809, null, null, Oclc Systems & Services, null, null, null, OCLC Systems & Services, null, null, null, null, 0)","Building partnerships among social science researchers, institution-based repositories and domain specific data archives",journal-article
53e9987db7602d97020b8243,,,10.1007/978-3-642-14496-7_7,List(http://dx.doi.org/10.1007/978-3-642-14496-7_7),"List(555037247cea80f95417608a, null, null, International Conference on Information Theoretic Security, null, null, null, ICITS, null, null, null, null, 0)",Efficient statistical asynchronous verifiable secret sharing with optimal resilience,conference paper
53e9987db7602d97020b818a,96-A,12,,List(http://search.ieice.org/bin/summary.php?id=e96-a_12_2728),"List(555036cc7cea80f95415814b, null, null, IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences, null, null, null, IEICE Transactions, null, null, null, null, 0)",On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.,journal-article
53e9987db7602d97020b8292,6,,,List(http://www.jmlr.org/proceedings/papers/v6/voortman10a.html),"List(53a728e520f7420be8bbc4bb, null, null, Neural Information Processing Systems, null, null, null, NIPS Causality: Objectives and Assessment, null, null, null, null, 0)",Learning Causal Models That Make Correct Manipulation Predictions,journal-article
53e9987db7602d97020b818f,13,3,10.1007/s10776-005-0024-8,"List(http://dx.doi.org/10.1007/s10776-005-0024-8, https://link.springer.com/10.1007/s10776-005-0024-8)","List(555036c47cea80f954153a3f, null, null, International Journal of Wireless Information Networks, null, null, null, IJWIN, null, null, null, null, 0)",Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks,journal-article
53e9987db7602d97020b8196,,,10.1109/HICSS.2003.1174852,"List(http://dx.doi.org/10.1109/HICSS.2003.1174852, http://computer.org/proceedings/hicss/1874/track9/187490302babs.htm, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1174852)","List(53a724b320f7420be8b37f4c, null, null, Hawaii International Conference on System Sciences, null, null, null, HICSS, null, null, null, null, 0)",A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks,conference paper
53e9987db7602d97020b82ba,27,3,,"List(http://doi.acm.org/10.1145/182119.1096164, https://doi.org/10.1145/182119.1096164, http://doi.org/10.1145/182119.1096164, https://dblp.uni-trier.de/db/journals/sigir/sigir27.html#Can93)","List(53a72a9120f7420be8c056af, null, null, null, null, null, null, SIGIR Forum, null, null, null, null, 0)","Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)",journal-article
53e9987db7602d97020b82c6,5894,,10.1007/978-3-642-10406-0_3,"List(http://dx.doi.org/10.1007/978-3-642-10406-0_3, http://www.webofknowledge.com/)","List(53a72bd420f7420be8c3282b, null, null, null, null, null, null, NET-COOP, null, null, null, null, 0)",Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling,journal-article
53e9987db7602d97020b81ba,11,8,10.1016/j.asoc.2011.03.025,"List(http://dx.doi.org/10.1016/j.asoc.2011.03.025, http://www.webofknowledge.com/)","List(555036b67cea80f95414b7c5, null, null, null, null, null, null, Appl. Soft Comput., null, null, null, null, 0)",A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization,journal-article
53e9987db7602d97020b83e4,,,,List(),"List(53a7278720f7420be8b92c2a, null, null, International Conference on Lightning Protection, null, null, null, ICLP, null, null, null, null, 0)",Computation Trees and Transformations of Logic Programs,conference paper


In [0]:
def check_return_data(check_type, data, cur):
    if data is not None and check_type in data:
        return data[check_type]
    else:
        return cur
    
def update_df(df, doi_list, data_list, data_str, is_int=False):
    new_df = spark.createDataFrame(zip(df.select("doi").rdd.flatMap(lambda x: x).collect(), data_list), ["doi", data_str])
    return df.join(new_df, on="doi", how="left").select(new_df["doi"], new_df[data_str], F.col("title"))

def get_type_data(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    url_list = df.select(F.col("url")).rdd.flatMap(lambda x: x).collect()
    type_list = df.select(F.col("type")).rdd.flatMap(lambda x: x).collect()
    new_type_list = []
    for i, doi in enumerate(doi_list):
        try:
            if doi == "" or doi is None:
                if "doi" in url_list[i][0]:
                    doi_req = url_list[i][0].split("org/")[-1]
                else:
                    raise Exception
            else:
                doi_req = doi
            
            response = requests.get(f"https://api.crossref.org/works/{doi_req}")
            data = response.json()['message']
        except Exception:
            data = None

        new_type_list.append(check_return_data('type', data, type_list[i]))

        time.sleep(0.05)

    df = update_df(df, doi_list, new_type_list, 'type')
    
    return df

In [0]:
types_raw = get_type_data(types_pre_api)

types = (types_raw.select("type")
            .dropDuplicates()
            .withColumn("type_id", F.expr("uuid()"))
          )

types = (types_raw.join(types, ["type"])
           .select("type_id", "type", "title")
           .dropDuplicates()
        )

display(types)

types_clean = types.select("type_id", "type")
types_clean.write.format("delta").mode("overwrite").saveAsTable("types")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-4451806548808408>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mtypes_raw[0m [0;34m=[0m [0mget_type_data[0m[0;34m([0m[0mtypes_pre_api[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0;34m[0m[0m
[1;32m      3[0m types = (types_raw.select("type")
[1;32m      4[0m             [0;34m.[0m[0mdropDuplicates[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m             [0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m"type_id"[0m[0;34m,[0m [0mF[0m[0;34m.[0m[0mexpr[0m[0;34m([0m[0;34m"uuid()"[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m<command-4451806548808407>[0m in [0;36mget_type_data[0;34m(df)[0m
[1;32m     10[0m [0;34m[0m[0m
[1;32m     11[0m [0;32mdef[0m [0mget_type_data[0m[0;34m

## Keywords

In [0]:
keywords_raw = filtered_df.limit(100).select("_id", "keywords", "doi", "title")

display(keywords_raw)

_id,keywords,doi,title
53e9987db7602d97020b8240,"List(design methodology, information flow, data storage, social science, preprint, digital library, life cycle, digital repository)",10.1108/10650750710720757,"Building partnerships among social science researchers, institution-based repositories and domain specific data archives"
53e9987db7602d97020b8243,"List(optimal resilience, efficient statistical asynchronous, statistical avss protocol, avss protocol, a-cast communication, verifiable secret sharing, a-casts o, private communication, multiplication gate, communication complexity, share l, statistical ampc, finite field, secret sharing)",10.1007/978-3-642-14496-7_7,Efficient statistical asynchronous verifiable secret sharing with optimal resilience
53e9987db7602d97020b818a,"List(mean squared error, orthogonal matching pursuit, compressed sensing)",,On the Sparse Signal Recovery with Parallel Orthogonal Matching Pursuit.
53e9987db7602d97020b8292,List(causal models),,Learning Causal Models That Make Correct Manipulation Predictions
53e9987db7602d97020b818f,"List(Packet radio networks, multimedia routing protocols, frequency-hop spread spectrum, ad hoc wireless networks)",10.1007/s10776-005-0024-8,Energy-efficient Routing of Multimedia Traffic in Frequency-Hop Packet Radio Networks
53e9987db7602d97020b8196,"List(bandwidth allocation, mobile satellite communication, multimedia communication, network topology, quality of service, telecommunication congestion control, telecommunication network management, QoS, bandwidth utilization, call blocking probability, call dropping probability, handoff management, multimedia LEO satellite networks, multimedia connections, network topology, predictive bandwidth allocation, simulation, two-cell-lookahead call admission)",10.1109/HICSS.2003.1174852,A Two-Cell-Lookahead Call Admission and Handoff Management Scheme for Multimedia LEO Satellite Networks
53e9987db7602d97020b82ba,"List(information retrieval, data structure)",,"Information Retrieval Data Structures & Algorithms, by William B. Frakes and Ricardo Baeza-Yates (Book Review)"
53e9987db7602d97020b82c6,"List(traffic variation, constant bit rate flow, original contribution, performance evaluation, continuous distribution, quasi-stationary modelling, quasi-stationary approach, traffic performance, simple case, flow peak rate, multi-rate aspect, multi-rate streaming traffic, simulation, constant bit rate)",10.1007/978-3-642-10406-0_3,Performance Evaluation of Multi-rate Streaming Traffic by Quasi-Stationary Modelling
53e9987db7602d97020b81ba,"List(ant colony optimization, biomedical data, generalization ability, biomedicine community, biomedical classification, ensemble learning, existing technique, higher prediction performance, trained component classifier, novel ensemble algorithm, ensemble approach, rough set)",10.1016/j.asoc.2011.03.025,A novel ensemble algorithm for biomedical classification based on Ant Colony Optimization
53e9987db7602d97020b83e4,List(),,Computation Trees and Transformations of Logic Programs


In [0]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def is_city_location_person(keyword):
    # Process the keyword using the spaCy NER model
    doc = nlp(keyword)
    
    # Check if any of the entities recognized by the model are of type GPE (city or location) or PERSON
    for ent in doc.ents:
        if ent.label_ in ['GPE', 'PERSON']:
            return True
    
    return False

def update_df(df, doi_list, data_list, data_str, is_int=False):
    data_dict = dict(zip(doi_list, data_list))
    update_data = udf(lambda x: data_dict[x], IntegerType() if is_int else StringType())
    return df.withColumn(data_str, update_data(F.col('doi')))

def check_keywords(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    keywords_list = df.select(F.col("keywords")).rdd.flatMap(lambda x: x).collect()
    new_keywords_list = []
    for i, keys in enumerate(keywords_list):
        temp = []
        for k in keys:
            if not is_city_location_person(k):
                temp.append(k)
        new_keywords_list.append(temp)
        
    df = update_df(df, doi_list, keywords_list, 'keywords')
    
    return df

In [0]:
keywords_raw = check_keywords(keywords_raw).filter(F.size(keywords_raw.keywords) > 0)


keywords = (keywords_raw.select("keywords")
            .dropDuplicates()
            .withColumn("keyw_id", F.expr("uuid()"))
          )

keywords = (keywords_raw.join(keywords, ["keywords"])
           .select("keyw_id", "keywords", "title")
           .dropDuplicates()
        )

display(keywords)

keywords_clean = keywords.select("keyw_id", "keywords")
keywords_clean.write.format("delta").mode("overwrite").saveAsTable("keywords")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-4451806548808412>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mkeywords_raw[0m [0;34m=[0m [0mcheck_keywords[0m[0;34m([0m[0mkeywords_raw[0m[0;34m)[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0mF[0m[0;34m.[0m[0msize[0m[0;34m([0m[0mkeywords_raw[0m[0;34m.[0m[0mkeywords[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0;34m[0m[0m
[1;32m      3[0m [0;34m[0m[0m
[1;32m      4[0m keywords = (keywords_raw.select("keywords")
[1;32m      5[0m             [0;34m.[0m[0mdropDuplicates[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m<command-4451806548808411>[0m in [0;36mcheck_keywords[0;34m(df)[0m
[1;32m     19[0m [0;34m[0m[0m
[1;32m     20[0m [0;32mdef[0m [0mcheck_keywords

## Date

In [0]:
# https://pypi.org/project/habanero/
cr = Crossref()

def getDateHabanero(doi, year):
    formated = [None, None, None]
    try:
        result = cr.works(ids = doi)['message']['issued']['date-parts'][0]
        for i in [0, 1, 2]:
            formated[i] = result[i]
    except:
        pass
    if formated[0] == None:
        formated[0] = year 
    return formated

getDateHabaneroUDF = udf(getDateHabanero, ArrayType(StringType()))

In [0]:
dates_df = (filtered_df
            .limit(100)
            .select('title', "year", "doi")
            .withColumn("Date", getDateHabaneroUDF(F.col("doi"), F.col("year")))
            .select('title',
                    F.col("Date")[2].alias("day"),
                    F.col("Date")[1].alias("month"),
                    F.col("Date")[0].alias("year")
                   )
           )

dates = (dates_df.select([F.col("day").alias("day1"), F.col("month").alias("month1"), F.col("year").alias("year1")])
            .dropDuplicates()
            .withColumn("date_id", F.expr("uuid()"))
          )

dates = (dates_df.join(dates, ((dates_df.day.eqNullSafe(dates.day1)) & 
                               (dates_df.month.eqNullSafe(dates.month1)) &
                               (dates_df.year.eqNullSafe(dates.year1))))
           .select("date_id", "day", "month", "year", "title")
           .dropDuplicates()
        )

display(dates)

dates_clean = dates.select("date_id", "day", "month", "year")
#dates_clean.write.format("delta").mode("overwrite").saveAsTable("dates")

date_id,day,month,year,title
e4422330-cb92-4d71-95a7-219012c4b0ca,,6.0,2010,Subspace models for document script and language identification
dfdb9d29-9504-451d-851c-e8c1eb2eef1d,,,2005,An ideal run mode for mass transit based on ADS.
8fd71e92-0e59-4b1a-9275-82d463b00016,,,2002,Performance analysis of RAKE receivers for ultra-wideband communications with PPM and OOK in multipath channels.
137b0e14-c2b0-44e7-8237-8bf0156f96e8,,3.0,2008,Design and Control of a Small-Clearance Driving Simulator
b4c57083-d314-4621-9152-034b8ee5c701,,,2000,On Soddy's Hexlet and a Linked 4-Pair
2cd3eb1c-48f3-4c94-a93c-ee75704bf4d8,,10.0,2009,Hemispheric asymmetry in cognitive division of anterior cingulate cortex: a resting-state functional connectivity study.
ed75524a-8c07-4ecb-bd5c-7ca8d3b793d9,,5.0,2013,A 1.7mW quadrature bandpass ΔΣ ADC with 1MHz BW and 60dB DR at 1MHz IF
02aedb0e-6a29-4101-8286-ebc58256ac58,,,2010,Efficient statistical asynchronous verifiable secret sharing with optimal resilience
00cba79e-342f-4088-b554-6226b1684422,,,2004,An efficient scheduling algorithm for combined input-crosspoint-queued (CICQ) switches
d44990b4-3af0-4d88-bcbe-108de0a7a312,,9.0,1981,A Quadratic Programming Model for Optimal Data Distribution


## Language

In [0]:
lang_df = filtered_df.select("title", "lang")

lang = (lang_df.select("lang")
            .dropDuplicates()
            .withColumn("lang_id", F.expr("uuid()"))
          )

lang = (lang_df.join(lang, ["lang"])
           .select("lang_id", "lang", "title")
           .dropDuplicates()
        )

display(lang)

lang_clean = lang.select("lang_id", "lang")
#lang_clean.write.format("delta").mode("overwrite").saveAsTable("lang")

lang_id,lang,title
bf7228a0-9880-429c-90ee-917bcbf58433,en,A Pragmatic Survey of Automated Debugging
bf7228a0-9880-429c-90ee-917bcbf58433,en,On the extension of UML with use case maps concepts
bf7228a0-9880-429c-90ee-917bcbf58433,en,Forecasting financial condition of Chinese listed companies based on support vector machine
bf7228a0-9880-429c-90ee-917bcbf58433,en,High Assurance Step-by-Step Autonomous Construction Technique for Large Real Time System
bf7228a0-9880-429c-90ee-917bcbf58433,en,Practice: why initial conditions are important
bf7228a0-9880-429c-90ee-917bcbf58433,en,ERS-1 scatterometer measurements. II. An algorithm for ocean-surface wind retrieval including light winds
bf7228a0-9880-429c-90ee-917bcbf58433,en,Thermal expansion and third order elastic constants of FCC metals
bf7228a0-9880-429c-90ee-917bcbf58433,en,Caching in Real-time Reconfiguration Port Scheduling
bf7228a0-9880-429c-90ee-917bcbf58433,en,Building Scalable PGAS Communication Subsystem on Blue Gene/Q
bf7228a0-9880-429c-90ee-917bcbf58433,en,W-Disjoint Orthogonality Based Residual Acoustic Echo Cancellation for Hands-Free Communication


## MAIN TABLE

In [0]:
main_table = (publications.join(authors, ["title"])
              .join(organization, ["author"], "left")
              .join(types, ["title"])
              .join(keywords, ["title"])
              .join(dates, ["title"])
              .join(lang, ["title"])
              .select("publ_id", "author_id", "org_id", "type_id", "keyw_id", "date_id", "lang_id", F.col("rank").alias("author_rank"))
              )
display(main_table)
main_table.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("DBLP_fact_table")

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-4451806548808420>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m main_table = (publications.join(authors, ["title"])
[0m[1;32m      2[0m               [0;34m.[0m[0mjoin[0m[0;34m([0m[0morganization[0m[0;34m,[0m [0;34m[[0m[0;34m"author"[0m[0;34m][0m[0;34m,[0m [0;34m"left"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m               [0;34m.[0m[0mjoin[0m[0;34m([0m[0mtypes[0m[0;34m,[0m [0;34m[[0m[0;34m"title"[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m               [0;34m.[0m[0mjoin[0m[0;34m([0m[0mkeywords[0m[0;34m,[0m [0;34m[[0m[0;34m"title"[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m               [0;34m.[0m[0mjoin[0m[0;34m([0m[0mdates[0m[0;34m,[0m [0;34m[