In [0]:
%sh
pip install spacy
pip install habanero
python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (924 kB)
Collecting tqdm<5.0.0,>=4.38.0
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (491 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (128 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2

In [0]:
dbutils.library.restartPython()

In [0]:
from pyspark.sql.types import StringType, IntegerType, ArrayType
import time
import pyspark.sql.functions as F
from pyspark.sql import Window
import requests
import json
import urllib.parse
import spacy

#### Database Schema

<img src="https://i.ibb.co/hRqLR8d/Schema.png" alt="Schema" border="0">

In [0]:
raw_df = spark.read.format("delta").load("/user/hive/warehouse/scientific_publications")
display(raw_df.limit(5))

_id,abstract,authors,doi,fos,isbn,issn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year
53e99784b7602d9701f3e3f5,,,,,,,,List(),en,0.0,,,,,3GIO.,,"List(null, null, null, null, null, null, null, null, null, null, null, null, 0)",,2011
53e99784b7602d9701f3e133,"Drought is the first place in all the natural disasters in the world. It is especially serious in North China Plain. In this paper, different soil water content control levels at winter wheat growth stages are performed on Gucheng Ecological-Meteorological Integrated Observation Experiment Station of CAMS, China. Some canopy parameters, including growth conditions, dry weight, physiological parameters and hyperspectral reflectance, are measured from erecting stage to milk stage for winter wheat in 2009. The relationship between canopy parameters and soil relative moisture, canopy water content and water indices of winter wheat are established. The results show that some parameters, such as SPAD and dry weight of leaves, decrease with the increasing of soil relative moisture, while other parameters, including dry weight of caudexes, above ground dry weight, height, photosynthesis rate, intercellular CO 2 concentration, stomatal conductance and transpiration rate, increase corresponding to the soil relative moisture. Obvious linear relationship between stomatal conductance and transpiration rate is established with 45 samples, which R2 reaches to 0.6152. Finally, the fitting equations between canopy water content and water indices are regressed with b5, b6 and b7 of MODIS bands. The equations are best with b7 and worst with b5. So the fitting equations with b7 can be used to inverse the canopy water content of winter wheat using MODIS or other remote sensing images with similar bands range to MODIS in Hebei Province. © 2011 IEEE.","List(List(53f45728dabfaec09f209538, null, null, null, Peijuan Wang, null, null, null, null, null, null, null, null, null, null), List(5601754345cedb3395e59457, null, null, null, Jiahua Zhang, null, null, null, null, null, null, null, null, null, null), List(53f38438dabfae4b34a08928, null, null, null, Donghui Xie, null, null, null, null, null, null, null, null, null, null), List(5601754345cedb3395e5945a, null, null, null, Yanyan Xu, null, null, null, null, null, null, null, null, null, null), List(53f43d25dabfaeecd6995149, null, null, null, Yun Xu, null, null, null, null, null, null, null, null, null, null))",10.1109/IGARSS.2011.6049503,"List(Agronomy, Moisture, Hydrology, Environmental science, Dry weight, Water content, Stomatal conductance, Transpiration, Irrigation, Soil water, Canopy)",,,,"List(canopy parameters, canopy spectrum, different soil water content control, winter wheat, irrigation, hydrology, radiometry, moisture, indexes, vegetation, indexation, dry weight, soil moisture, water content, indexing terms, spectrum, natural disaster)",en,0.0,1933,1930.0,,,The relationship between canopy parameters and spectrum of winter wheat under different irrigations in Hebei Province.,List(http://dx.doi.org/10.1109/IGARSS.2011.6049503),"List(53a7297d20f7420be8bd4ae7, null, null, International Geoscience and Remote Sensing Symposium, null, null, null, IGARSS, null, null, null, null, 0)",,2011
53e99784b7602d9701f3e151,,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17.0,605,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),"List(53a72a4920f7420be8bfa51b, null, null, International Conference on Document Analysis and Recognition, null, null, null, ICDAR-1, null, null, null, null, 0)",,1993
53e99784b7602d9701f3e15d,"As process variations become a significant problem in deep sub-micron technology, a shift from deterministic static timing analysis to statistical static timing analysis for high-performance circuit designs could reduce the excessive conservatism that is built into current timing design methods. We address the timing yield problem for sequential circuits and propose a statistical approach to handle it. We consider the spatial and path reconvergence correlations between path delays, set-up time and hold time constraints, and clock skew due to process variations. We propose a method to get the timing yield based on the delay distributions of register-to-register paths in the circuit On average, the timing yield results obtained by our approach have average errors of less than 1.0% in comparison with Monte Carlo simulation. Experimental results show that shortest path variations and clock skew due to process variations have considerable impact on circuit timing, which could bias the timing yield results. In addition, the correlation between longest and shortest path delays is not significant.","List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28.0,2464Vol.3,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)","List(53a72e2020f7420be8c80142, null, null, International Symposium on Circuits and Systems, null, null, null, ISCAS (3), null, null, null, null, 0)",,2005
53e99784b7602d9701f3e161,"360° represents the concerns that are addressed in most of my work and my approach to it: the effect of global high technology on daily life and the environment (for example, daily short-distance flights to avoid enormous wastes of time).","List(List(53f46946dabfaec09f24b4ed, null, null, 5b86cf1ae1cd8e14a3fc787b, Miguel Palma, null, 544bd9c245ce266baf189c4f, null, null, Miguel Palma Studio, null, null, null, null, null))",10.1145/1665137.1665166,,,,,"List(global high technology, daily short-distance flight, enormous waste, daily life)",en,,39,39.0,,,360°,,"List(5390a74a20f70186a0e8b40b, null, null, null, null, null, null, ACM SIGGRAPH ASIA 2009 Art Gallery & Emerging Technologies: Adaptation, null, null, null, null, null)",,2009


In [0]:
#Filter out titles that have less than 2 words
filtered_df = raw_df.withColumn("title_word_count", F.size(F.split("title", " "))).filter("title_word_count > 1")
display(filtered_df.limit(25))

_id,abstract,authors,doi,fos,isbn,issn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year,title_word_count
53e99967b7602d97021a560a,"Ab initio predictions of secondary structures in proteins have to combine local predictions, based on short fragments of the protein sequence, with consistency restrictions, as not all locally plausible predictions may be simultaneously true. We use the fact that secondary structures are patterns of hydrogen bonds and that a single residue can participate in hydrogen bonds of at most one secondary structure. Consistency of fixed-sized pieces of secondary structures is the easiest to approximate and we formalize it as 1-2 matching problem. Consistency of entire secondary structures is a version of set packing. We also investigate how to form a simple problem if we add the requirement that the secondary structure and the loops that connect them fit together in a metric space. Every problem that we investigated is MAX-SNP hard and it has a constant factor approximation. Computational experience suggests that in biological instances, we can find nearly optimal solutions using heuristics.","List(List(53f45d3adabfaee0d9c0cb90, null, berman@cse.psu.edu, 5b86b6c2e1cd8e14a34ca959, Piotr Berman, null, null, null, null, The Pennsylvania State University, University Park, PA, USA, null, 5f71b2d91c455f439fe3eb4f, List(Penn State Univ, University Pk, PA 16802 USA), null, null), List(53f436cbdabfaedf43582e9f, null, null, 5b86b6c2e1cd8e14a34ca959, Jieun Jeong, null, null, null, null, The Pennsylvania State University, University Park, PA, USA, null, 5f71b2d91c455f439fe3eb4f, List(Penn State Univ, University Pk, PA 16802 USA), null, null))",10.1007/s00453-007-9068-8,"List(Algorithm, Heuristics, Set packing, Metric space, Ab initio, Hydrogen bond, Protein secondary structure, Mathematics, Protein structure, Theory of computation)",,0178-4617,1,"List(Secondary Structure, Maximum Weight, Local Prediction, Horizontal Edge, Connector Edge)",en,2,34,16.0,,"List(53e9bdcdb7602d9704a7e6ae, 53e99d5eb7602d970261a1c4, 53e9bdcdb7602d9704a7e6ae, 53e99e3eb7602d970270422e, 53e9adffb7602d970380a762, 53e9b593b7602d97040d9071, 53e99c6fb7602d9702522122, 55a43e58612ca648688dc943, 53e99822b7602d97020431be)",Consistent Sets of Secondary Structures in Proteins,"List(http://dx.doi.org/10.1007/s00453-007-9068-8, http://dx.doi.org/https://doi.org/10.1007/s00453-007-9068-8, https://link.springer.com/article/10.1007/s00453-007-9068-8, https://dl.acm.org/citation.cfm?id=3118965&picked=prox&preflayout=flat, http://www.webofknowledge.com/)","List(539078f220f770854f5a882f, null, null, null, null, null, null, Algorithmica, null, null, null, null, 0)",53,2009,7
53e99960b7602d97021a53fa,"In this work a new method of feature extraction for an interactive and adaptive recognizer for on-line handwritten alphanumeric characters has been proposed. The system is suitable for use in conjunction with magnetic pen based devices for inputting data to a data processing system or a computer terminal. The features are extracted from dynamically changing locations of the writing device. The new method of feature extraction is simple, computationally light and fast enough for adaptive on-line use. Extracted features are robust with respect to all possible distortions like shape, size, and orientation. For simulation experiment, numerals 0-9 are used. A single hidden layer feed forward neural network trained by Quickprop algorithm, a variation of error back propagation is used for recognition. Very high recognition rates, even for highly distorted samples have been achieved confirming high generalization capability of the extracted feature set.","List(List(53f36626dabfae4b3499aa7a, null, null, 5b86b9c0e1cd8e14a3625bbe, Basabi Chakraborty, null, null, null, null, Faculty of Software and Information Science, Iwate Prefectural University, 152-52 Aza Sugo, Takizawamura, Iwate 020-0193, Japan, null, 5f71b4211c455f439fe47d1e, List(Faculty of Software and Information Science, Iwate Prefectural University, 152-52 Aza Sugo, Takizawamura, Iwate 020-0193, Japan), null, null), List(53f428c6dabfaeb22f3d18be, null, null, 5b86b9c0e1cd8e14a3625bbe, Goutam Chakraborty, null, null, null, null, Faculty of Software and Information Science, Iwate Prefectural University, 152-52 Aza Sugo, Takizawamura, Iwate 020-0193, Japan, null, 5f71b4211c455f439fe47d1e, List(Corresponding author, Faculty of Software and Information Science, Iwate Prefectural University, 152-52 Aza Sugo, Takizawamura, Iwate 020-0193, Japan), null, null))",10.1016/S0020-0255(02)00276-1,"List(Alphanumeric, Feedforward neural network, Pattern recognition, Computer science, Data processing system, Feature extraction, Feature (machine learning), Artificial intelligence, Backpropagation, Artificial neural network, Quickprop)",,0020-0255,1-4,"List(adaptive on-line use, data processing system, feature extraction, inputting data, on-line recognition, handwritten alphanumeric character, feature set, new feature extraction technique, extracted feature, adaptive recognizer, new method, high generalization capability, high recognition rate, feed forward neural network, error back propagation, simulation experiment, data processing, artificial neural network)",en,19,70,55.0,,"List(53e998f6b7602d9702131ac1, 53e9a7acb7602d97030ebcb1, 53e9bd5ab7602d97049f4d54, 53e9b76eb7602d9704316bdc, 53e9a381b7602d9702c90ad0, 557d555c6feeaa8086da9ac9, 53e9a88cb7602d97031d8e96)",A new feature extraction technique for on-line recognition of handwritten alphanumeric characters,"List(http://dx.doi.org/10.1016/S0020-0255(02)00276-1, https://www.sciencedirect.com/science/article/pii/S0020025502002761)","List(555036b77cea80f95414b7e3, null, null, null, null, null, null, Inf. Sci., null, null, null, null, 0)",148,2002,12
53e99960b7602d97021a5416,,"List(List(53f4489fdabfaeee22a007dc, null, null, null, T. Merriam, null, null, null, null, null, null, null, null, null, 188660))",10.1093/llc/19.2.181,"List(Inference, Norm (social), Philosophy, Literature)",,,2,List(),en,0,195,181.0,,,King John Divided,List(http://dx.doi.org/10.1093/llc/19.2.181),"List(555036d37cea80f95415b0b3, null, null, null, null, null, null, LLC, null, null, null, null, 0)",19,2004,3
53e99960b7602d97021a541f,"The rapid evolution of the telecommunication domain increases the performance of different access networks continuously. New services, especially in the domain multimedia content distribution, require higher and higher bandwidth at the user's and service provider's side. Multimedia services like Video on Demand, IPTV, and live streaming were introduced in the past and are still improved in quality and quantity. Multimedia streams and Peer to Peer P2P file sharing dominates the worldwide Internet traffic nowadays and will continue further Schulze, 2009. The user acceptance of enjoying multimedia content over the Internet will grow steadily together with the increasing quality of the available multimedia content. Network operators and service providers have to face the growths by increasing their service platform with higher performance and bandwidth or introducing a scalable solution. In this paper, the authors present an algorithm for scalable P2P live streaming in Next-Generation-Networks NGN that addresses this challenge. An evaluation proves the performance of the implementation of this algorithm in a demo scenario.","List(List(null, null, null, 5b8694a4e1cd8e14a36732eb, Julius Müller, null, null, null, null, Fraunhofer FOKUS, Germany, null, 5f71b4fc1c455f439fe4def4, null, null, null), List(53f42abbdabfaeb1a7b680dc, null, null, 5b869fc3e1cd8e14a3b31423, Thomas Magedanz, null, null, null, null, TU Berlin, Germany, null, 5f71b2901c455f439fe3cbc2, null, null, null), List(53f4bb8cdabfaedce565271a, null, null, 5b869fc3e1cd8e14a3b31423, Jens Fiedler, null, null, null, null, TU Berlin, Germany, null, 5f71b2901c455f439fe3cbc2, null, null, null))",10.4018/jhcr.2010100102,"List(Next-generation network, Peer-to-peer, Computer science, Computer network, Service provider, IPTV, File sharing, Multimedia, Access network, Internet traffic, The Internet)",,,4,"List(scalability, ims, p2p, ngn, next generation network)",en,0,40,25.0,,"List(53e9b8a8b7602d9704480882, 53e9a88cb7602d97031d7522, 53e9b37bb7602d9703e5b878, 53e9ba06b7602d9704608db3, 53e9af61b7602d97039a16ce, 53e9a3e1b7602d9702cf7e32, 53e9a44fb7602d9702d6b817, 558ae98de4b037c08759e9ae, 5ce2d052ced107d4c6367f01, 5c871fb14895d9cbc6cec40d, 53e9be35b7602d9704af0d7e, 53e99915b7602d970214faad, 53e9b2bfb7602d9703d6cb3b)",Peer Assist Live Streaming Overlay for Next-Generation-Networks,"List(http://dx.doi.org/10.4018/jhcr.2010100102, http://dx.doi.org/10.4018/978-1-4666-0921-1.ch017)","List(555036cc7cea80f954158149, null, null, International Journal of Handheld Computing Research, null, null, null, IJHCR, null, null, null, null, 0)",1,2010,7
53e99960b7602d97021a5427,"In conventional DHTs, each node is assigned an exclusive slice of identifier space. Simple it is, such arrangement may be rough. In this paper we propose a generic component structure: several independent nodes constitute a cell; a slice of identifier space is under nodes' condominium; part of nodes in the same cell cooperatively and transparently shield the internal dynamism and structure of the cell from outsiders; this type of structure can be recursively repeated. Cells act like raw nodes in conventional DHTs and cell components can be used as bricks to construct any DHT-like systems. This approach provides encapsulation, scalable hierarchy, and enhanced security with bare incurred complexity.","List(List(54408649dabfae7f9b33f2b4, null, cm01@mails.tsinghua.edu.cn, 5b86c040e1cd8e14a3913066, Ming Chen, null, null, null, null, Dept. of Computer Science and Technology, Tsinghua University, null, 5f71b2881c455f439fe3c860, null, null, null), List(null, null, ygw@mail.tsinghua.edu.cn, 5b86c040e1cd8e14a3913066, Guangwen Yang, null, null, null, null, Dept. of Computer Science and Technology, Tsinghua University, null, 5f71b2881c455f439fe3c860, null, null, null), List(56061ee245cedb33967bcefd, null, wuyw@tsinghua.edu.cn, 5b86c040e1cd8e14a3913066, Yongwei Wu, null, null, null, null, Dept. of Computer Science and Technology, Tsinghua University, null, 5f71b2881c455f439fe3c860, null, null, null), List(53f4276ddabfaec09f0d8801, null, xuezhengliu00@mails.tsinghua.edu.cn, 5b86c040e1cd8e14a3913066, Xuezheng Liu, null, null, null, null, Dept. of Computer Science and Technology, Tsinghua University, null, 5f71b2881c455f439fe3c860, null, null, null))",10.1007/978-3-540-30141-7_67,"List(Object-oriented programming, Identifier, Computer science, Hierarchy, Generic programming, Encapsulation (computer programming), Recursion, Hash table, Distributed computing, Scalability)",,0302-9743,,List(),en,2,484,481.0,https://static.aminer.cn/upload/pdf/1967/700/1141/53e99960b7602d97021a5427_0.pdf,"List(5ce2d052ced107d4c6367f01, 53e9ab69b7602d970350990d, 53e9a0d1b7602d97029b7f79, 53e9a4d6b7602d9702df4ac5, 53e99b78b7602d970241f5f5)",Paramecium: Assembling Raw Nodes into Composite Cells,"List(http://dx.doi.org/10.1007/978-3-540-30141-7_67, http://www.webofknowledge.com/)","List(null, 0302-9743, null, null, null, 1611-3349, null, LECTURE NOTES IN COMPUTER SCIENCE, null, LECTURE NOTES IN COMPUTER SCIENCE, null, J, null)",3222,2004,7
53e99967b7602d97021a5645,,"List(List(53f43205dabfaeb2ac025041, null, null, null, Jean-François Nouvel, null, null, null, null, null, null, null, null, null, null), List(53f4667ddabfaee43ecf6de8, null, null, null, Jean-Claude Souyris, null, null, null, null, null, null, null, null, null, null))",10.1109/IGARSS.2009.5418122,"List(Radar engineering details, Radar, Nadir, Altimeter, Synthetic aperture radar, Backscatter, Remote sensing, Specular reflection, Ka band, Geology, Geodesy)",,,,"List(airborne radar, backscatter, height measurement, oceanographic regions, oceanographic techniques, remote sensing by radar, synthetic aperture radar, Camargue region, DRIVE radar, France, ONERA BUSARD platform, SAR instrument, backscattering coefficient profiles, dry surfaces, future space Ka band altimetry missions, nadir backscattering, pond surfaces, sea surfaces, sensor configuration, specular behavior, steep radar incident angle, synthetic aperture radar, water surface radar signal retrodiffusion, wet surfaces, Ka band, altimetry mode, radar sensor)",en,1,486,483.0,,List(53e9afbab7602d9703a0a47a),Radar Signal Retrodiffusion by Water Surface,"List(http://dx.doi.org/10.1109/IGARSS.2009.5418122, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5418122)","List(53a7297c20f7420be8bd4ae5, null, null, International Geoscience and Remote Sensing Symposium, null, null, null, IGARSS, null, null, null, null, 0)",2,2009,6
53e99960b7602d97021a543e,,"List(List(53f43ff8dabfaec09f1b83f7, null, null, null, John Kar-Kin Zao, null, null, null, null, null, null, null, null, null, null), List(53f43a10dabfaee1c0abf341, null, null, null, Shih-Chen Fan, null, null, null, null, null, null, null, null, null, null), List(53f44bb5dabfaeecd69c216b, null, null, null, Bing-Shiang Yang, null, null, null, null, null, null, null, null, null, null), List(53f43418dabfaee4dc76c911, null, null, null, Shang Hwa Hsu, null, null, null, null, null, null, null, null, null, null), List(53f44879dabfaeee229ffd25, null, null, null, Han-Chin Cheng, null, null, null, null, null, null, null, null, null, null), List(53f44240dabfaedd74de06c4, null, null, null, Ming-Lun Liu, null, null, null, null, null, null, null, null, null, null), List(53f43b03dabfaedd74dcd1dd, null, null, null, Howard C. Huang, null, null, null, null, null, null, null, null, null, null), List(53f437c6dabfaeb2ac05fb63, null, null, null, Fan-Yi Teng, null, null, null, null, null, null, null, null, null, null))",10.1109/ICSMC.2008.4811630,"List(Mobile computing, Smart environment, Telecommunications, Computer science, Computer security, Universal Plug and Play, Sun SPOT, Mobile phone, Web service, Wireless sensor network, The Internet)",,,,"List(Java, Web services, biomedical telemetry, health care, mobile computing, sensor fusion, telemedicine, wireless sensor networks, Custos, ICT technique, Internet, Squawk Java J2ME, Sun SPOT, UPnP digital-home platform, Web service, Wei-Gong memorial hospital, clinical trial, geriatric psychiatry ward, mobile phone, multisensor data fusion, remote on-demand healthcare, small programmable object technology, smart phone, wireless sensor network, Smart Environments, Telemedicine, Telemonitoring, Wireless Sensor Networks)",en,5,2269,2264.0,,"List(53e9b2c6b7602d9703d6f600, 53e9b295b7602d9703d3dd5b, 53e9bb1cb7602d9704757f2a, 53e99866b7602d970209e5a2, 53e9b1c9b7602d9703c58a18)",Custos Remote on-demand healthcare aided with wireless sensors and mobile phones.,"List(http://dx.doi.org/10.1109/ICSMC.2008.4811630, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=4811630)","List(555037277cea80f954176d3b, null, null, Systems, Man and Cybernetics, null, null, null, SMC, null, null, null, null, 0)",,2008,11
53e99967b7602d97021a566b,,"List(List(53f7e7d1dabfae92b40f1791, null, null, null, Bo Xiong, null, null, null, null, null, null, null, null, null, null), List(5602421f45cedb3395f4c7fa, null, null, null, Peter B. Luh, null, null, null, null, null, null, null, null, null, null), List(5406ae55dabfae44f084e890, null, null, null, Shi-chung Chang, null, null, null, null, null, null, null, null, null, null))",10.1109/ROBOT.2005.1570314,"List(Dynamic programming, Emergency evacuation, Scheduling (computing), Operations research, Control engineering, Elevator, Lagrangian relaxation, Local search (optimization), Engineering, Wireless sensor network, State space)",,,,"List(information technology, simulation, transportation, objective function, dynamic programming, state space, sensor network, elevators, uncertainty, testing, normal operator, look ahead, lagrangian relaxation, local search)",en,13,1424,1419.0,,"List(53e9abbfb7602d970356f65f, 53e9ace9b7602d97036c9628, 53e9b3efb7602d9703ee0001, 56d81daadabfae2eeea5d427, 53e9ac4eb7602d970361ffb2, 53e9ab9eb7602d970354b3be, 53e9b895b7602d9704469a31)",Group Elevator Scheduling with Advanced Traffic Information for Normal Operations and Coordinated Emergency Evacuation,List(http://dx.doi.org/10.1109/ROBOT.2005.1570314),"List(53a726bb20f7420be8b7f846, null, null, International Conference on Robotics and Automation, null, null, null, ICRA, null, null, null, null, 0)",,2005,14
53e99960b7602d97021a5451,"This paper presents a new visual feature representation method called the weighted census transform (WCT) based on modified census transform (MCT) and entropy information of training dataset. The proposed feature representation model can offer robustness to represent the same visual images such as MCT feature and sensitivity to effectively classify different visual images. In order to enhance the sensitivity of MCT feature, we designed the different weights for each MCT feature as binary code bit by statistical approach with the training dataset. In order to compare the proposed feature with MCT feature, we fixed classification method such as compressive sensing technique for two features. Experimental results shows that proposed WCT features have better classification performance than traditional MCT features for AR face datasets.","List(List(53f437e5dabfaeecd696c9a8, null, null, null, Sungmoon Jeong, null, null, null, null, null, null, null, null, null, null), List(53f43518dabfaec09f171b9e, null, null, null, Hosun Lee, null, null, null, null, null, null, null, null, null, null), List(53f43c17dabfaefedbafc568, null, null, null, Younes El Hamdi, null, null, null, null, null, null, null, null, null, null), List(54329e04dabfaeb542160fda, null, null, null, Nak Young Chong, null, null, null, null, null, null, null, null, null, null))",10.1109/URAI.2013.6677409,"List(Facial recognition system, Pattern recognition, Computer science, Feature (computer vision), Binary code, Robustness (computer science), Census transform, Artificial intelligence, Contextual image classification, Entropy (information theory), Compressed sensing)",978-1-4799-1195-0,,,"List(weighted census transform, visual feature representation method, image representation, binary code bit, face recognition, ar face datasets, modified census transform, wct, statistical analysis, feature representation model, visual image classification, feature representation, training dataset, compressed sensing, image classification, statistical approach, mct, entropy information, transforms, entropy, compressive sensing technique, pattern classification)",en,2,628,627.0,,"List(53e9b968b7602d9704553276, 53e9b895b7602d970446c94b, 53e9a26ab7602d9702b6fc46)",Weighted census transform for feature representation,"List(http://dx.doi.org/10.1109/URAI.2013.6677409, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=6677409)","List(555037017cea80f95416d334, null, null, International Conference on Ubiquitous Robots and Ambient Intelligence, null, null, null, URAI, null, null, null, null, 0)",,2013,6
53e99967b7602d97021a5814,"This paper presents a signaling and control architecture for mobility support in a “wireless ATM” network that provides integrated broadband services to mobile terminals. A system level protocol architecture for a wireless ATM network is outlined. The proposed protocol stack incorporates new wireless link MAC, DLC and wireless control sublayers, together with appropriate mobility extensions to the existing ATM network control layer. Wireless control and ATM signaling capabilities required for mobility support are discussed, and preliminary solutions are given for selected major functions. Potential extensions to standard Q.2931 ATM signaling are proposed to support handoff and service parameter/QoS renegotiation required for mobility. An associated wireless control protocol for supporting terminal migration, resource allocation, and handoff is discussed. Preliminary experimental results are given which validate the proposed handoff control protocol on an ATM network testbed.","List(List(53f45579dabfaeee22a30d2c, null, null, 5b86a6d6e1cd8e14a3dfe66e, R. Yuan, null, null, null, null, NEC USA, Princeton, NJ, null, 5f71b54b1c455f439fe502c1, null, null, null), List(null, null, null, 5b86a6d6e1cd8e14a3dfe66e, S. K. Biswas, null, null, null, null, NEC USA, Princeton, NJ, null, 5f71b54b1c455f439fe502c1, null, null, null), List(null, null, null, 5b86a6d6e1cd8e14a3dfe66e, L. J. French, null, null, null, null, NEC USA, Princeton, NJ, null, 5f71b54b1c455f439fe502c1, null, null, null), List(5433cbf4dabfaeb4c6acf7f9, null, null, 5b86a6d6e1cd8e14a3dfe66e, J. Li, null, null, null, null, NEC USA, Princeton, NJ, null, 5f71b54b1c455f439fe502c1, null, null, null), List(54088a10dabfae8faa64bdcc, null, null, 5b86a6d6e1cd8e14a3dfe66e, D. Raychaudhuri, null, null, null, null, NEC USA, Princeton, NJ, null, 5f71b54b1c455f439fe502c1, null, null, null))",10.1007/BF01193262,"List(Wireless network, Computer science, Computer network, Quality of service, Wireless WAN, Asynchronous Transfer Mode, ATM adaptation layer, Wi-Fi array, Protocol stack, Handover)",,,3,"List(Wireless Link, Mobile Terminal, Control Protocol, Control Architecture, Control Layer)",en,134,298,287.0,//static.aminer.org/pdf/PDF/001/012/950/a_signaling_and_control_architecture_for_mobility_support_in_wireless.pdf,"List(53e9adffb7602d970380703a, 557f7143d19faf961d1712f4, 557f7143d19faf961d1712f5)",A signaling and control architecture for mobility support in wireless ATM networks,"List(http://dx.doi.org/10.1007/BF01193262, http://dx.doi.org/10.1109/ICC.1996.542243, http://dx.doi.org/https://doi.org/10.1007/BF01193262, https://link.springer.com/article/10.1007/BF01193262)","List(53a730a020f7420be8cff2d8, null, null, International Conference on Communications, null, null, null, MONET, null, null, null, null, 0)",1,1996,12


## Authors

In [0]:
def get_author_from_dblp(title, rank):
    URL = "http://dblp.org/search/publ/api?q=" + title.replace(" ", "+") + "&format=json"
    try:
        r = requests.get(url = URL)
        data = r.json()
        return data['result']['hits']['hit'][0]['info']['authors']['author'][rank-1]['text']
    except:
        return None

In [0]:
authors = (filtered_df.limit(1000).select("doi", "authors", "title", F.posexplode(F.col("authors")).alias("rank", "authors_exp"))
            .withColumn("rank", F.col("rank") + 1)
            .select("authors_exp.*","*")
            .select("doi", "rank", "name", "title")
            .withColumn("name", F.initcap(F.col("name")))
            )

authors2 = authors.filter(F.col("name").rlike(r"^\p{L}\.?(-\p{L}\.?)?\s.+"))
authors_rdd = authors2.rdd.map(lambda x: (x[0], x[1], x[2], x[3], get_author_from_dblp(x[3], x[1])))
authors2 = authors_rdd.toDF(["doi", "rank", "name", "title", "dblp_name"])
    
authors2 = (authors2.withColumn("dblp_name", F.when(F.col("dblp_name") == "null", F.col("name")).otherwise(F.col("dblp_name")))
           .withColumn("dblp_name", F.regexp_extract(F.col("dblp_name"), r"^(\D+)(\s\d\d\d\d)?$", 1)))

authors_raw = (authors.withColumn("dblp_name", F.col("name"))
           .union(authors2)
           .withColumn("last_name_raw", F.regexp_extract(F.col("name"), r"^.+\s(\S+)(\sJr\.)?$", 1))
           .withColumn("dblp_name", F.when(F.col("dblp_name") == "null", F.col("name")).otherwise(F.col("dblp_name")))
           .withColumn("dblp_name", F.regexp_extract(F.col("dblp_name"), r"^(\D+)(\s\d\d\d\d)?$", 1))
           .withColumn("dblp_last_name", F.regexp_extract(F.col("dblp_name"), r"^.+\s(\S+)(\sJr\.)?$", 1))
           .withColumn("name", F.when(F.col("last_name_raw") == F.col("dblp_last_name"), 
                                      F.col("dblp_name")).otherwise(F.col("name")))
           .withColumn("first_name", F.regexp_extract(F.col("name"), r"^(\S+)\s.+$", 1))
           .withColumn("last_name", F.regexp_extract(F.col("name"), r"^.+\s(\S+)$", 1))
           .withColumn("middle_name", F.regexp_extract(F.col("name"), r"^\S+\s(\S+)\s\S+$", 1))
          )

authors = (authors_raw.select("first_name", "last_name", "middle_name")
            .dropDuplicates()
            .withColumn("author_id", F.monotonically_increasing_id())
          )

authors = (authors_raw.join(authors, ["first_name", "last_name", "middle_name"])
           .select("author_id", "first_name", "last_name", "middle_name", "title", F.col("name").alias("author"), "rank")
           .dropDuplicates()
          )

display(authors)

authors_clean = authors.select("author_id", "first_name", "last_name", "middle_name")
authors_clean.write.format("delta").mode("overwrite").saveAsTable("authors")

author_id,first_name,last_name,middle_name,title,author,rank
0,Andreas,Herzig,,Dynamic Logic of Propositional Assignments: A Well-Behaved Variant of PDL,Andreas Herzig,2
1,Fu-lai,Chung,,Stock time series pattern matching: Template-based vs. rule-based approaches,Fu-lai Chung,2
2,Roberto,Barbuti,,Compositional semantics of spiking neural P systems,Roberto Barbuti,1
2,Roberto,Barbuti,,LORETO: a tool for reducing state explosion in verification of LOTOS programs,Roberto Barbuti,1
3,Ee-peng,Lim,,An Overview of the Agent-Based Electronic Commerce System (ABECOS) Project,Ee-peng Lim,1
4,Chae-woo,Yoo,,Connect with things through instant messaging,Chae-woo Yoo,2
5,Wencheng,Wang,,A Multi-focus Image Fusion Method Based on Laplacian Pyramid.,Wencheng Wang,1
6,Wanda,Pratt,,Brainstorming Design for Health: Helping Patients Utilize Patient-Generated Information on the Web.,Wanda Pratt,12
7,Naoki,Nakashima,,Applicability of Portable Health Clinic to Ageing Society.,Naoki Nakashima,5
8,Zena,Ariola,M.,From Syntactic Theories to Interpreters: Automating the Proof of Unique Decomposition,Zena M. Ariola,3


## Organizations

In [0]:
def get_organization(name, country):
    try:
        URL = "https://api.ror.org/organizations?query=" + name.replace(" ", "+")
        r = requests.get(url = URL)
        data = r.json()
        if data['number_of_results'] == 0:
            return "No results"
        for i in range(data['number_of_results']):
            found_country = data['items'][i]["country"]["country_name"]
            if country == "" or country == found_country:
                return data['items'][i]["name"]+";"+data['items'][i]["addresses"][0]["city"]+";"+found_country
    except:
        return "No results"

In [0]:
countries = "(,|\s)(Afghanistan|Albania|Algeria|Andorra|Angola|Antigua and Barbuda|Argentina|Armenia|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bhutan|Bolivia|Bosnia and Herzegovina|Botswana|Brazil|Brunei|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Central African Republic|Chad|Chile|China|Colombia|Comoros|Democratic Republic of the Congo|Republic of the Congo|Costa Rica|Cote d'Ivoire|Croatia|Cuba|Cyprus|Czech Republic|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Fiji|Finland|France|Gabon|Gambia|Georgia|Germany|Ghana|Greece|Grenada|Guatemala|Guinea|Guinea-Bissau|Guyana|Haiti|Honduras|Hungary|Iceland|India|Indonesia|Iran|Iraq|Ireland|Israel|Italy|Jamaica|Japan|Jordan|Kazakhstan|Kenya|Kiribati|Kosovo|Kuwait|Kyrgyzstan|Laos|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|North Macedonia|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Mauritania|Mauritius|Mexico|Micronesia|Moldova|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Zealand|Nicaragua|Niger|Nigeria|North Korea|Norway|Oman|Pakistan|Palau|Palestine|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Poland|Portugal|Qatar|Romania|Russia|Rwanda|Saint Kitts and Nevis|Saint Lucia|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Korea|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Swaziland|Sweden|Switzerland|Syria|Taiwan|Tajikistan|Tanzania|Thailand|Timor-Leste|Togo|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Tuvalu|Uganda|Ukraine|United Arab Emirates|UAE|United Kingdom|UK|United States of America|USA|United States|US|Uruguay|Uzbekistan|Vanuatu|Vatican City|Venezuela|Vietnam|Yemen|Zambia|Zimbabwe)(,|\s|$)"


organization = (filtered_df.limit(1000).select("authors", "title", F.explode(F.col("authors")).alias("authors_exp"))
               .select("authors_exp.*","*")
               .withColumn("author", F.col("name"))
               .select("author", "org")
               .filter(F.col("org").isNotNull())
               .dropDuplicates()
                # ., +, *, ?, ^, $, (, ), [, ], {, }, |, \
               .withColumn("strip_org", F.regexp_replace(F.col("org"), r'[\+-=#&\|><!\(\)\{\}\[\]\^"~\*\?:\\/]', " "))
               .withColumn("county", F.regexp_extract(F.col("org"), countries, 2))
               .withColumn("county", F.regexp_replace("county", "United States of America|USA|US", "United States"))
               .withColumn("county", F.regexp_replace("county", "UK", "United Kingdom"))
               .withColumn("county", F.regexp_replace("county", "UAE", "United Arab Emirates"))
               )

organization_rdd = organization.rdd.map(lambda x: (x[0], x[1], x[2], x[3], get_organization(x[2], x[3])))
organization_raw = (organization_rdd.toDF(["author", "org", "strip_org", "country", "api_org"])
                    .withColumn("api_name", F.regexp_extract(F.col("api_org"), r"^(.+);.+;.+$", 1))
                    .withColumn("match", F.expr(r"regexp_extract(org, concat('(', api_name,')'), 0)"))
                    .withColumn("name", F.when(F.col("match") == "", F.col("org"))
                               .otherwise(F.col("api_name")))
                    .withColumn("city", F.when(F.col("match") == "", "")
                               .otherwise(F.regexp_extract(F.col("api_org"), r"^.+;(.+);.+$", 1)))
                    .withColumn("country", F.when(F.col("match") == "", F.col("country"))
                               .otherwise(F.regexp_extract(F.col("api_org"), r"^.+;.+;(.+)$", 1)))
                   )

organization = (organization_raw.select("name", "city", "country")
                .dropDuplicates()
                .withColumn("org_id", F.monotonically_increasing_id())
               )

organization = (organization_raw.join(organization, ["name", "city", "country"])
                .select("org_id", "name", "city", "country", "author"))

display(organization)

organization_clean = organization.select("org_id", "name", "city", "country")
organization_clean.write.format("delta").mode("overwrite").saveAsTable("organization")

org_id,name,city,country,author
0,Vrije Universiteit Amsterdam,Amsterdam,Netherlands,Henri E. Bal
1,Tel.: +44 7786 317995.,,,Nigel Jones
2,"Advanced Broadcast Systems Development, Division, NEC 10, Nisshin-cho 1-chome, Fuchu, Tokyo 183-6501, Japan",,Japan,Kazutoshi Maeno
3,"Univ Wisconsin, Madison, WI USA",,United States,Kathy A. Johnson
3,"Univ Wisconsin, Madison, WI USA",,United States,Patricia Flatley Brennan
4,National Institute of Mental Health,Bethesda,United States,Brooke A Babineau
4,National Institute of Mental Health,Bethesda,United States,Jacqueline N Crawley
5,university of pavia,,,Elena Molho
6,"ST Microelectronics, Cornaredo",,,Francesco Papariello
6,"ST Microelectronics, Cornaredo",,,Giuseppe Desoli


## Publications

In [0]:
publication_raw = filtered_df.limit(1000).select("_id", "title", "volume", "n_citation", "doi", "url")

display(publication_raw)

_id,title,volume,n_citation,doi,url
53e99967b7602d97021a560a,Consistent Sets of Secondary Structures in Proteins,53,2.0,10.1007/s00453-007-9068-8,"List(http://dx.doi.org/10.1007/s00453-007-9068-8, http://dx.doi.org/https://doi.org/10.1007/s00453-007-9068-8, https://link.springer.com/article/10.1007/s00453-007-9068-8, https://dl.acm.org/citation.cfm?id=3118965&picked=prox&preflayout=flat, http://www.webofknowledge.com/)"
53e99960b7602d97021a53fa,A new feature extraction technique for on-line recognition of handwritten alphanumeric characters,148,19.0,10.1016/S0020-0255(02)00276-1,"List(http://dx.doi.org/10.1016/S0020-0255(02)00276-1, https://www.sciencedirect.com/science/article/pii/S0020025502002761)"
53e99960b7602d97021a5416,King John Divided,19,0.0,10.1093/llc/19.2.181,List(http://dx.doi.org/10.1093/llc/19.2.181)
53e99960b7602d97021a541f,Peer Assist Live Streaming Overlay for Next-Generation-Networks,1,0.0,10.4018/jhcr.2010100102,"List(http://dx.doi.org/10.4018/jhcr.2010100102, http://dx.doi.org/10.4018/978-1-4666-0921-1.ch017)"
53e99960b7602d97021a5427,Paramecium: Assembling Raw Nodes into Composite Cells,3222,2.0,10.1007/978-3-540-30141-7_67,"List(http://dx.doi.org/10.1007/978-3-540-30141-7_67, http://www.webofknowledge.com/)"
53e99967b7602d97021a5645,Radar Signal Retrodiffusion by Water Surface,2,1.0,10.1109/IGARSS.2009.5418122,"List(http://dx.doi.org/10.1109/IGARSS.2009.5418122, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5418122)"
53e99960b7602d97021a543e,Custos Remote on-demand healthcare aided with wireless sensors and mobile phones.,,5.0,10.1109/ICSMC.2008.4811630,"List(http://dx.doi.org/10.1109/ICSMC.2008.4811630, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=4811630)"
53e99967b7602d97021a566b,Group Elevator Scheduling with Advanced Traffic Information for Normal Operations and Coordinated Emergency Evacuation,,13.0,10.1109/ROBOT.2005.1570314,List(http://dx.doi.org/10.1109/ROBOT.2005.1570314)
53e99960b7602d97021a5451,Weighted census transform for feature representation,,2.0,10.1109/URAI.2013.6677409,"List(http://dx.doi.org/10.1109/URAI.2013.6677409, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=6677409)"
53e99967b7602d97021a5814,A signaling and control architecture for mobility support in wireless ATM networks,1,134.0,10.1007/BF01193262,"List(http://dx.doi.org/10.1007/BF01193262, http://dx.doi.org/10.1109/ICC.1996.542243, http://dx.doi.org/https://doi.org/10.1007/BF01193262, https://link.springer.com/article/10.1007/BF01193262)"


In [0]:
def check_return_data(check_type, data, cur):
    if data is not None and check_type in data:
        return data[check_type]
    else:
        return cur
    
def update_df(df, doi_list, data_list, data_str, is_int=False):
    data_dict = dict(zip(doi_list, data_list))
    update_data = udf(lambda x: data_dict[x], IntegerType() if is_int else StringType())
    return df.withColumn(data_str, update_data(F.col('doi')))

def get_publication_data(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    url_list = df.select(F.col("url")).rdd.flatMap(lambda x: x).collect()
    title_list = df.select(F.col("title")).rdd.flatMap(lambda x: x).collect()
    citation_list = df.select(F.col("n_citation")).rdd.flatMap(lambda x: x).collect()
    volume_list = df.select(F.col("volume")).rdd.flatMap(lambda x: x).collect()
    new_volume_list = []
    n_citation_list = []
    series_list = []
    new_doi_list = []
    for i, doi in enumerate(doi_list):
        try:
            if doi == "" or doi is None:
                if "doi" in url_list[i][0]:
                    doi_req = url_list[i][0].split("org/")[-1]
                else:
                    raise Exception
            else:
                doi_req = doi
            
            new_doi_list.append(doi_req)
            response = requests.get(f"https://api.crossref.org/works/{doi_req}")
            data = response.json()['message']
        except Exception:
            new_doi_list.append(None)
            data = None

        n_citation_list.append(int(check_return_data('is-referenced-by-count', data, citation_list[i])))
        new_volume_list.append(check_return_data('volume', data, volume_list[i]))
        temp = check_return_data('container-title', data, None)
        series_list.append(temp if temp == None or len(temp) < 1 else temp[0])

        time.sleep(0.05)
    
    df = update_df(df, doi_list, n_citation_list, 'n_citation')
    df = update_df(df, doi_list, new_volume_list, 'volume')
    df = update_df(df, doi_list, series_list, 'series')
    df = update_df(df, doi_list, new_doi_list, 'doi')
    
    return df

In [0]:
publications_raw = get_publication_data(publication_raw).filter("n_citation > 2")
# publication = get_publication_dataget_crossref_data(publication_raw).select("_id", "title", "volume", "series", "n_citation").filter("n_citation > 2")

publications = (publications_raw.select("title", "volume", "series", "n_citation")
            .dropDuplicates()
            .withColumn("publ_id", F.monotonically_increasing_id())
          )

publications = (publications_raw.join(publications, ["title", "volume", "series", "n_citation"])
           .select("publ_id", "title", "volume", "series", "n_citation")
           .dropDuplicates()
          )

display(publications)
publications.write.format("delta").mode("overwrite").saveAsTable("publications")

publ_id,title,volume,series,n_citation
322,THE BAYESIAN ABEL BOUND ON THE MEAN SQUARE ERROR,,2006 IEEE International Conference on Acoustics Speed and Signal Processing Proceedings,13
192,MEBRS: a multiagent architecture for an experience based reasoning system,3681,Lecture Notes in Computer Science,4
497,Variable tapered pareto buffer design and implementation allowing run-time configuration for low-power embedded SRAMs,13,IEEE Transactions on Very Large Scale Integration (VLSI) Systems,20
556,An Object Observation for a Java Adaptative Distributed Application Platform,,Proceedings. International Conference on Parallel Computing in Electrical Engineering,12
138,Nash-Cournot Equilibria in Electric Power Markets with Piecewise Linear Demand Functions and Joint Constraints,55,Operations Research,136
359,Measurement and modeling of the origins of starvation of congestion-controlled flows in wireless mesh networks,17,IEEE/ACM Transactions on Networking,26
681,Intelligent preemption in construction of a manmade island for an airport,,"Proceedings of the Winter Simulation Conference, 2005.",3
1,On Necessary and Sufficient Conditions for Private Ballot Submission.,184,Computer Physics Communications,9
81,Recursive square-root ladder estimation algorithms.,5,"ICASSP '80. IEEE International Conference on Acoustics, Speech, and Signal Processing",40
98,Learning control of a parallel-link direct-drive robot manipulator,5,Robotics and Autonomous Systems,5


## Types

In [0]:
types_raw = filtered_df.limit(1000).select("_id", "volume", "issue", "doi", "url", "venue", "title")

display(types_raw)

_id,volume,issue,doi,url,venue,title
53e99967b7602d97021a560a,53,1,10.1007/s00453-007-9068-8,"List(http://dx.doi.org/10.1007/s00453-007-9068-8, http://dx.doi.org/https://doi.org/10.1007/s00453-007-9068-8, https://link.springer.com/article/10.1007/s00453-007-9068-8, https://dl.acm.org/citation.cfm?id=3118965&picked=prox&preflayout=flat, http://www.webofknowledge.com/)","List(539078f220f770854f5a882f, null, null, null, null, null, null, Algorithmica, null, null, null, null, 0)",Consistent Sets of Secondary Structures in Proteins
53e99960b7602d97021a53fa,148,1-4,10.1016/S0020-0255(02)00276-1,"List(http://dx.doi.org/10.1016/S0020-0255(02)00276-1, https://www.sciencedirect.com/science/article/pii/S0020025502002761)","List(555036b77cea80f95414b7e3, null, null, null, null, null, null, Inf. Sci., null, null, null, null, 0)",A new feature extraction technique for on-line recognition of handwritten alphanumeric characters
53e99960b7602d97021a5416,19,2,10.1093/llc/19.2.181,List(http://dx.doi.org/10.1093/llc/19.2.181),"List(555036d37cea80f95415b0b3, null, null, null, null, null, null, LLC, null, null, null, null, 0)",King John Divided
53e99960b7602d97021a541f,1,4,10.4018/jhcr.2010100102,"List(http://dx.doi.org/10.4018/jhcr.2010100102, http://dx.doi.org/10.4018/978-1-4666-0921-1.ch017)","List(555036cc7cea80f954158149, null, null, International Journal of Handheld Computing Research, null, null, null, IJHCR, null, null, null, null, 0)",Peer Assist Live Streaming Overlay for Next-Generation-Networks
53e99960b7602d97021a5427,3222,,10.1007/978-3-540-30141-7_67,"List(http://dx.doi.org/10.1007/978-3-540-30141-7_67, http://www.webofknowledge.com/)","List(null, 0302-9743, null, null, null, 1611-3349, null, LECTURE NOTES IN COMPUTER SCIENCE, null, LECTURE NOTES IN COMPUTER SCIENCE, null, J, null)",Paramecium: Assembling Raw Nodes into Composite Cells
53e99967b7602d97021a5645,2,,10.1109/IGARSS.2009.5418122,"List(http://dx.doi.org/10.1109/IGARSS.2009.5418122, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5418122)","List(53a7297c20f7420be8bd4ae5, null, null, International Geoscience and Remote Sensing Symposium, null, null, null, IGARSS, null, null, null, null, 0)",Radar Signal Retrodiffusion by Water Surface
53e99960b7602d97021a543e,,,10.1109/ICSMC.2008.4811630,"List(http://dx.doi.org/10.1109/ICSMC.2008.4811630, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=4811630)","List(555037277cea80f954176d3b, null, null, Systems, Man and Cybernetics, null, null, null, SMC, null, null, null, null, 0)",Custos Remote on-demand healthcare aided with wireless sensors and mobile phones.
53e99967b7602d97021a566b,,,10.1109/ROBOT.2005.1570314,List(http://dx.doi.org/10.1109/ROBOT.2005.1570314),"List(53a726bb20f7420be8b7f846, null, null, International Conference on Robotics and Automation, null, null, null, ICRA, null, null, null, null, 0)",Group Elevator Scheduling with Advanced Traffic Information for Normal Operations and Coordinated Emergency Evacuation
53e99960b7602d97021a5451,,,10.1109/URAI.2013.6677409,"List(http://dx.doi.org/10.1109/URAI.2013.6677409, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=6677409)","List(555037017cea80f95416d334, null, null, International Conference on Ubiquitous Robots and Ambient Intelligence, null, null, null, URAI, null, null, null, null, 0)",Weighted census transform for feature representation
53e99967b7602d97021a5814,1,3,10.1007/BF01193262,"List(http://dx.doi.org/10.1007/BF01193262, http://dx.doi.org/10.1109/ICC.1996.542243, http://dx.doi.org/https://doi.org/10.1007/BF01193262, https://link.springer.com/article/10.1007/BF01193262)","List(53a730a020f7420be8cff2d8, null, null, International Conference on Communications, null, null, null, MONET, null, null, null, null, 0)",A signaling and control architecture for mobility support in wireless ATM networks


In [0]:
types_pre_api = types_raw.withColumn("type", F.when(F.col("venue.raw").contains("@"), "workshop")
                   .when(((F.col("volume").isNotNull()) & (F.col("volume") != "")) | ((F.col("issue").isNotNull()) & (F.col("issue") != "")), "journal-article")
                   .otherwise("conference paper"))

display(types_pre_api)

_id,volume,issue,doi,url,venue,title,type
53e99967b7602d97021a560a,53,1,10.1007/s00453-007-9068-8,"List(http://dx.doi.org/10.1007/s00453-007-9068-8, http://dx.doi.org/https://doi.org/10.1007/s00453-007-9068-8, https://link.springer.com/article/10.1007/s00453-007-9068-8, https://dl.acm.org/citation.cfm?id=3118965&picked=prox&preflayout=flat, http://www.webofknowledge.com/)","List(539078f220f770854f5a882f, null, null, null, null, null, null, Algorithmica, null, null, null, null, 0)",Consistent Sets of Secondary Structures in Proteins,journal-article
53e99960b7602d97021a53fa,148,1-4,10.1016/S0020-0255(02)00276-1,"List(http://dx.doi.org/10.1016/S0020-0255(02)00276-1, https://www.sciencedirect.com/science/article/pii/S0020025502002761)","List(555036b77cea80f95414b7e3, null, null, null, null, null, null, Inf. Sci., null, null, null, null, 0)",A new feature extraction technique for on-line recognition of handwritten alphanumeric characters,journal-article
53e99960b7602d97021a5416,19,2,10.1093/llc/19.2.181,List(http://dx.doi.org/10.1093/llc/19.2.181),"List(555036d37cea80f95415b0b3, null, null, null, null, null, null, LLC, null, null, null, null, 0)",King John Divided,journal-article
53e99960b7602d97021a541f,1,4,10.4018/jhcr.2010100102,"List(http://dx.doi.org/10.4018/jhcr.2010100102, http://dx.doi.org/10.4018/978-1-4666-0921-1.ch017)","List(555036cc7cea80f954158149, null, null, International Journal of Handheld Computing Research, null, null, null, IJHCR, null, null, null, null, 0)",Peer Assist Live Streaming Overlay for Next-Generation-Networks,journal-article
53e99960b7602d97021a5427,3222,,10.1007/978-3-540-30141-7_67,"List(http://dx.doi.org/10.1007/978-3-540-30141-7_67, http://www.webofknowledge.com/)","List(null, 0302-9743, null, null, null, 1611-3349, null, LECTURE NOTES IN COMPUTER SCIENCE, null, LECTURE NOTES IN COMPUTER SCIENCE, null, J, null)",Paramecium: Assembling Raw Nodes into Composite Cells,journal-article
53e99967b7602d97021a5645,2,,10.1109/IGARSS.2009.5418122,"List(http://dx.doi.org/10.1109/IGARSS.2009.5418122, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5418122)","List(53a7297c20f7420be8bd4ae5, null, null, International Geoscience and Remote Sensing Symposium, null, null, null, IGARSS, null, null, null, null, 0)",Radar Signal Retrodiffusion by Water Surface,journal-article
53e99960b7602d97021a543e,,,10.1109/ICSMC.2008.4811630,"List(http://dx.doi.org/10.1109/ICSMC.2008.4811630, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=4811630)","List(555037277cea80f954176d3b, null, null, Systems, Man and Cybernetics, null, null, null, SMC, null, null, null, null, 0)",Custos Remote on-demand healthcare aided with wireless sensors and mobile phones.,conference paper
53e99967b7602d97021a566b,,,10.1109/ROBOT.2005.1570314,List(http://dx.doi.org/10.1109/ROBOT.2005.1570314),"List(53a726bb20f7420be8b7f846, null, null, International Conference on Robotics and Automation, null, null, null, ICRA, null, null, null, null, 0)",Group Elevator Scheduling with Advanced Traffic Information for Normal Operations and Coordinated Emergency Evacuation,conference paper
53e99960b7602d97021a5451,,,10.1109/URAI.2013.6677409,"List(http://dx.doi.org/10.1109/URAI.2013.6677409, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=6677409)","List(555037017cea80f95416d334, null, null, International Conference on Ubiquitous Robots and Ambient Intelligence, null, null, null, URAI, null, null, null, null, 0)",Weighted census transform for feature representation,conference paper
53e99967b7602d97021a5814,1,3,10.1007/BF01193262,"List(http://dx.doi.org/10.1007/BF01193262, http://dx.doi.org/10.1109/ICC.1996.542243, http://dx.doi.org/https://doi.org/10.1007/BF01193262, https://link.springer.com/article/10.1007/BF01193262)","List(53a730a020f7420be8cff2d8, null, null, International Conference on Communications, null, null, null, MONET, null, null, null, null, 0)",A signaling and control architecture for mobility support in wireless ATM networks,journal-article


In [0]:
def check_return_data(check_type, data, cur):
    if data is not None and check_type in data:
        return data[check_type]
    else:
        return cur
    
def update_df(df, doi_list, data_list, data_str, is_int=False):
    new_df = spark.createDataFrame(zip(df.select("doi").rdd.flatMap(lambda x: x).collect(), data_list), ["doi", data_str])
    return df.join(new_df, on="doi", how="left").select(new_df["doi"], new_df[data_str], F.col("title"))

def get_type_data(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    url_list = df.select(F.col("url")).rdd.flatMap(lambda x: x).collect()
    type_list = df.select(F.col("type")).rdd.flatMap(lambda x: x).collect()
    new_type_list = []
    for i, doi in enumerate(doi_list):
        try:
            if doi == "" or doi is None:
                if "doi" in url_list[i][0]:
                    doi_req = url_list[i][0].split("org/")[-1]
                else:
                    raise Exception
            else:
                doi_req = doi
            
            response = requests.get(f"https://api.crossref.org/works/{doi_req}")
            data = response.json()['message']
        except Exception:
            data = None

        new_type_list.append(check_return_data('type', data, type_list[i]))

        time.sleep(0.05)

    df = update_df(df, doi_list, new_type_list, 'type')
    
    return df

In [0]:
types_raw = get_type_data(types_pre_api)

types = (types_raw.select("type")
            .dropDuplicates()
            .withColumn("type_id", F.monotonically_increasing_id())
          )

types = (types_raw.join(types, ["type"])
           .select("type_id", "type", "title")
           .dropDuplicates()
        )

display(types)

types_clean = types.select("type_id", "type")
types_clean.write.format("delta").mode("overwrite").saveAsTable("types")

type_id,type,title
6,book,"Logic for Programming, Artificial Intelligence, and Reasoning, 9th International Conference, LPAR 2002, Tbilisi, Georgia, October 14-18, 2002, Proceedings"
6,book,eCall: Automatic notification of a road traffic accident.
6,book,"Buffer Overflows und Format-String-Schwachstellen - Funktionsweisen, Exploits und Gegenmaßnahmen"
6,book,Current-Reused QVCO Based on Source-Connection Coupling.
6,book,Effective information sharing using update logs
6,book,Security Issues in Health Care Process Integration ? a Research-in-Progress Report
6,book,A Social Cognitive View of Technical Support and its Influence on User Learning.
6,book,Eine integrierte Softwarelösung zur konsolidierten Rechnungslegung
6,book,How to Select Significant Workloads in Performance Models
6,book,Selection of the most effective set of subword units for an HMM-based speech recognition system


## Keywords

In [0]:
keywords_raw = filtered_df.limit(1000).select("_id", "keywords", "doi", "title")

display(keywords_raw)

_id,keywords,doi,title
53e99967b7602d97021a560a,"List(Secondary Structure, Maximum Weight, Local Prediction, Horizontal Edge, Connector Edge)",10.1007/s00453-007-9068-8,Consistent Sets of Secondary Structures in Proteins
53e99960b7602d97021a53fa,"List(adaptive on-line use, data processing system, feature extraction, inputting data, on-line recognition, handwritten alphanumeric character, feature set, new feature extraction technique, extracted feature, adaptive recognizer, new method, high generalization capability, high recognition rate, feed forward neural network, error back propagation, simulation experiment, data processing, artificial neural network)",10.1016/S0020-0255(02)00276-1,A new feature extraction technique for on-line recognition of handwritten alphanumeric characters
53e99960b7602d97021a5416,List(),10.1093/llc/19.2.181,King John Divided
53e99960b7602d97021a541f,"List(scalability, ims, p2p, ngn, next generation network)",10.4018/jhcr.2010100102,Peer Assist Live Streaming Overlay for Next-Generation-Networks
53e99960b7602d97021a5427,List(),10.1007/978-3-540-30141-7_67,Paramecium: Assembling Raw Nodes into Composite Cells
53e99967b7602d97021a5645,"List(airborne radar, backscatter, height measurement, oceanographic regions, oceanographic techniques, remote sensing by radar, synthetic aperture radar, Camargue region, DRIVE radar, France, ONERA BUSARD platform, SAR instrument, backscattering coefficient profiles, dry surfaces, future space Ka band altimetry missions, nadir backscattering, pond surfaces, sea surfaces, sensor configuration, specular behavior, steep radar incident angle, synthetic aperture radar, water surface radar signal retrodiffusion, wet surfaces, Ka band, altimetry mode, radar sensor)",10.1109/IGARSS.2009.5418122,Radar Signal Retrodiffusion by Water Surface
53e99960b7602d97021a543e,"List(Java, Web services, biomedical telemetry, health care, mobile computing, sensor fusion, telemedicine, wireless sensor networks, Custos, ICT technique, Internet, Squawk Java J2ME, Sun SPOT, UPnP digital-home platform, Web service, Wei-Gong memorial hospital, clinical trial, geriatric psychiatry ward, mobile phone, multisensor data fusion, remote on-demand healthcare, small programmable object technology, smart phone, wireless sensor network, Smart Environments, Telemedicine, Telemonitoring, Wireless Sensor Networks)",10.1109/ICSMC.2008.4811630,Custos Remote on-demand healthcare aided with wireless sensors and mobile phones.
53e99967b7602d97021a566b,"List(information technology, simulation, transportation, objective function, dynamic programming, state space, sensor network, elevators, uncertainty, testing, normal operator, look ahead, lagrangian relaxation, local search)",10.1109/ROBOT.2005.1570314,Group Elevator Scheduling with Advanced Traffic Information for Normal Operations and Coordinated Emergency Evacuation
53e99960b7602d97021a5451,"List(weighted census transform, visual feature representation method, image representation, binary code bit, face recognition, ar face datasets, modified census transform, wct, statistical analysis, feature representation model, visual image classification, feature representation, training dataset, compressed sensing, image classification, statistical approach, mct, entropy information, transforms, entropy, compressive sensing technique, pattern classification)",10.1109/URAI.2013.6677409,Weighted census transform for feature representation
53e99967b7602d97021a5814,"List(Wireless Link, Mobile Terminal, Control Protocol, Control Architecture, Control Layer)",10.1007/BF01193262,A signaling and control architecture for mobility support in wireless ATM networks


In [0]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def is_city_location_person(keyword):
    # Process the keyword using the spaCy NER model
    doc = nlp(keyword)
    
    # Check if any of the entities recognized by the model are of type GPE (city or location) or PERSON
    for ent in doc.ents:
        if ent.label_ in ['GPE', 'PERSON']:
            return True
    
    return False

def update_df(df, doi_list, data_list, data_str, is_int=False):
    data_dict = dict(zip(doi_list, data_list))
    update_data = udf(lambda x: data_dict[x], IntegerType() if is_int else StringType())
    return df.withColumn(data_str, update_data(F.col('doi')))

def check_keywords(df):
    doi_list = df.select(F.col("doi")).rdd.flatMap(lambda x: x).collect()
    keywords_list = df.select(F.col("keywords")).rdd.flatMap(lambda x: x).collect()
    new_keywords_list = []
    for i, keys in enumerate(keywords_list):
        temp = []
        for k in keys:
            if not is_city_location_person(k):
                temp.append(k)
        new_keywords_list.append(temp)
        
    df = update_df(df, doi_list, keywords_list, 'keywords')
    
    return df

In [0]:
keywords_raw = check_keywords(keywords_raw).filter(F.size(keywords_raw.keywords) > 0)


keywords = (keywords_raw.select("keywords")
            .dropDuplicates()
            .withColumn("keyw_id", F.monotonically_increasing_id())
          )

keywords = (keywords_raw.join(keywords, ["keywords"])
           .select("keyw_id", "keywords", "title")
           .dropDuplicates()
        )

display(keywords)

keywords_clean = keywords.select("keyw_id", "keywords")
keywords_clean.write.format("delta").mode("overwrite").saveAsTable("keywords")

keyw_id,keywords,title
0,"[speaker model, multiple session, large data, large data set, incremental robust adaptation method, incremental adaptation method, inaccurate speaker model, robust speaker recognition, large amount, utterance variation, speaker recognition system, incremental verstion, speaker recognition, gaussian mixture model]",Robust speaker recognition against utterance variations
1,"[asymptotic behavior, constant coefficient, rational difference equation, phase portrait, rational type difference equation, attractor, phase space]",On the asymptotic behavior of a system of two rational difference equations
2,"[Insurance, Greece, Productivity change, Bootstrapped DEA]",Efficiency in the Greek insurance industry
3,"[graph isomorphism, 3-connected graph, linear time algorithm, map isomorphism, planar graph, whitney equivalence, graph g, fixed surface, linear time, polyhedral embeddings, embedded graph, time complexity, connected graph]",Graph and map isomorphism and all polyhedral embeddings in linear time
4,"[keystroke-level model, keystroke-level model prediction, observed data, investigative step, handheld device, total duration, aggregated event, cognitive modeling research, klm mental preparation duration, human-computer interaction, model prediction, cognitive modeling, keystroke level model, cognitive model, human computer interaction]",Comparisons of keystroke-level model predictions to observed data
5,"[automated office environment, alpha beta search]",Ergonmics in the automated office environment.
6,"[Bayes methods, IEEE standards, Monte Carlo methods, maximum likelihood estimation, optimisation, protocols, state estimation, wireless LAN, Bayesian estimators, DCF, IEEE 802.11 protocol, MAP estimation, approximate maximum a posteriori estimation, backoff parameters, competing terminals, distributed coordination function, network state, ns-2 simulator, optimization mechanisms, predictive distribution, saturation mode, sequential Monte Carlo estimation]",OPTIMIZING IEEE 802.11 DCF USING BAYESIAN ESTIMATORS OF THE NETWORK STATE
7,"[error detection strategy, cognitive impairment, important contribution, users age, error detection, internet user, critical factor, modern society, older adults, present study, improving web accessibility, older adult, service provider, usability, internet, servers, color, web accessibility, quality of life, web pages, data mining]",Using an Error Detection Strategy for Improving Web Accessibility for Older Adults
8,"[word group, general constraint solver, local word, integer programming problem, important novel feature, lexicalised grammar formalism, finite state machine specification, fast grouper, indian language, karaka role, machine translation, finite state machine]",A karaka based approach to parsing of Indian languages
9,"[measurement error, spectrum, dead reckoning]",Cooperative Localization in GPS-Limited Urban Environments


## Venue

In [0]:
# https://pypi.org/project/habanero/
from habanero import Crossref
cr = Crossref()

def getVenue(doi, venue):
    result = [venue['name_d'], None]
    try:
        query = cr.works(ids = doi)['message']['event']
        result[0] = query['name']
        result[1] = query['location']
    except:
        pass
    return result

getVenueUDF = udf(getVenue, ArrayType(StringType()))

In [0]:
venues_df = (filtered_df
            .limit(1000)
            .select('title', "doi", 'venue')
            .withColumn("VENUE", getVenueUDF(F.col("doi"), F.col('venue')))
            .select('title',
                    F.col("VENUE")[0].alias("venue"),
                    F.col("VENUE")[1].alias("location")
                   )
             .dropna(subset="venue")
           )

venues = (venues_df.select("venue", "location")
            .dropDuplicates()
            .withColumn("venue_id", F.monotonically_increasing_id())
          )

venues = (venues_df.join(venues, ["venue", "location"])
           .select("venue_id", "venue", "location", "title")
           .dropDuplicates()
        )

display(venues)
venues_clean = venues.select("venue_id", "venue")
venues_clean.write.format("delta").mode("overwrite").saveAsTable("venues")

venue_id,venue,location,title
67,2012 IEEE 12th International Conference on Data Mining Workshops,"Brussels, Belgium",Comparison of the Efficiency of MapReduce and Bulk Synchronous Parallel Approaches to Large Network Processing
88,2009 IEEE 69th Vehicular Technology Conference,Barcelona,Energy Consumption Optimization for Data Collection with Precision Constraints in Wireless Sensor Networks
489,ACM SE06: ACM Southeast Regional Conference,Melbourne Florida,Density-based multipath secure communication over mobile ad hoc networks
145,the ACM SIGCSE-SIGCUE technical symposium,Not Known,Learning mathematics with recursive computer programs
279,Second IEEE and ACM International Symposium on Mixed and Augmented Reality,"Tokyo, Japan",Real-Time Localisation and Mapping with Wearable Active Vision
229,"First International Conference on Innovative Computing, Information and Control - Volume I (ICICIC'06)","Beijing, China",Numerical Analysis of Blood Flow in Vessels
404,1995 IEEE International Conference on Robotics and Automation,"Nagoya, Japan",Robot manipulator contact force control application of fuzzy-neural network
185,2009 IEEE International Workshop on Genomic Signal Processing and Statistics (GENSIPS),"Minneapolis, MN, USA",On the Mutual Information Between Interaction Perplexity and Function of Proteins
290,2008 19th International Conference on Pattern Recognition (ICPR),"Tampa, FL, USA",3D localization of partially buried object in unstructured environment
484,2009 IEEE International Geoscience and Remote Sensing Symposium,"Cape Town, South Africa",Radar Signal Retrodiffusion by Water Surface


## FieldOfStudy

In [0]:
def getFos(doi, fos):
    result = [] if not fos else fos
    query = []
    try:
        query = cr.works(ids = doi)['message']['subject']
    except:
        pass
    return [*result, *query]

getFosUDF = udf(getFos, ArrayType(StringType()))

In [0]:
fos_df = (filtered_df
          .limit(1000)
          .select('title', "doi", 'fos')
          .withColumn("FOS", getFosUDF(F.col("doi"), F.col("fos")))
          .select('title', F.col('FOS').alias('fos'))
          .filter(F.size('fos') > 0)
           )

fos = (fos_df.select("fos")
            .dropDuplicates()
            .withColumn("fos_id", F.monotonically_increasing_id())
          )

fos = (fos_df.join(fos, ["fos"])
           .select("fos_id", "fos", "title")
           .dropDuplicates()
        )

display(fos)
fos_clean = fos.select("fos_id", "fos")
fos_clean.write.format("delta").mode("overwrite").saveAsTable("fos")

fos_id,fos,title
0,"List(Penetration (firestop), Uniqueness, Water sorption, Starch gelatinization, Swelling, Mechanics, Nonlinear integral equation, Materials science, Technical literature, Applied Mathematics, Computer Science Applications, General Engineering, Statistics and Probability, Applied Mathematics, Computer Science Applications, General Engineering, Statistics and Probability)",A mathematical model for spaghetti cooking with free boundaries.
1,"List(Video server, Broadcasting, Fast Ethernet, Lifting equipment, Computer science, Scheduling (computing), Computer network, Workstation, Customer base, Scalability)",Tabbycat: an inexpensive scalable server for video-on-demand.
2,"List(Computer software, Software engineering, Computer science, Empirical process (process control model), Software maintenance, Replicate)",Replication's Role in Software Engineering
3,"List(Swap (computer programming), Return time, Program transformation, Functional programming, Inversion (meteorology), Computer science, Theoretical computer science, Swap (finance), Computation, Software)",Manipulating accumulative functions by swapping call-time and return-time computations*
4,"List(Programming language, Computer science, Extreme programming)",Extreme Programming (XP) as a Development Process Framework
5,"List(Graph theory, Discrete mathematics, Two-way deterministic finite automaton, Deterministic automaton, Finite-state machine, Finite graph, Mathematics, Monad (functional programming))",Recognizability Equals Definability for Partial k-Paths
6,"List(Metadata, Computer aided instruction, Computer science, Rights Expression Language, Distance education, Learning object, Human–computer interaction, Artificial intelligence, Granularity, Intellectual property, Industrial property)",Research on DRM-enabled learning objects model
7,"List(Financial economics, Insurance industry, Economics, Simar, Actuarial science, Open market operation, Ranking, Deregulation, Productivity change, Data envelopment analysis, Market share, Information Systems and Management, Management Science and Operations Research, Modeling and Simulation, General Computer Science, Industrial and Manufacturing Engineering)",Efficiency in the Greek insurance industry
8,"List(Wireless network, Key distribution in wireless sensor networks, Computer science, Computer network, Network simulation, Wireless WAN, Wi-Fi array, Heterogeneous network, Municipal wireless network, Wireless sensor network)",Evaluation of Cognitive Wireless Networks in Rural Area for Disaster Information Network
9,"List(Health care, Telemedicine, Population, Gerontology, Competence (human resources), Elderly population, Norm (social), Medicine, Ageing society, Health clinic)",Applicability of Portable Health Clinic to Ageing Society.


## Date

In [0]:
def getDate(doi, year):
    formated = [None, None, None]
    try:
        query = cr.works(ids = doi)['message']['issued']['date-parts'][0]
        for i in [0, 1, 2]:
            formated[i] = query[i]
    except:
        pass
    if formated[0] == None:
        formated[0] = year 
    return formated

getDateUDF = udf(getDate, ArrayType(StringType()))

In [0]:
dates_df = (filtered_df
            .limit(1000)
            .select('title', "year", "doi")
            .withColumn("Date", getDateUDF(F.col("doi"), F.col("year")))
            .select('title',
                    F.col("Date")[2].alias("day"),
                    F.col("Date")[1].alias("month"),
                    F.col("Date")[0].alias("year")
                   )
           )

dates = (dates_df.select([F.col("day").alias("day1"), F.col("month").alias("month1"), F.col("year").alias("year1")])
            .dropDuplicates()
            .withColumn("date_id", F.monotonically_increasing_id())
          )

dates = (dates_df.join(dates, ((dates_df.day.eqNullSafe(dates.day1)) & 
                               (dates_df.month.eqNullSafe(dates.month1)) &
                               (dates_df.year.eqNullSafe(dates.year1))))
           .select("date_id", "day", "month", "year", "title")
           .dropDuplicates()
        )

display(dates)

dates_clean = dates.select("date_id", "day", "month", "year")
dates_clean.write.format("delta").mode("overwrite").saveAsTable("dates")

date_id,day,month,year,title
0,10.0,11.0,2007,Consistent Sets of Secondary Structures in Proteins
1,,12.0,2002,A new feature extraction technique for on-line recognition of handwritten alphanumeric characters
2,1.0,6.0,2004,King John Divided
3,1.0,10.0,2010,Peer Assist Live Streaming Overlay for Next-Generation-Networks
4,,,2004,Paramecium: Assembling Raw Nodes into Composite Cells
5,,,2009,Radar Signal Retrodiffusion by Water Surface
6,,10.0,2008,Custos Remote on-demand healthcare aided with wireless sensors and mobile phones.
7,,,2005,Group Elevator Scheduling with Advanced Traffic Information for Normal Operations and Coordinated Emergency Evacuation
8,,10.0,2013,Weighted census transform for feature representation
9,,10.0,1996,A signaling and control architecture for mobility support in wireless ATM networks


## Language

In [0]:
display(raw_df.groupBy("lang").count())
display(filtered_df.groupBy("lang").count())

lang,count
en,249984
zh,16


lang,count
en,244930


In [0]:
lang_df = filtered_df.select("title", "lang")

lang = (lang_df.select("lang")
            .dropDuplicates()
            .withColumn("lang_id", F.monotonically_increasing_id())
          )

lang = (lang_df.join(lang, ["lang"])
           .select("lang_id", "lang", "title")
           .dropDuplicates()
        )

display(lang)

lang_clean = lang.select("lang_id", "lang")
lang_clean.write.format("delta").mode("overwrite").saveAsTable("lang")

lang_id,lang,title
0,en,An adaptive technique for content-based image retrieval
0,en,Controlling Self-Organising Software Applications with Archetypes
0,en,Non-Automatizability of Bounded-Depth Frege Proofs
0,en,"Buffer Overflows und Format-String-Schwachstellen - Funktionsweisen, Exploits und Gegenmaßnahmen"
0,en,Empowering iso-surfaces with volume data
0,en,Partitioning and Retiming of Multi-Dimensional Systems
0,en,"A Comparison Between Contour Elevation Data Sources for DEM Creation and Soil Carbon Prediction, Coshocton, Ohio"
0,en,A pairing-based publicly verifiable secret sharing scheme.
0,en,A relational model for unstructured documents
0,en,A survey on applications of the harmony search algorithm.


## MAIN TABLE

In [0]:
main_table = (publications.join(authors, ["title"])
              .join(organization, ["author"], "left")
              .join(types, ["title"])
              .join(keywords, ["title"])
              .join(venues, ["title"])
              .join(fos, ["title"])
              .join(dates, ["title"])
              .join(lang, ["title"])
              .select("publ_id", "author_id", "org_id", "type_id", "keyw_id", "venue_id", "fos_id", "date_id", "lang_id", F.col("rank").alias("author_rank"))
              )
display(main_table)
main_table.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("DBLP_fact_table")

publ_id,author_id,org_id,type_id,keyw_id,venue_id,fos_id,date_id,lang_id,author_rank
440,36,,2,404,307,1,23,0,3
440,377,,2,404,307,1,23,0,2
440,2559,,2,404,307,1,23,0,1
526,32,,2,603,325,8,237,0,2
526,1222,,2,603,325,8,237,0,3
526,2570,,2,603,325,8,237,0,1
292,103,249.0,2,92,284,14,129,0,2
292,1163,249.0,2,92,284,14,129,0,1
190,421,,2,451,247,37,48,0,1
190,1614,,2,451,247,37,48,0,2


### Queries

In [0]:
query = f"""
SELECT 
    publ_id, author_id FROM publications
LIMIT 
    10;

"""

# Execute query
result_df = spark.sql("SELECT publ_id, author_id FROM DBLP_fact_table LIMIT 10")
display(result_df)

publ_id,author_id
440,36
440,377
440,2559
526,32
526,1222
526,2570
292,103
292,1163
190,421
190,1614


In [0]:
#Retrieve the top 10 authors with the highest number of publications:
query = f"""
SELECT 
    Authors.first_name, Authors.last_name, COUNT(*) AS num_publications
FROM 
    DBLP_fact_table as MainTable
    INNER JOIN Authors ON MainTable.author_id = Authors.author_id 
GROUP BY 
    Authors.first_name, Authors.last_name 
ORDER BY 
    num_publications DESC 
LIMIT 
    10;

"""

# Execute query
result_df = spark.sql(query)
display(result_df)

first_name,last_name,num_publications
Guang-zhong,Yang,4
Wei,Liu,4
Fanhua,Shang,4
Gang,Shen,2
Wei-ying,Ma,2
Bhiksha,Raj,2
Giancarlo,Succi,2
Francisco,Romero,2
Jian,Liu,2
Haikal,Abed,2


In [0]:
# Query to retrieve the authors who have published venue:
query = f"""
SELECT a.first_name, a.last_name, v.venue
FROM Authors a 
JOIN DBLP_fact_table as MainTable 
ON MainTable.author_id = a.author_id
JOIN Publications p 
ON p.publ_id = MainTable.publ_id
JOIN venues v 
ON v.venue_id = MainTable.venue_id

"""

# Execute query
result_df = spark.sql(query)
display(result_df)

first_name,last_name,venue
Andreas,Herzig,2013 Twenty-Eighth Annual IEEE/ACM Symposium on Logic in Computer Science (LICS 2013)
Wanda,Pratt,CSCW '12: Computer Supported Cooperative Work
Ali,Makhdoumi,"2013 51st Annual Allerton Conference on Communication, Control, and Computing (Allerton)"
Amara,Amara,"4th IEEE International Symposium on Electronic Design, Test and Applications (delta 2008)"
Xu,Wu,ICC 2013 - 2013 IEEE International Conference on Communications
Kazuo,Takahata,2011 International Conference on Computational Science and Its Applications (ICCSA)
Darrell,Long,IEEE International Conference on Communications
Barath,Raghavan,HotNets-XI: 11th ACM Workshop on Hot Topics in Networks
Rosanne,Price,2004 Australian Software Engineering Conference. Proceedings.
Fanhua,Shang,KDD '12: The 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining


In [0]:
#Retrieve the publications with the highest number of citations:
query = f"""
SELECT 
    Publications.Title,
    CAST(Publications.n_citation AS INT) AS n_citation
FROM 
    Publications
ORDER BY 
    n_citation DESC;

"""

# Execute query
result_df = spark.sql(query)
display(result_df)

Title,n_citation
NUPACK: Analysis and design of nucleic acid systems,1069
A Regular Layout for Parallel Adders,695
Stabilization of switched continuous-time systems with all modes unstable via dwell time switching,338
Analysis of multicomponent LFM signals by a combined Wigner-Hough transform,322
The Diffusion Limit of Transport Equations Derived from Velocity-Jump Processes,258
"Entropy, similarity measure of interval-valued intuitionistic fuzzy sets and their applications",244
Learning By Teaching: A New Agent Paradigm For Educational Software,235
Prediction error method for second-order blind identification,215
The extended Kalman filter as an exponential observer for nonlinear systems,201
A class of data structures for assoclat~ve searclung,194


In [0]:
# Retrieve the authors with publications
query = f"""
SELECT 
    Authors.author_id, DBLP_fact_table.publ_id, Authors.first_name, Authors.last_name, Publications.title, Publications.n_citation, fos.fos
FROM Authors 
INNER JOIN DBLP_fact_table ON Authors.author_id = DBLP_fact_table.author_id
INNER JOIN Publications ON DBLP_fact_table.publ_id = Publications.publ_id
INNER JOIN fos ON DBLP_fact_table.fos_id = fos.fos_id
ORDER BY Authors.first_name, Authors.last_name;

"""

# Execute query
result_df = spark.sql(query)
display(result_df)

author_id,publ_id,first_name,last_name,title,n_citation,fos
1303,14,A,Offutt,An Analysis Tool for Coupling-Based Integration Testing,4,"List(System integration testing, Programming language, Integration testing, Computer science, System testing, Manual testing, Non-regression testing, Java, Test strategy, Keyword-driven testing)"
2430,194,A.,Arslan,Analysis of active and passive microwave observations from the NoSREx campaign.,5,"List(Water equivalent, Radar, Meteorology, Microwave, Ku band, Synthetic aperture radar, Backscatter, Remote sensing, Environmental science, Snowpack, Snow)"
1308,556,A.,Bouchi,An Object Observation for a Java Adaptative Distributed Application Platform,12,"List(Load management, Distributed object, Object-oriented programming, Computer science, Load balancing (computing), Java concurrency, Real-time computing, Resource allocation, Application software, Java, Distributed computing)"
1210,3,A.,Cameron,A bayesian decision theoretic approach for adaptive goal-directed sensing,11,"List(Robot vision, Intelligent sensor, Computer science, Robustness (computer science), Artificial intelligence, Decision theory, Robot vision systems, Bayes estimator, Machine learning, Bayesian probability)"
2119,326,A.,Giulieri,Analysis of on-line self-testing policies for real-time embedded multiprocessors in DSM technologies,8,"List(Resource management, Digital electronics, Scheduling (computing), Computer science, Fault detection and isolation, Resource allocation, Integrated circuit, MPSoC, Embedded system, Built-in self-test)"
1034,534,Abhishek,Sharma,Data insufficiency in sketch versus photo face recognition.,8,"List(Training set, Facial recognition system, Image matching, Computer science, Element (criminal law), Speech recognition, Feature extraction, Suspect, Law enforcement, Sketch)"
2854,338,Aca,Gacic,Detection of anomalous events from unlabeled sensor data in smart building environments.,7,"List(Anomaly detection, Data modeling, Data mining, Minimum message length, False alarm, Dimensionality reduction, Pattern recognition, Computer science, Feature extraction, Unsupervised learning, Artificial intelligence, Mixture model)"
350,372,Ada,Diaconescu,Controlling Self-Organising Software Applications with Archetypes,11,"List(Survivability, Computer science, Software agent, Software prototyping, Software system, Software, Software architecture, Software quality, Operating system, Software development, Distributed computing)"
2367,505,Aeron,Buchanan,Combining local and global motion models for feature point tracking,12,"List(Computer vision, Pattern recognition, Computer science, Segmentation, Image noise, Feature extraction, Robustness (computer science), Image segmentation, Statistical model, Artificial intelligence, Application software, Cognitive neuroscience of visual object recognition)"
1141,109,Aharon,Bar-hillel,Workstation capacity tuning using reinforcement learning,6,"List(Multithreading, Permission, Markov process, Grid computing, Computer science, Server, Workstation, Throughput, Reinforcement learning, Distributed computing)"


In [0]:
# Retrieve the the number of publication and the total number of citations for each author:
query = f"""
SELECT 
    Authors.first_name, 
    Authors.last_name, 
    COUNT(DISTINCT DBLP_fact_table.publ_id) AS num_publications, 
    SUM(Publications.n_citation) AS total_citations
FROM Authors 
INNER JOIN DBLP_fact_table ON Authors.author_id = DBLP_fact_table.author_id
INNER JOIN Publications ON DBLP_fact_table.publ_id = Publications.publ_id
GROUP BY Authors.first_name, Authors.last_name
ORDER BY num_publications DESC;

"""

# Execute query
result_df = spark.sql(query)
display(result_df)

first_name,last_name,num_publications,total_citations
Guang-zhong,Yang,2,38.0
Chrisila,Pettey,1,6.0
Hao,Guo,1,5.0
John,Gore,1,4.0
Christoph,Becker,1,20.0
Jonghyun,Choi,1,8.0
Zhenguo,Li,1,44.0
Shigenori,Inagaki,1,3.0
Patrick,Mcdaniel,1,5.0
Amara,Amara,1,4.0


In [0]:
#Retrieve the top 5 most common set of keywords:
query = f"""
SELECT 
    Keywords.keywords, COUNT(*) AS num_occurrences
FROM 
    DBLP_fact_table as MainTable 
    INNER JOIN Keywords ON MainTable.keyw_id = Keywords.keyw_id 
GROUP BY 
    Keywords.keywords 
ORDER BY 
    num_occurrences DESC 
LIMIT 
    5;

"""

# Execute query
result_df = spark.sql(query)
display(result_df)

keywords,num_occurrences
"[health, broad agendum, tangible design outcome, design idea, cscw community, crowdsourcing, design challenge, brainstorming design, patient-generated information, new idea, small multidisciplinary group work, online community, future work, new collaboration opportunity, patient, biomedical research, idea generation, bioinformatics]",13
"[geophysical signal processing, hydrological techniques, inverse problems, microwave measurement, radar signal processing, remote sensing by radar, snow, spaceborne radar, synthetic aperture radar, AD 2009 11, Ku band SAR observations, NOSREX campaign, Nordic Snow Radar Experiment, SWE, X band SAR observations, active microwave observations, coarse scale passive microwave data inversion, high resolution two frequency SAR observations, passive microwave observations, snow water equivalent, snowpack backscattering properties, snowpack emission properties, spaceborne SAR imagery, Snow Water Equivalent, microwave emission, radar backscatter]",13
"[circuit layout CAD, integrated circuit layout, logic CAD, network routing, network-on-chip, parallel architectures, 2D-mesh network-on-chip, IP cores, NoC architecture, component placement, deadlock-free routing algorithm, floorplanning method, high density layouts, highly adaptive routing algorithm, parallel computer architecture, routing theories]",8
"[non-deterministic execution order, csp specification, industrial system, control flow graph, state explosion, recent proposal, data structure, csp]",8
"[force control, mobile robots, biologically-inspired robot, climbing ability, climbing insects, distributed inward gripping, geckos, mobile robot, screenbot, spring forces, walking inverted robot]",7


In [0]:
#Retrieve the top 5 most popular sets of fields of study:
query = f"""
SELECT 
    fos.fos, COUNT(*) AS num_publications
FROM 
    DBLP_fact_table as MainTable   
    INNER JOIN fos ON MainTable.fos_id = fos.fos_id 
GROUP BY 
    fos.fos 
ORDER BY 
    num_publications DESC 
LIMIT 
    5;
"""
# Execute query
result_df = spark.sql(query)
display(result_df)

fos,num_publications
"List(Brainstorming, World Wide Web, Online community, Computer-supported cooperative work, Multidisciplinary approach, Crowdsourcing, Sociology, Group work, Knowledge management, Ethnography)",13
"List(Water equivalent, Radar, Meteorology, Microwave, Ku band, Synthetic aperture radar, Backscatter, Remote sensing, Environmental science, Snowpack, Snow)",13
"List(Link-state routing protocol, Equal-cost multi-path routing, Multipath routing, Dynamic Source Routing, Enhanced Interior Gateway Routing Protocol, Computer science, Policy-based routing, Static routing, Routing table, Distributed computing)",8
"List(Data structure, Control flow graph, Industrial systems, Computer science, Algorithm, Order by, Distributed computing)",8
"List(Mobile computing, Web usability, Mobile search, Computer science, Usability engineering, Usability, Usability lab, Human–computer interaction, Cognitive walkthrough, Usability inspection, Multimedia)",7


In [0]:
# retrieve the total number of publications per year:
query = f"""
SELECT Dates.year, count(*) AS num_publications
FROM DBLP_fact_table
JOIN Publications ON DBLP_fact_table.publ_id = Publications.publ_id
JOIN Dates ON DBLP_fact_table.date_id = Dates.date_id
GROUP BY Dates.year
ORDER BY Dates.year ASC NULLS FIRST;
;
"""

# Execute query
result_df = spark.sql(query)
display(result_df)

year,num_publications
1984,8
1990,36
1992,16
1993,50
1995,20
1998,135
1999,37
2000,192
2001,256
2002,205


In [0]:
# the number of publications per type of publication:
query = f"""
SELECT type_id,types.type, COUNT(*) as num_publications
FROM types
GROUP BY type_id,types.type;
"""

# Execute query
result_df = spark.sql(query)
display(result_df)

type_id,type,num_publications
0,journal-article,556
3,conference paper,158
5,report,156
4,workshop,156
1,book-chapter,286
2,proceedings-article,455
6,book,156
