In [22]:
# A SparkSession is our entry point to Spark: every table below is read and queried through it
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, TimestampType
from pyspark.sql.functions import explode, col

# Build (or fetch the already-running) SparkSession used by every cell below
spark = (
    SparkSession.builder
    .master("local")
    .appName("Bibliography")
    .getOrCreate()
)

# JSON file that every table in this notebook is read from
INPUT_FILE = "bibliography.json"

# Reader options forwarded to each spark.read call via .options(**OPTIONS)
# NOTE(review): 'ZZZ' in timestampFormat is a zone-offset pattern; if the optional
# part was meant to match fractional seconds it would be 'SSS' - confirm against the data.
OPTIONS = {
    'multiline': 'true',
    'allowNumericLeadingZero': 'true',
    'timestampFormat': "yyyy-MM-dd'T'HH:mm:ss[.ZZZ'Z']",
}

In [23]:
# AUTHOR TABLE
# Only the "authors" array of each record is read; each element carries the
# author's id, name, email and bio.
schemaAut = StructType([
    StructField('authors', ArrayType(StructType([
        StructField('_id', StringType(), True),
        StructField('name', StringType(), True),
        StructField('email', StringType(), True),
        StructField('bio', StringType(), True),
    ])), True),
])

# One output row per element of the authors array, flattened to top-level columns.
# NOTE(review): an author listed in several records yields one row per record here -
# confirm whether duplicates should be dropped before this is used as an author table.
dfAut = (
    spark.read.format('json')
    .options(**OPTIONS)
    .schema(schemaAut)
    .json(INPUT_FILE)
    .select(explode(col("authors")).alias("authors"))
    .select("authors._id", "authors.name", "authors.email", "authors.bio")
    .withColumnRenamed("_id", "authorID")
)
dfAut.printSchema()
dfAut.show()

root
 |-- authorID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- bio: string (nullable = true)

+--------------------+-------------------+--------------------+--------------------+
|            authorID|               name|               email|                 bio|
+--------------------+-------------------+--------------------+--------------------+
|548a2e3ddabfae9b4...|     Harry M. Sneed|harry.m..sneedbc@...|My name is Harry ...|
|53f43b64dabfaefed...|  Ilias Michalarias|ilias.michalarias...|My name is Ilias ...|
|53f43354dabfaedd7...| Arkadiy Omelchenko|arkadiy.omelchenk...|My name is Arkadi...|
|53f443b6dabfaeecd...|  Hans-Joachim Lenz|hans-joachim.lenz...|My name is Hans-J...|
|53f43640dabfaedf4...|      Pradip Thomas|pradip.thomasfc@g...|My name is Pradip...|
|53f42d5cdabfaee2a...|     Patrik Eveborn|patrik.eveborn56@...|My name is Patrik...|
|53f433bedabfaee4d...|    Patrik Flisberg|patrik.flisberg17...|My name is Patrik.

In [24]:
# PAPER TABLE
# Paper-level fields only; fields not listed in this schema are not loaded into dfPaper.
schemaPaper = StructType([
    StructField('_id', StringType(), True),
    StructField('title', StringType(), True),
    StructField('keywords', ArrayType(StringType()), True),
    StructField('fos', ArrayType(StringType()), True),
    StructField('references', ArrayType(StringType()), True),
    StructField('page_start', IntegerType(), True),
    StructField('page_end', IntegerType(), True),
    StructField('lang', StringType(), True),
    StructField('doi', StringType(), True),
    StructField('url', ArrayType(StringType()), True),
    StructField('abstract', StringType(), True),
    StructField('publication_type', StringType(), True),
    # Read as a timestamp; OPTIONS carries the 'timestampFormat' given to the reader
    StructField('date', TimestampType(), True),
    # Missing: 'publication_id'
])

dfPaper = (
    spark.read.format('json')
    .options(**OPTIONS)
    .schema(schemaPaper)
    .json(INPUT_FILE)
)
dfPaper.printSchema()
dfPaper.show()

root
 |-- _id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)
 |-- date: timestamp (nullable = true)



[Stage 19:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|                 _id|               title|            keywords|                 fos|          references|page_start|page_end|lang|                 doi|                 url|            abstract|publication_type|               date|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|53e99784b7602d970...|Using XML to Inte...|[internet, hyperm...|[xml base, world ...|[53e9adbdb7602d97...|       167|     172|  en|10.1109/CMPSAC.20...|[http://dx.doi.or...|The eXtensible Ma...|            Book|1974-09-13 06:34:29|
|53e99784b7602d970...|               FCLOS|[molap, subsumpti...|[informa

                                                                                

In [25]:
# AFFILIATION TABLE
# Pairs each paper id with every entry of that paper's "authors" array,
# keeping the author id and the organization stored on that entry.
schemaAffiliation = StructType([
    StructField('_id', StringType(), True),
    StructField('authors', ArrayType(StructType([
        StructField('_id', StringType(), True),
        StructField('org', StringType(), True),
    ])), True),
])

dfAff = (
    spark.read.format('json')
    .options(**OPTIONS)
    .schema(schemaAffiliation)
    .json(INPUT_FILE)
    # The paper id column is spelled "paperId" everywhere in this cell, so the
    # selects below resolve even when spark.sql.caseSensitive is enabled.
    .select(
        col("_id").alias("paperId"),
        explode(col("authors")).alias("authors"),
    )
    # Flatten the exploded struct and give its fields their final column names
    .select(
        "paperId",
        col("authors._id").alias("authorID"),
        col("authors.org").alias("organization"),
    )
)
dfAff.printSchema()
dfAff.show()

root
 |-- paperId: string (nullable = true)
 |-- authorID: string (nullable = true)
 |-- organization: string (nullable = true)

+--------------------+--------------------+--------------------+
|             paperId|            authorID|        organization|
+--------------------+--------------------+--------------------+
|53e99784b7602d970...|548a2e3ddabfae9b4...|                null|
|53e99784b7602d970...|53f43b64dabfaefed...|Corresponding aut...|
|53e99784b7602d970...|53f43354dabfaedd7...|Freie Universität...|
|53e99784b7602d970...|53f443b6dabfaeecd...|Freie Universität...|
|53e99785b7602d970...|53f43640dabfaedf4...|Tel.: +61 7 336 5...|
|53e9978ab7602d970...|53f42d5cdabfaee2a...|Optimal Solutions...|
|53e9978ab7602d970...|53f433bedabfaee4d...|Division of Optim...|
|53e9978ab7602d970...|53f4538adabfaec22...|Division of Optim...|
|53e9978db7602d970...|5448a3b5dabfae87b...|Corresponding aut...|
|53e9978db7602d970...|54343235dabfaebba...|CILab – Computati...|
|53e9978db7602d970...|5405

                                                                                

In [26]:
# BOOK TABLE
# publication_type is read only so the rows can be filtered; it is dropped by the final select.
schemaBook = StructType([
    StructField('isbn', StringType(), True),
    StructField('publisher', StringType(), True),
    StructField('venue', StringType(), True),
    StructField('publication_type', StringType(), True),
    # Missing: 'publication_id'
])

# Keep records typed 'Book' whose isbn and venue pass the != 'null' comparison.
# A comparison against a NULL value evaluates to NULL, which filter() treats as
# not matching, so rows with a missing isbn or venue are dropped as well.
dfBook = (
    spark.read.format('json')
    .options(**OPTIONS)
    .schema(schemaBook)
    .json(INPUT_FILE)
    .filter(
        (col('publication_type') == 'Book')
        & (col('isbn') != 'null')
        & (col('venue') != 'null')
    )
    .select('isbn', 'publisher', 'venue')
)
dfBook.printSchema()
dfBook.show()

root
 |-- isbn: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- venue: string (nullable = true)

+-----------------+--------------------+--------------------+
|             isbn|           publisher|               venue|
+-----------------+--------------------+--------------------+
|    0-7695-1727-7|            Elsevier|             COMPSAC|
|    3-540-64990-5|   Springer New York|               EWCBR|
|    0-7695-1013-2|            Elsevier|               ICALT|
|    1-4244-0387-1|      Intellect Ltd.|              APCCAS|
|    0-7695-1822-2|Multidisciplinary...|                FOCS|
|    0-7695-2616-0|       Science Press|          ICICIC (3)|
|978-0-7695-4157-0|Tsinghua Universi...|      ICPP Workshops|
|    1-59593-049-3|Taylor and Franci...|                  EC|
|    0-7695-2586-5|      Intellect Ltd.|              VL/HCC|
|    0-7695-3056-7|The American Soci...|                ICAT|
|978-1-4244-9864-2|Taylor and Franci...|Winter Simulation...|
|978-1-4503-

In [27]:
# JOURNAL TABLE
# publication_type is read only so the rows can be filtered; it is dropped by the final select.
schemaJournal = StructType([
    StructField('issn', StringType(), True),
    StructField('publisher', StringType(), True),
    StructField('venue', StringType(), True),
    # Problem: volume and issue are imported as StringType(), not IntegerType(),
    # because we don't have coherent data
    StructField('volume', StringType(), True),
    StructField('issue', StringType(), True),
    StructField('publication_type', StringType(), True),
    # Missing: 'publication_id'
])

# Keep records typed 'Journal' whose issn, venue, issue and volume pass the
# != 'null' comparison. A comparison against a NULL value evaluates to NULL,
# which filter() treats as not matching, so rows missing any of them are dropped too.
dfJournal = (
    spark.read.format('json')
    .options(**OPTIONS)
    .schema(schemaJournal)
    .json(INPUT_FILE)
    .filter(
        (col('publication_type') == 'Journal')
        & (col('issn') != 'null')
        & (col('venue') != 'null')
        & (col('issue') != 'null')
        & (col('volume') != 'null')
    )
    .select('issn', 'publisher', 'venue', 'issue', 'volume')
)
dfJournal.printSchema()
dfJournal.show()

root
 |-- issn: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- volume: string (nullable = true)

+---------+--------------------+--------------------+-----+------+
|     issn|           publisher|               venue|issue|volume|
+---------+--------------------+--------------------+-----+------+
|0736-5853|            Elsevier|Telematics and In...|    1|    26|
|0377-2217|            Elsevier|European Journal ...|    3|   171|
|0020-0255| IOP Publishing Ltd.|Information Sciences|   16|   178|
|1574-1192|     Springer Verlag|Pervasive and Mob...|    4|     3|
|1364-8152|John Wiley and So...|Environmental Mod...|    6|    22|
|0377-2217|Taylor and Franci...|European Journal ...|    1|   170|
|0164-1212| Pleiades Publishing|Journal of System...|    1|    79|
|0377-2217|            Elsevier|European Journal ...|    2|   181|
|0196-6774|Editorial and Pub...|Journal of Algori...|    2|    47|
|

In [28]:
# CONFERENCE TABLE
# publication_type is read only so the rows can be filtered; it is dropped by the final select.
schemaConf = StructType([
    StructField('location', StringType(), True),
    StructField('venue', StringType(), True),
    StructField('publication_type', StringType(), True),
    # Missing: 'publication_id'
])

# Keep records typed 'Conference' whose venue passes the != 'null' comparison.
# A comparison against a NULL value evaluates to NULL, which filter() treats as
# not matching, so rows with a missing venue are dropped as well.
dfConf = (
    spark.read.format('json')
    .options(**OPTIONS)
    .schema(schemaConf)
    .json(INPUT_FILE)
    .filter(
        (col('publication_type') == 'Conference')
        & (col('venue') != 'null')
    )
    .select('location', 'venue')
)
dfConf.printSchema()
dfConf.show()

root
 |-- location: string (nullable = true)
 |-- venue: string (nullable = true)

+-------------------+-----------------+
|           location|            venue|
+-------------------+-----------------+
|  Sao Paulo, Brazil|             AIAI|
|        Boston, USA|         FSKD (5)|
|       Chicago, USA|          MobiSys|
|  Sao Paulo, Brazil|             ICIP|
|Copenhagen, Denmark|            HICSS|
| Jakarta, Indonesia|             ACPR|
|      Madrid, Spain|           ICASSP|
|       Chicago, USA|   ICDM Workshops|
| Frankfurt, Germany|              KDD|
| Seoul, South Korea|              CHI|
|   Los Angeles, USA|             ICMI|
|      New York, USA|          UIC/ATC|
|        Boston, USA|      Edutainment|
|    Toronto, Canada|            ICWET|
|    Toronto, Canada|             ICSM|
| San Francisco, USA|       CODES+ISSS|
| Seoul, South Korea|             NBiS|
|       Tokyo, Japan|            ICICS|
|      Mumbai, India|             MMSP|
|       Osaka, Japan|Discovery Scienc