In [77]:
# With sparkSession we create a connection to our database
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, collect_list
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, TimestampType

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
    .master("local") \
    .appName("Bibliography") \
    .getOrCreate()

INPUT_FILE = "bibliography.json"
OPTIONS = {'multiline': 'true', 'allowNumericLeadingZero': 'true', 'timestampFormat': "yyyy-MM-dd'T'HH:mm:ss[.ZZZ'Z']"}

In [119]:
#AUTHOR TABLE
schemaAut = StructType(
    [StructField('authors', ArrayType(StructType([
        StructField('_id', StringType(), True),
        StructField('name', StringType(), True),
        StructField('email', StringType(), True),
        StructField('bio', StringType(), True),
    ])), True)
     ])

dfAut = spark.read.format('json').options(**OPTIONS).schema(schemaAut).json(INPUT_FILE)
dfAut = dfAut.select(explode(dfAut.authors))
dfAut = dfAut.withColumnRenamed("col", "authors")
dfAut = dfAut.filter(col("authors._id") != 'null').select("authors._id", "authors.name", "authors.email", "authors.bio")
dfAut = dfAut.withColumnRenamed("_id", "authorID")
# Authors with same ID are merged and all the bio are collected in a list
dfAut = dfAut.groupBy('authorID', 'name', 'email').agg(collect_list('bio').alias('bio'))

dfAut.printSchema()
dfAut.show()

root
 |-- authorID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- bio: array (nullable = false)
 |    |-- element: string (containsNull = false)

+--------------------+------------------+--------------------+--------------------+
|            authorID|              name|               email|                 bio|
+--------------------+------------------+--------------------+--------------------+
|53f3186fdabfae9a8...|   A. M. A. Hariri|a..m..a..hariride...|[My name is A. M....|
|53f3186fdabfae9a8...|    Matthew Prowse|matthew.prowsefb@...|[My name is Matth...|
|53f31870dabfae9a8...|       Sui-ping Qi|sui-ping.qi19@gma...|[My name is Sui-p...|
|53f31871dabfae9a8...|     Renato Fabbri|renato.fabbrib7@g...|[My name is Renat...|
|53f31873dabfae9a8...|   Joachim Schimpf|joachim.schimpf8a...|[My name is Joach...|
|53f31874dabfae9a8...|    E. Di Bernardo|e..di.bernardo10@...|[My name is E. Di...|
|53f31875dabfae9a8...|    Steven F. Roth|

                                                                                

In [109]:
# PAPER TABLE
schemaPaper = StructType(
    [StructField('_id', StringType(), True),
     StructField('title', StringType(), True),
     StructField('keywords', ArrayType(StringType()), True),
     StructField('fos', ArrayType(StringType()), True),
     StructField('references', ArrayType(StringType()), True),
     StructField('page_start', IntegerType(), True),
     StructField('page_end', IntegerType(), True),
     StructField('lang', StringType(), True),
     StructField('doi', StringType(), True),
     StructField('url', ArrayType(StringType()), True),
     StructField('abstract', StringType(), True),
     StructField('publication_type', StringType(), True),
     StructField('date', TimestampType(), True)
     #Missing: 'publication_id'
     ])

dfPaper = spark.read.format('json').options(**OPTIONS).schema(schemaPaper).json(INPUT_FILE)
dfPaper = dfPaper.withColumnRenamed("_id", "paperID")
dfPaper.printSchema()
dfPaper.show()

root
 |-- paperID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)
 |-- date: timestamp (nullable = true)



[Stage 277:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|             paperID|               title|            keywords|                 fos|          references|page_start|page_end|lang|                 doi|                 url|            abstract|publication_type|               date|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|53e99784b7602d970...|Using XML to Inte...|[internet, hyperm...|[xml base, world ...|[53e9adbdb7602d97...|       167|     172|  en|10.1109/CMPSAC.20...|[http://dx.doi.or...|The eXtensible Ma...|            Book|1974-09-13 06:34:29|
|53e99784b7602d970...|               FCLOS|[molap, subsumpti...|[informa

                                                                                

In [114]:
# AFFILIATION TABLE
schemaAffiliation = StructType(
    [StructField('_id', StringType(), True),
     StructField('authors', ArrayType(StructType([
         StructField('_id', StringType(), True),
         StructField('org', StringType(), True)
     ])), True),
     ])

dfAff = spark.read.format('json').options(**OPTIONS).schema(schemaAffiliation).json(INPUT_FILE)
dfAff = dfAff.withColumnRenamed("_id", "paperID")
dfAff = dfAff.select("paperID", explode(dfAff.authors))
dfAff = dfAff.withColumnRenamed("col", "authors")
dfAff = dfAff.select("paperID", "authors._id", "authors.org")
dfAff = dfAff.withColumnRenamed("_id", "authorID")
dfAff = dfAff.withColumnRenamed("org", "organization")
dfAff.printSchema()
dfAff.show()

root
 |-- paperID: string (nullable = true)
 |-- authorID: string (nullable = true)
 |-- organization: string (nullable = true)

+--------------------+--------------------+--------------------+
|             paperID|            authorID|        organization|
+--------------------+--------------------+--------------------+
|53e99784b7602d970...|548a2e3ddabfae9b4...|                null|
|53e99784b7602d970...|53f43b64dabfaefed...|Corresponding aut...|
|53e99784b7602d970...|53f43354dabfaedd7...|Freie Universität...|
|53e99784b7602d970...|53f443b6dabfaeecd...|Freie Universität...|
|53e99785b7602d970...|53f43640dabfaedf4...|Tel.: +61 7 336 5...|
|53e9978ab7602d970...|53f42d5cdabfaee2a...|Optimal Solutions...|
|53e9978ab7602d970...|53f433bedabfaee4d...|Division of Optim...|
|53e9978ab7602d970...|53f4538adabfaec22...|Division of Optim...|
|53e9978db7602d970...|5448a3b5dabfae87b...|Corresponding aut...|
|53e9978db7602d970...|54343235dabfaebba...|CILab – Computati...|
|53e9978db7602d970...|5405

In [72]:
# BOOK TABLE
schemaBook = StructType(
    [StructField('isbn', StringType(), True),
     StructField('publisher', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('publication_type', StringType(), True)
     #Missing: 'publication_id'
     ])

dfBook = spark.read.format('json').options(**OPTIONS).schema(schemaBook).json(INPUT_FILE)
dfBook = dfBook.filter(col('publication_type') == 'Book')
dfBook = dfBook.filter(col('isbn') != 'null')
dfBook = dfBook.filter(col('venue') != 'null')
dfBook = dfBook.select('isbn', 'publisher', 'venue')
dfBook.printSchema()
dfBook.show()

root
 |-- isbn: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- venue: string (nullable = true)



                                                                                

+-----------------+--------------------+--------------------+
|             isbn|           publisher|               venue|
+-----------------+--------------------+--------------------+
|    0-7695-1727-7|            Elsevier|             COMPSAC|
|    3-540-64990-5|   Springer New York|               EWCBR|
|    0-7695-1013-2|            Elsevier|               ICALT|
|    1-4244-0387-1|      Intellect Ltd.|              APCCAS|
|    0-7695-1822-2|Multidisciplinary...|                FOCS|
|    0-7695-2616-0|       Science Press|          ICICIC (3)|
|978-0-7695-4157-0|Tsinghua Universi...|      ICPP Workshops|
|    1-59593-049-3|Taylor and Franci...|                  EC|
|    0-7695-2586-5|      Intellect Ltd.|              VL/HCC|
|    0-7695-3056-7|The American Soci...|                ICAT|
|978-1-4244-9864-2|Taylor and Franci...|Winter Simulation...|
|978-1-4503-0923-3|Association of Fo...|Symposium on Comp...|
|    0-7695-2125-8|Asia Digital Art ...|                ECBS|
| 0-7695

In [47]:
# JOURNAL TABLE
schemaJournal = StructType(
    [StructField('issn', StringType(), True),
     StructField('publisher', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('volume', IntegerType(), True),
     StructField('issue', IntegerType(), True),
     StructField('publication_type', StringType(), True)
     #Missing: 'publication_id'
     ])

dfJournal = spark.read.format('json').options(**OPTIONS).schema(schemaJournal).json(INPUT_FILE)
dfJournal = dfJournal.filter(col('publication_type') == 'Journal')
dfJournal = dfJournal.filter(col('issn') != 'null')
dfJournal = dfJournal.filter(col('venue') != 'null')
dfJournal = dfJournal.filter(col('issue') >= 0)
dfJournal = dfJournal.filter(col('volume') >= 0)
dfJournal = dfJournal.select('issn', 'publisher', 'venue', 'issue', 'volume')
dfJournal.printSchema()
dfJournal.show()

root
 |-- issn: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- issue: integer (nullable = true)
 |-- volume: integer (nullable = true)

+---------+--------------------+--------------------+-----+------+
|     issn|           publisher|               venue|issue|volume|
+---------+--------------------+--------------------+-----+------+
|0736-5853|            Elsevier|Telematics and In...|    1|    26|
|0377-2217|            Elsevier|European Journal ...|    3|   171|
|0020-0255| IOP Publishing Ltd.|Information Sciences|   16|   178|
|1574-1192|     Springer Verlag|Pervasive and Mob...|    4|     3|
|1364-8152|John Wiley and So...|Environmental Mod...|    6|    22|
|0377-2217|Taylor and Franci...|European Journal ...|    1|   170|
|0164-1212| Pleiades Publishing|Journal of System...|    1|    79|
|0377-2217|            Elsevier|European Journal ...|    2|   181|
|0196-6774|Editorial and Pub...|Journal of Algori...|    2|    47|

In [49]:
# CONFERENCE TABLE
schemaConf = StructType(
    [StructField('location', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('publication_type', StringType(), True)
     #Missing: 'publication_id'
     ])

dfConf = spark.read.format('json').options(**OPTIONS).schema(schemaConf).json(INPUT_FILE)
dfConf = dfConf.filter(col('publication_type') == 'Conference')
dfConf = dfConf.filter(col('venue') != 'null')
dfConf = dfConf.select('location', 'venue')
dfConf.printSchema()
dfConf.show()

root
 |-- location: string (nullable = true)
 |-- venue: string (nullable = true)

+-------------------+-----------------+
|           location|            venue|
+-------------------+-----------------+
|  Sao Paulo, Brazil|             AIAI|
|        Boston, USA|         FSKD (5)|
|       Chicago, USA|          MobiSys|
|  Sao Paulo, Brazil|             ICIP|
|Copenhagen, Denmark|            HICSS|
| Jakarta, Indonesia|             ACPR|
|      Madrid, Spain|           ICASSP|
|       Chicago, USA|   ICDM Workshops|
| Frankfurt, Germany|              KDD|
| Seoul, South Korea|              CHI|
|   Los Angeles, USA|             ICMI|
|      New York, USA|          UIC/ATC|
|        Boston, USA|      Edutainment|
|    Toronto, Canada|            ICWET|
|    Toronto, Canada|             ICSM|
| San Francisco, USA|       CODES+ISSS|
| Seoul, South Korea|             NBiS|
|       Tokyo, Japan|            ICICS|
|      Mumbai, India|             MMSP|
|       Osaka, Japan|Discovery Scienc

In [120]:
#Command 5 create a new column with the length of the paper(number of total pages)
dfPaper_total_pages = dfPaper \
    .filter((col('page_start') >= 0) & (col('page_end') >= 0) & (col('page_start') <= col('page_end'))) \
    .withColumn('total_pages', col('page_end') - col('page_start'))

dfPaper_total_pages \
    .select(col('title'), col('page_start'), col('page_end'), col('total_pages')) \
    .show(5, truncate=False)

+-------------------------------------------------------------+----------+--------+-----------+
|title                                                        |page_start|page_end|total_pages|
+-------------------------------------------------------------+----------+--------+-----------+
|Using XML to Integrate Existing Software Systems into the Web|167       |172     |5          |
|FCLOS                                                        |192       |220     |28         |
|Bhoomi                                                       |20        |31      |11         |
|Laps                                                         |962       |976     |14         |
|Mindful                                                      |3253      |3274    |21         |
+-------------------------------------------------------------+----------+--------+-----------+
only showing top 5 rows

