In [150]:
# SparkSession is the handle through which we load and query the dataset
from pyspark.sql import SparkSession

# Entry point to the PySpark application: a session named "Bibliography"
# on the "local" master. getOrCreate() hands back an already-running
# session if one exists, so re-running this cell does not start a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Bibliography")
    .getOrCreate()
)

In [151]:
#CHECK DATASET PROBLEMS ON A TEST DF
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import explode, col

# Explicit schema for the exploratory read: only the fields listed here are
# loaded from the file, each declared nullable.
schema = StructType(
            [StructField('_id', StringType(), True),
             StructField('title', StringType(),True),
             StructField('authors', ArrayType(StructType([
                StructField('_id', StringType(), True),
                StructField('name', StringType(), True),
                StructField('email', StringType(), True),
                StructField('bio', StringType(), True),
                ])), True),
             StructField('keywords', ArrayType(StringType()), True),
             StructField('fos', ArrayType(StringType()), True),
             StructField('page_start', IntegerType(), True),
             StructField('page_end', IntegerType(), True),
             StructField('lang', StringType(),True),
             StructField('issue', StringType(), True),
             StructField('isbn', StringType(),True),
             StructField('doi', StringType(),True),
            ])


# "multiline" lets a single JSON record span several lines of the file.
df = spark.read.format('json').option("multiline", True).schema(schema).json('bib.json')

# Checks on the 'issue' column to see what kind of data it holds.
# The two rlike patterns are unanchored, so they match values that merely
# CONTAIN a digit / a letter; the categories below can therefore overlap
# (e.g. the literal text 'null' is also counted by the letter check).
issue_col = col('issue')
cnt = df.filter(issue_col.rlike('[0-9]+')).count()       # contains at least one digit
null = df.filter(issue_col == 'null').count()            # literal string 'null'
string = df.filter(issue_col.rlike('[A-Za-z]+')).count() # contains at least one letter
empty = df.filter(issue_col == '').count()               # empty string
# Genuinely missing values are NOT caught by any comparison above: comparing
# SQL NULL with a string yields NULL, and filter() drops such rows. Count them
# explicitly so the summary accounts for them.
missing = df.filter(issue_col.isNull()).count()
print(cnt, null, string, empty, missing)

df.printSchema()
df.show()

                                                                                

26247 306 590 0
root
 |-- _id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- doi: string (nullable = true)



[Stage 236:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+-----+-------------+--------------------+
|                 _id|               title|             authors|            keywords|                 fos|page_start|page_end|lang|issue|         isbn|                 doi|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+-----+-------------+--------------------+
|53e99784b7602d970...|Using XML to Inte...|[{548a2e3ddabfae9...|[internet, hyperm...|[xml base, world ...|       167|     172|  en| null|0-7695-1727-7|10.1109/CMPSAC.20...|
|53e99784b7602d970...|               FCLOS|[{53f43b64dabfaef...|[molap, subsumpti...|[information syst...|       192|     220|  en|    2|         null|10.1016/j.datak.2...|
|53e99785b7602d970...|              Bhoomi|[{53f43640dabfaed...|[icts, e governan...|[revenue, transpa...|        20|      31|  en|    

                                                                                

In [152]:
#AUTHOR TABLE
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import explode

# Shape of one entry of the 'authors' array; every field is a nullable string.
author_struct = StructType([
    StructField('_id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('email', StringType(), True),
    StructField('bio', StringType(), True),
])

# Only the 'authors' array is read from the file for this table.
schemaAut = StructType([StructField('authors', ArrayType(author_struct), True)])

# Pipeline:
#   1. read the file with the reduced schema,
#   2. explode the array so each author entry becomes its own row
#      (the exploded column is named "col" by default, hence the rename),
#   3. flatten the struct into top-level columns,
#   4. expose the author's _id as "authorID".
# Note: this yields one row per author occurrence; no de-duplication is done here.
dfAut = (
    spark.read.format('json')
    .option("multiline", True)
    .schema(schemaAut)
    .json('bib.json')
    .select(explode("authors"))
    .withColumnRenamed("col", "authors")
    .select("authors._id", "authors.name", "authors.email", "authors.bio")
    .withColumnRenamed("_id", "authorID")
)

dfAut.printSchema()
dfAut.show()

root
 |-- authorID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- bio: string (nullable = true)

+--------------------+-------------------+--------------------+--------------------+
|            authorID|               name|               email|                 bio|
+--------------------+-------------------+--------------------+--------------------+
|548a2e3ddabfae9b4...|     Harry M. Sneed|harry.m..sneedbc@...|My name is Harry ...|
|53f43b64dabfaefed...|  Ilias Michalarias|ilias.michalarias...|My name is Ilias ...|
|53f43354dabfaedd7...| Arkadiy Omelchenko|arkadiy.omelchenk...|My name is Arkadi...|
|53f443b6dabfaeecd...|  Hans-Joachim Lenz|hans-joachim.lenz...|My name is Hans-J...|
|53f43640dabfaedf4...|      Pradip Thomas|pradip.thomasfc@g...|My name is Pradip...|
|53f42d5cdabfaee2a...|     Patrik Eveborn|patrik.eveborn56@...|My name is Patrik...|
|53f433bedabfaee4d...|    Patrik Flisberg|patrik.flisberg17...|My name is Patrik.

                                                                                

In [153]:
# PAPER TABLE
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# (column name, Spark type) pairs for the paper table, in output column order.
paper_fields = [
    ('_id', StringType()),
    ('title', StringType()),
    ('keywords', ArrayType(StringType())),
    ('fos', ArrayType(StringType())),
    ('page_start', IntegerType()),
    ('page_end', IntegerType()),
    ('lang', StringType()),
    ('doi', StringType()),
    ('url', ArrayType(StringType())),
    ('abstract', StringType()),
    ('publication_type', StringType()),
]

# Every column is declared nullable.
schemaPaper = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in paper_fields]
)

# Read only the paper-level fields from the file; "multiline" lets a single
# JSON record span several lines.
paper_reader = spark.read.format('json').option("multiline", True).schema(schemaPaper)
dfPaper = paper_reader.json('bib.json')

dfPaper.printSchema()
dfPaper.show()

root
 |-- _id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)



                                                                                

+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+
|                 _id|               title|            keywords|                 fos|page_start|page_end|lang|                 doi|                 url|            abstract|publication_type|
+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+
|53e99784b7602d970...|Using XML to Inte...|[internet, hyperm...|[xml base, world ...|       167|     172|  en|10.1109/CMPSAC.20...|[http://dx.doi.or...|The eXtensible Ma...|            Book|
|53e99784b7602d970...|               FCLOS|[molap, subsumpti...|[information syst...|       192|     220|  en|10.1016/j.datak.2...|[http://dx.doi.or...|Mobile online ana...|         Journal|
|53e99785b7602d970...|              Bhoomi|[i

In [154]:
# AFFILIATION TABLE
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import explode

# Reduced schema for the paper-author affiliation table: the paper's own _id
# plus, for each entry of the 'authors' array, the author's _id and 'org'.
schemaAffiliation = StructType(
            [StructField('_id', StringType(), True),
             StructField('authors', ArrayType(StructType([
                    StructField('_id', StringType(), True),
                    StructField('org', StringType(), True)
             ])), True),
            ])

dfAff = spark.read.format('json').option("multiline", True).schema(schemaAffiliation).json('bib.json')

# Rename the paper key using exactly the casing the selects below refer to
# ("paperId"), so the column lookups do not depend on Spark's
# case-insensitive name resolution.
dfAff = dfAff.withColumnRenamed("_id", "paperId")

# One row per (paper, author entry); explode() names its output "col".
dfAff = dfAff.select("paperId", explode(dfAff.authors))
dfAff = dfAff.withColumnRenamed("col", "authors")

# Flatten the author struct and expose the author's _id as "authorID".
dfAff = dfAff.select("paperId", "authors._id", "authors.org")
dfAff = dfAff.withColumnRenamed("_id", "authorID")
# The affiliation column keeps its source name 'org': there is no
# "organization" column at this point, so no further rename is applied.

dfAff.printSchema()
dfAff.show()

root
 |-- paperId: string (nullable = true)
 |-- authorID: string (nullable = true)
 |-- org: string (nullable = true)

+--------------------+--------------------+--------------------+
|             paperId|            authorID|                 org|
+--------------------+--------------------+--------------------+
|53e99784b7602d970...|548a2e3ddabfae9b4...|                null|
|53e99784b7602d970...|53f43b64dabfaefed...|Corresponding aut...|
|53e99784b7602d970...|53f43354dabfaedd7...|Freie Universität...|
|53e99784b7602d970...|53f443b6dabfaeecd...|Freie Universität...|
|53e99785b7602d970...|53f43640dabfaedf4...|Tel.: +61 7 336 5...|
|53e9978ab7602d970...|53f42d5cdabfaee2a...|Optimal Solutions...|
|53e9978ab7602d970...|53f433bedabfaee4d...|Division of Optim...|
|53e9978ab7602d970...|53f4538adabfaec22...|Division of Optim...|
|53e9978db7602d970...|5448a3b5dabfae87b...|Corresponding aut...|
|53e9978db7602d970...|54343235dabfaebba...|CILab – Computati...|
|53e9978db7602d970...|5405bdb6dabfa