In [3]:
# Dowloading pyspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 42.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=32eacbda61a414ccdd3b1270bbec2c2c0dd65a67cd5bf597b345a78978900d60
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [5]:
# SparkSession is the entry point to the DataFrame API used by the cells that read bib.json
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a session with master "local"
# and the application name "Bibliography"
spark = (
    SparkSession.builder
    .master("local")
    .appName("Bibliography")
    .getOrCreate()
)

In [10]:
#AUTHOR TABLE
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import explode

# Shape of one element of the `authors` array.
# `_id` is declared non-nullable here, yet the printed schema below reports
# authorID as nullable = true.
author_struct = StructType([
    StructField('_id', StringType(), nullable = False),
    StructField('name', StringType(), True),
    StructField('email', StringType(), True),
    StructField('bio', StringType(), True),
])

# Only the top-level `authors` array is read from the file.
schemaAut = StructType([StructField('authors', ArrayType(author_struct), True)])

# One output row per element of `authors`, flattened to four plain columns,
# with `_id` exposed under the name `authorID`.
# NOTE(review): nothing here removes repeated authors; if the same author can
# appear in several records this table will contain duplicates - confirm.
dfAut = (
    spark.read
    .option("multiline", True)
    .schema(schemaAut)
    .json('/content/drive/MyDrive/bib.json')
    .select(explode("authors").alias("authors"))
    .selectExpr("authors._id AS authorID", "authors.name", "authors.email", "authors.bio")
)
dfAut.printSchema()
dfAut.show()

root
 |-- authorID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- bio: string (nullable = true)

+--------------------+-------------------+--------------------+--------------------+
|            authorID|               name|               email|                 bio|
+--------------------+-------------------+--------------------+--------------------+
|548a2e3ddabfae9b4...|     Harry M. Sneed|harry.m..sneedbc@...|My name is Harry ...|
|53f43b64dabfaefed...|  Ilias Michalarias|ilias.michalarias...|My name is Ilias ...|
|53f43354dabfaedd7...| Arkadiy Omelchenko|arkadiy.omelchenk...|My name is Arkadi...|
|53f443b6dabfaeecd...|  Hans-Joachim Lenz|hans-joachim.lenz...|My name is Hans-J...|
|53f43640dabfaedf4...|      Pradip Thomas|pradip.thomasfc@g...|My name is Pradip...|
|53f42d5cdabfaee2a...|     Patrik Eveborn|patrik.eveborn56@...|My name is Patrik...|
|53f433bedabfaee4d...|    Patrik Flisberg|patrik.flisberg17...|My name is Patrik.

In [33]:
# PAPER TABLE
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# (field name, Spark type) for every paper attribute that is read; all of them are nullable.
paper_fields = [
    ('_id', StringType()),
    ('title', StringType()),
    ('keywords', ArrayType(StringType())),
    ('fos', ArrayType(StringType())),
    ('references', ArrayType(StringType())),
    ('page_start', IntegerType()),
    ('page_end', IntegerType()),
    ('lang', StringType()),
    ('doi', StringType()),
    ('url', ArrayType(StringType())),
    ('abstract', StringType()),
    ('publication_type', StringType()),
]
schemaPaper = StructType([StructField(name, dtype, True) for name, dtype in paper_fields])

# Read the multi-line JSON file with the explicit schema (no schema inference).
paper_reader = spark.read.option("multiline", True).schema(schemaPaper)
dfPaper = paper_reader.json('/content/drive/MyDrive/bib.json')
dfPaper.printSchema()
dfPaper.show()

root
 |-- _id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+
|                 _id|               title|            keywords|                 fos|          references|page_start|page_end|l

In [None]:
# AFFILIATION TABLE
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import explode

# Per record: the paper `_id` plus, for each entry of `authors`, the author `_id` and `org`.
schemaAffiliation = StructType(
            [StructField('_id', StringType(), True),
             StructField('authors', ArrayType(StructType([
                    StructField('_id', StringType(), True),
                    StructField('org', StringType(), True)
             ])), True),
            ])

# One row per (paper, author) pair.
# The paper key is spelled `paperID` in every step so that the selects do not
# depend on case-insensitive column resolution (spark.sql.caseSensitive).
dfAff = (
    spark.read
    .option("multiline", True)
    .schema(schemaAffiliation)
    .json('/content/drive/MyDrive/bib.json')
    .withColumnRenamed("_id", "paperID")
    .select("paperID", explode("authors").alias("authors"))
    .selectExpr("paperID", "authors._id AS authorID", "authors.org AS organization")
)
dfAff.printSchema()
dfAff.show()

In [34]:
# Book table
from pyspark.sql.types import StructType, StructField, StringType, LongType
from pyspark.sql.functions import count, col, monotonically_increasing_id, collect_list, explode


# Preprocessing of the books: cleaning the records and merging the ones that describe the same book

# Only the fields needed to identify a book are read.
book_schema_preprocessing = StructType(
    [StructField('_id', StringType(), True),
     StructField('isbn', StringType(), True),
     StructField('publisher', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('publication_type', StringType(),True)])

# Reading the json file
dfbooks_to_filter = spark.read.option("multiline", True).schema(book_schema_preprocessing).json('/content/drive/MyDrive/bib.json')

# Keep the records typed 'Book' that have an isbn and a venue.
# `!= 'null'` rejects the literal string 'null'; rows holding a real NULL are dropped
# as well, because the comparison itself evaluates to NULL and filter() discards it.
dfbooks_to_filter = dfbooks_to_filter.filter(
    (col('publication_type') == 'Book')
    & (col('isbn') != 'null')
    & (col('venue') != 'null')
)

# One row per (isbn, venue): gather the publishers and the ids of the papers in that book.
dfbooks_to_filter = dfbooks_to_filter.groupBy('isbn', 'venue').agg(
    collect_list('publisher').alias('publishersArray'),
    collect_list('_id').alias('_id'),
)

# The first collected publisher is taken as the publisher of the book.
dfbooks_to_insert = dfbooks_to_filter.withColumn('publisher', col('publishersArray')[0]).select('venue', 'isbn', 'publisher', '_id')

# Adding the new column which is the id.
# monotonically_increasing_id() is non-deterministic (its value depends on the
# partition a row lands in), and this frame feeds both the book table and the
# paper foreign keys in separate jobs. Caching it keeps the ids assigned once
# and shared by both, instead of being re-evaluated per action.
df_books = dfbooks_to_insert.withColumn('publication id', monotonically_increasing_id()).cache()

# Adding the foreign key to the papers: one row per paper id (default explode column name 'col')
exploded_books = df_books.select(explode('_id'), 'publication id')
exploded_books.show(truncate = False)

# dfPaper comes from the paper-table cell, which must have been run before this one.
df_papers_in_books = exploded_books.join(dfPaper, exploded_books['col'] == dfPaper['_id'])
df_papers_in_books = df_papers_in_books.drop('col')

# The list of paper ids is no longer needed on the book rows.
df_books = df_books.drop('_id')

# Visualizing the data
print('Papers')
df_papers_in_books.show(truncate = False)
print('Schema of the books')
df_books.printSchema()
print('Books')
df_books.show(truncate=False)

+------------------------+--------------+
|col                     |publication id|
+------------------------+--------------+
|53e99940b7602d970217cbfd|0             |
|53e998d4b7602d970210e077|1             |
|53e99953b7602d9702191b62|1             |
|53e99940b7602d970217fa8e|2             |
|53e9981db7602d9702039906|3             |
|53e998efb7602d970212a57d|3             |
|53e9985fb7602d970209babb|4             |
|53e99821b7602d970203ea94|5             |
|53e99832b7602d9702056c3d|5             |
|53e9986eb7602d97020ac05e|5             |
|53e99813b7602d970202b21d|6             |
|53e99827b7602d970204a9c5|7             |
|53e99804b7602d97020172f1|8             |
|53e99808b7602d970201c0f8|8             |
|53e998e1b7602d970211d027|9             |
|53e99813b7602d970202cec7|10            |
|53e99818b7602d970203323d|10            |
|53e99846b7602d97020756fc|10            |
|53e9984bb7602d970207e0c5|10            |
|53e9984fb7602d9702083e48|10            |
+------------------------+--------

In [35]:
# Journal table
from pyspark.sql.types import StructType, StructField, StringType, LongType
from pyspark.sql.functions import count, col, monotonically_increasing_id, collect_list, explode

# Preprocessing of the journals: cleaning the records and merging the ones that describe the same journal issue

# Only the fields needed to identify a journal issue are read.
journal_schema_preprocessing = StructType(
    [StructField('_id', StringType(), True),
     StructField('issn', StringType(), True),
     StructField('publisher', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('volume', StringType(), True),
     StructField('issue', StringType(), True),
     StructField('publication_type', StringType(),True)])

# Reading the json file
df_journals_to_filter = spark.read.option("multiline", True).schema(journal_schema_preprocessing).json('/content/drive/MyDrive/bib.json')

# Keep the records typed 'Journal' that have issn, venue, issue and volume.
# `!= 'null'` rejects the literal string 'null'; rows holding a real NULL are dropped
# as well, because the comparison itself evaluates to NULL and filter() discards it.
df_journals_to_filter = df_journals_to_filter.filter(
    (col('publication_type') == 'Journal')
    & (col('issn') != 'null')
    & (col('venue') != 'null')
    & (col('issue') != 'null')
    & (col('volume') != 'null')
)

# One row per (venue, volume, issue, issn): gather the publishers and the paper ids.
df_journals_to_filter = df_journals_to_filter.groupBy('venue', 'volume', 'issue', 'issn').agg(
    collect_list('publisher').alias('publishersArray'),
    collect_list('_id').alias('_id'),
)

# The first collected publisher is taken as the publisher of the journal.
# NOTE(review): 'issn' is part of the grouping key but is not selected, so two rows
# that differ only by issn become indistinguishable here - confirm that is intended.
df_journals_to_insert = df_journals_to_filter.withColumn('publisher', col('publishersArray')[0]).select('venue', 'volume', 'issue', 'publisher', '_id')

# Adding the new column which contains the publication identifier.
# monotonically_increasing_id() is non-deterministic (its value depends on the
# partition a row lands in), and this frame feeds both the journal table and the
# paper foreign keys in separate jobs. Caching it keeps the ids assigned once
# and shared by both, instead of being re-evaluated per action.
df_journals = df_journals_to_insert.withColumn("publication id", monotonically_increasing_id()).cache()

# Adding the foreign key to the papers: one row per paper id (default explode column name 'col')
exploded_journals = df_journals.select(explode('_id'), 'publication id')
exploded_journals.show(truncate = False)

# dfPaper comes from the paper-table cell, which must have been run before this one.
df_papers_in_journals = exploded_journals.join(dfPaper, exploded_journals['col'] == dfPaper['_id'], "inner")
df_papers_in_journals = df_papers_in_journals.drop('col')

# The list of paper ids is no longer needed on the journal rows.
df_journals = df_journals.drop('_id')

# Visualizing the data
print('Papers')
df_papers_in_journals.show(truncate = False)
print('Schema of the journals')
df_journals.printSchema()
print('Journals')
df_journals.show(truncate=False)

+------------------------+--------------+
|col                     |publication id|
+------------------------+--------------+
|53e99915b7602d970214eaa6|0             |
|53e99915b7602d97021502bb|1             |
|53e99984b7602d97021c6a1c|2             |
|53e998b0b7602d97020ebee5|3             |
|53e9994cb7602d970218a5b5|4             |
|53e9990db7602d970214c179|5             |
|53e99952b7602d9702190cba|6             |
|53e99800b7602d970200de91|7             |
|53e99858b7602d970209388f|8             |
|53e9998bb7602d97021d0975|9             |
|53e997f1b7602d9701ff00c4|10            |
|53e997ddb7602d9701fd2cc7|11            |
|53e99946b7602d9702182de6|12            |
|53e99858b7602d9702090f11|13            |
|53e997f4b7602d9701ff6953|14            |
|53e99991b7602d97021d34e9|15            |
|53e99832b7602d9702055ead|16            |
|53e9984bb7602d970207bddc|17            |
|53e9982cb7602d970204e106|18            |
|53e99813b7602d970202d7d6|19            |
+------------------------+--------

In [7]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
# NOTE(review): the cells above read /content/drive/MyDrive/bib.json, and this cell's
# execution count (7) is lower than theirs, i.e. it was run before them. For a
# top-to-bottom run it has to come before the first read - consider moving it up.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
# Conference table
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import count, col, monotonically_increasing_id, collect_list, explode

# Preprocessing of the conferences: cleaning the records and merging the ones that describe the same conference

# Only the fields needed to identify a conference are read.
schemaConf = StructType(
    [StructField('_id', StringType(), True),
     StructField('location', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('publication_type', StringType(),True)])
# Reading the json file
df_conferences_to_filter = spark.read.option("multiline", True).schema(schemaConf).json('/content/drive/MyDrive/bib.json')

# Keep the records typed 'Conference' that have a venue.
# `!= 'null'` rejects the literal string 'null'; rows holding a real NULL are dropped
# as well, because the comparison itself evaluates to NULL and filter() discards it.
df_conferences_to_filter = df_conferences_to_filter.filter(
    (col('publication_type') == 'Conference')
    & (col('venue') != 'null')
)

# One row per venue: gather the locations and the ids of the papers presented there.
df_conferences_to_filter = df_conferences_to_filter.groupBy('venue').agg(
    collect_list('location').alias('locations_array'),
    collect_list('_id').alias('_id'),
)

# The first collected location is taken as the location of the conference.
df_conferences_to_insert = df_conferences_to_filter.withColumn('location', col('locations_array')[0]).select('venue', 'location', '_id')

# Adding the new column which is the id.
# monotonically_increasing_id() is non-deterministic (its value depends on the
# partition a row lands in), and this frame feeds both the conference table and the
# paper foreign keys in separate jobs. Caching it keeps the ids assigned once
# and shared by both, instead of being re-evaluated per action.
df_conferences = df_conferences_to_insert.withColumn('publication id', monotonically_increasing_id()).cache()

# Adding the foreign key to the papers: one row per paper id (default explode column name 'col')
exploded_conferences = df_conferences.select(explode('_id'), 'publication id')
exploded_conferences.show(truncate = False)

# dfPaper comes from the paper-table cell, which must have been run before this one.
df_papers_in_conferences = exploded_conferences.join(dfPaper, exploded_conferences['col'] == dfPaper['_id'])
df_papers_in_conferences = df_papers_in_conferences.drop('col')

# The list of paper ids is no longer needed on the conference rows.
df_conferences = df_conferences.drop('_id')

# Visualizing the data
print('Papers')
df_papers_in_conferences.show(truncate = False)
print('Schema of the conferences')
df_conferences.printSchema()
print('Conferences')
df_conferences.show(truncate=False)

+------------------------+--------------+
|col                     |publication id|
+------------------------+--------------+
|53e99854b7602d970208cfc7|0             |
|53e9989bb7602d97020d636a|0             |
|53e998bfb7602d97020f886d|0             |
|53e998c0b7602d97020fbb5c|0             |
|53e998e9b7602d9702127e6f|0             |
|53e99976b7602d97021b506c|0             |
|53e9997eb7602d97021bd1e8|0             |
|53e99991b7602d97021d53ff|0             |
|53e99998b7602d97021dbc6a|0             |
|53e99859b7602d9702093eac|1             |
|53e99860b7602d970209d3cb|1             |
|53e998bfb7602d97020fa178|2             |
|53e9997eb7602d97021bd8cb|3             |
|53e998fdb7602d970213e8da|4             |
|53e9990db7602d970214a4c9|5             |
|53e99822b7602d9702043750|6             |
|53e998bfb7602d97020f69dd|6             |
|53e9991cb7602d97021559a1|6             |
|53e99940b7602d970217ea90|6             |
|53e99937b7602d9702172533|7             |
+------------------------+--------

In [45]:
# Merging the three dataframes, each of which contains the papers published in one kind of media.
# unionByName matches the columns by name, so the result does not rely on the three
# frames listing their columns in exactly the same order (plain union is positional).
# NOTE(review): the sample outputs of the book, journal and conference cells all show
# 'publication id' values starting at 0, so after this merge the id appears to be unique
# only together with 'publication_type' - confirm this is intended.
df_papers = (
    df_papers_in_books
    .unionByName(df_papers_in_journals)
    .unionByName(df_papers_in_conferences)
)

# Visualizing the data
print('Papers schema')
df_papers.printSchema()
print('Papers data')
df_papers.show(truncate = False)

Papers schema
root
 |-- publication id: long (nullable = false)
 |-- _id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)

Papers data
+--------------+------------------------+-------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------

In [48]:
# For checking the result: show, per publication type, a sample of the merged papers
# together with the publication id they were linked to.
checks = [
    ('Papers published in books', 'Book'),
    ('Papers published in journals', 'Journal'),
    ('Papers published in conferences', 'Conference'),
]
for heading, pub_type in checks:
    print(heading)
    papers_of_type = df_papers.filter(col('publication_type') == pub_type)
    papers_of_type.select('_id', 'title', 'publication_type', 'publication id').show(truncate = False)


Papers published in books
+------------------------+-------------------------------------------------------------------------+----------------+--------------+
|_id                     |title                                                                    |publication_type|publication id|
+------------------------+-------------------------------------------------------------------------+----------------+--------------+
|53e997d1b7602d9701fc5024|A fault diagnosis methodology for the UltraSPARC/sup TM/-I microprocessor|Book            |1775          |
|53e997e4b7602d9701fda80a|Problem Decomposition and the Learning of Skills                         |Book            |5443          |
|53e997e8b7602d9701fe0ddc|X-tract: Structure Extraction from Botanical Textual Descriptions        |Book            |204           |
|53e997e8b7602d9701fe213d|Cognitive agent programming                                              |Book            |3210          |
|53e997e9b7602d9701fe3ba8|Constraint based 