In [111]:
# Dowloading pyspark
#!pip install pyspark

PREPARE THE ENVIRONMENT, UPLOAD DATA, PREPROCESS DATA AND CREATE THE TABLES: Author, Paper, Affiliation, Book, Journal and Conference

In [112]:
# With sparkSession we create a connection to our database
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, TimestampType
from pyspark.sql.functions import count, col, monotonically_increasing_id, collect_list, explode

# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("Bibliography") \
      .getOrCreate()

INPUT_FILE = "bibliography.json"
OPTIONS = {'multiline': 'true', 'allowNumericLeadingZero': 'true','timestampFormat': "yyyy-MM-dd'T'HH:mm:ss[.ZZZ'Z']"}

In [113]:
#AUTHOR TABLE
schemaAut = StructType(
            [StructField('authors', ArrayType(StructType([
                StructField('_id', StringType(), nullable = False),
                StructField('name', StringType(), True),
                StructField('email', StringType(), True),
                StructField('bio', StringType(), True),
                ])), True)
            ])

dfAut = spark.read.format('json').options(**OPTIONS).schema(schemaAut).json(INPUT_FILE)
dfAut = dfAut.select(explode(dfAut.authors))
dfAut = dfAut.withColumnRenamed("col", "authors")
dfAut = dfAut.filter(col("authors._id") != "null").select("authors._id","authors.name","authors.email", "authors.bio")
dfAut = dfAut.withColumnRenamed("_id", "authorID")
dfAut = dfAut.dropDuplicates(["authorID"])
dfAut.printSchema()
dfAut.show()

root
 |-- authorID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- bio: string (nullable = true)



[Stage 986:>                                                        (0 + 1) / 1]

+--------------------+------------------+--------------------+--------------------+
|            authorID|              name|               email|                 bio|
+--------------------+------------------+--------------------+--------------------+
|53f3186fdabfae9a8...|   A. M. A. Hariri|a..m..a..hariride...|My name is A. M. ...|
|53f3186fdabfae9a8...|    Matthew Prowse|matthew.prowsefb@...|My name is Matthe...|
|53f31870dabfae9a8...|       Sui-ping Qi|sui-ping.qi19@gma...|My name is Sui-pi...|
|53f31871dabfae9a8...|     Renato Fabbri|renato.fabbrib7@g...|My name is Renato...|
|53f31873dabfae9a8...|   Joachim Schimpf|joachim.schimpf8a...|My name is Joachi...|
|53f31874dabfae9a8...|    E. Di Bernardo|e..di.bernardo10@...|My name is E. Di ...|
|53f31875dabfae9a8...|    Steven F. Roth|steven.f..roth46@...|My name is Steven...|
|53f31878dabfae9a8...|      Nima Zahadat|nima.zahadat3d@gm...|My name is Nima Z...|
|53f3187ddabfae9a8...|         Ke Fa Cen|ke.fa.cen23@gmail...|My name is Ke 

                                                                                

In [114]:
# PAPER TABLE WITHOUT PUBLICATION_ID
schemaPaper = StructType(
            [StructField('_id', StringType(), True),
             StructField('title', StringType(),True),
             StructField('keywords', ArrayType(StringType()), True),
             StructField('fos', ArrayType(StringType()), True),
             StructField('references', ArrayType(StringType()), True),
             StructField('page_start', IntegerType(), True),
             StructField('page_end', IntegerType(), True),
             StructField('lang', StringType(),True),
             StructField('doi', StringType(),True),
             StructField('url', ArrayType(StringType()),True),
             StructField('abstract', StringType(),True),
             StructField('publication_type', StringType(),True),
             StructField('date', TimestampType(), True)
            ])

dfPaper = spark.read.format('json').options(**OPTIONS).schema(schemaPaper).json(INPUT_FILE)
dfPaper = dfPaper.withColumnRenamed("_id", "paperID")
dfPaper.printSchema()
dfPaper.show()

root
 |-- paperID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)
 |-- date: timestamp (nullable = true)



[Stage 989:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|             paperID|               title|            keywords|                 fos|          references|page_start|page_end|lang|                 doi|                 url|            abstract|publication_type|               date|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|53e99784b7602d970...|Using XML to Inte...|[internet, hyperm...|[xml base, world ...|[53e9adbdb7602d97...|       167|     172|  en|10.1109/CMPSAC.20...|[http://dx.doi.or...|The eXtensible Ma...|            Book|1974-09-13 06:34:29|
|53e99784b7602d970...|               FCLOS|[molap, subsumpti...|[informa

                                                                                

In [115]:
# AFFILIATION TABLE
schemaAffiliation = StructType(
            [StructField('_id', StringType(), True),
             StructField('authors', ArrayType(StructType([
                    StructField('_id', StringType(), True),
                    StructField('org', StringType(), True)
             ])), True),
            ])

dfAff = spark.read.format('json').options(**OPTIONS).schema(schemaAffiliation).json(INPUT_FILE)
dfAff = dfAff.withColumnRenamed("_id", "paperID")
dfAff = dfAff.select("paperId", explode(dfAff.authors))
dfAff = dfAff.withColumnRenamed("col", "authors")
dfAff = dfAff.filter(col("authors._id") != "null").filter(col("paperId") != "null").select("paperId", "authors._id","authors.org")
dfAff = dfAff.withColumnRenamed("_id", "authorID")
dfAff = dfAff.dropDuplicates(["authorID", "paperID"])
dfAff = dfAff.withColumnRenamed("org", "organization")
dfAff.printSchema()
dfAff.show()

root
 |-- paperId: string (nullable = true)
 |-- authorID: string (nullable = true)
 |-- organization: string (nullable = true)



[Stage 990:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|             paperId|            authorID|        organization|
+--------------------+--------------------+--------------------+
|53e998c7b7602d970...|53f3186fdabfae9a8...|Department of Sta...|
|53e99827b7602d970...|53f3186fdabfae9a8...|Laboratory for Fo...|
|53e99924b7602d970...|53f31870dabfae9a8...|Henan Academy of ...|
|53e998dbb7602d970...|53f31871dabfae9a8...|Instituto de Físi...|
|53e998f6b7602d970...|53f31873dabfae9a8...|                null|
|53e998bfb7602d970...|53f31874dabfae9a8...|                null|
|53e9984bb7602d970...|53f31875dabfae9a8...|                null|
|53e998e8b7602d970...|53f31878dabfae9a8...|George Mason Univ...|
|53e99905b7602d970...|53f3187ddabfae9a8...|State Key Laborat...|
|53e998e9b7602d970...|53f31881dabfae9a8...|                null|
|53e9984fb7602d970...|53f31881dabfae9a8...|Tecnologico de Mo...|
|53e9980eb7602d970...|53f31883dabfae9a8...|University of Was...|
|53e997e9b7602d970...|53f

                                                                                

In [116]:
# BOOK TABLE
# Preprocessing of the books for cleaning and merging the books
book_schema_preprocessing = StructType(
    [StructField('_id', StringType(), True),
     StructField('isbn', StringType(), True),
     StructField('publisher', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('publication_type', StringType(),True)])

# Reading the json file
dfbooks_to_filter = spark.read.format('json').options(**OPTIONS).schema(book_schema_preprocessing).json(INPUT_FILE)

# Filtering and adjusting the dataframe
dfbooks_to_filter = dfbooks_to_filter.filter(col('publication_type') == 'Book').filter(col('isbn') != 'null').filter(col('venue') != 'null')
dfbooks_to_filter = dfbooks_to_filter.groupBy('isbn', 'venue').agg(collect_list('publisher').alias('publishersArray'), collect_list('_id').alias('_id'), count(col('publisher'))) # count can be removed (I was interested in evaluating if the group by was meaningful)
dfbooks_to_insert = dfbooks_to_filter.withColumn('publisher', dfbooks_to_filter['publishersArray'][0]).select('venue', 'isbn', 'publisher', '_id')

# Adding the new column which is the id
df_books = dfbooks_to_insert.withColumn('publication id', monotonically_increasing_id())

# Adding the foreign key to the papers
exploded_books = df_books.select(explode('_id'), 'publication id')
# exploded_books.show(truncate = False)

df_papers_in_books = exploded_books.join(dfPaper, exploded_books.col == dfPaper.paperID)
df_papers_in_books = df_papers_in_books.drop('col')

df_books = df_books.drop(df_books._id)

# Visualizing the data
# print('Papers')
# df_papers_in_books.show(truncate = False)
print('Schema of the books')
df_books.printSchema()
print('Books')
df_books.show(truncate=False)

Schema of the books
root
 |-- venue: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publication id: long (nullable = false)

Books
+------------------------------------------------------------------------------------+-------------+--------------------------------------------------+--------------+
|venue                                                                               |isbn         |publisher                                         |publication id|
+------------------------------------------------------------------------------------+-------------+--------------------------------------------------+--------------+
|ACM SIGSOFT Software Engineering Notes                                              |-159593-125-2|AGH University of Science and Technology          |0             |
|Theor. Comput. Sci.                                                                 |0-0304-3975  |Elsevier                                 

                                                                                

In [117]:
# JOURNAL TABLE
# Preprocessing of the journals for cleaning and merging the journals

journal_schema_preprocessing = StructType(
    [StructField('_id', StringType(), True),
     StructField('issn', StringType(), True),
     StructField('publisher', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('volume', IntegerType(), True),
     StructField('issue', IntegerType(), True),
     StructField('publication_type', StringType(),True)])

# Reading the json file
df_journals_to_filter = spark.read.format('json').options(**OPTIONS).schema(journal_schema_preprocessing).json(INPUT_FILE)

# Filtering and adjusting the dataframe
df_journals_to_filter = df_journals_to_filter.filter(col('publication_type') == 'Journal').filter(col('issn') != 'null').filter(col('venue') != 'null').filter(col('issue') >= 0).filter(col('volume') >= 0)
df_journals_to_filter = df_journals_to_filter.groupBy('venue', 'volume', 'issue', 'issn').agg(collect_list('publisher').alias('publishersArray'), collect_list('_id').alias('_id'), count(col('publisher'))) # count can be removed (I was interested in evaluating if the group by was meaningful)
df_journals_to_insert = df_journals_to_filter.withColumn('publisher', df_journals_to_filter['publishersArray'][0]).select('venue', 'volume', 'issue', 'publisher', 'issn', '_id')

# Adding the new column which contains the publication identifier
df_journals = df_journals_to_insert.withColumn("publication id", monotonically_increasing_id())

# Adding the foreign key to the papers
exploded_journals = df_journals.select(explode('_id'), 'publication id')
#exploded_journals.show(truncate = False)

df_papers_in_journals = exploded_journals.join(dfPaper, exploded_journals.col == dfPaper.paperID, "inner")
df_papers_in_journals = df_papers_in_journals.drop('col')

df_journals = df_journals.drop(df_journals._id)

# Visualizing the data
# print('Papers')
# df_papers_in_journals.show(truncate = False)
print('Schema of the journals')
df_journals.printSchema()
print('Journals')
df_journals.show(truncate=False)

Schema of the journals
root
 |-- venue: string (nullable = true)
 |-- volume: integer (nullable = true)
 |-- issue: integer (nullable = true)
 |-- publisher: string (nullable = true)
 |-- issn: string (nullable = true)
 |-- publication id: long (nullable = false)

Journals
+-----------------------+------+-----+------------------------------------------------------------------+---------+--------------+
|venue                  |volume|issue|publisher                                                         |issn     |publication id|
+-----------------------+------+-----+------------------------------------------------------------------+---------+--------------+
|4OR                    |2     |4    |Inderscience Publishers                                           |1614-2411|0             |
|4OR                    |3     |1    |Accent Social and Welfare Society                                 |1614-2411|1             |
|4OR                    |4     |1    |Innovative Information Science & 

                                                                                

In [118]:
#from google.colab import drive
#drive.mount('/content/drive')

In [119]:
# CONFERENCE TABLE
# Preprocessing of the books for cleaning and merging the books

schemaConf = StructType(
    [StructField('_id', StringType(), True),
     StructField('location', StringType(), True),
     StructField('venue', StringType(), True),
     StructField('publication_type', StringType(),True)])
# Reading the json file
df_conferences_to_filter = spark.read.format('json').options(**OPTIONS).schema(schemaConf).json(INPUT_FILE)

# Filtering and adjusting the dataframe
df_conferences_to_filter = df_conferences_to_filter.filter(col('publication_type') == 'Conference').filter(col('venue') != 'null')
df_conferences_to_filter = df_conferences_to_filter.groupBy('venue').agg(collect_list('location').alias('locations_array'), collect_list('_id').alias('_id'), count(col('location'))) # count can be removed (I was interested in evaluating if the group by was meaningful)
df_conferences_to_insert = df_conferences_to_filter.withColumn('location', df_conferences_to_filter['locations_array'][0]).select('venue', 'location', '_id')

# Adding the new column which is the id
df_conferences = df_conferences_to_insert.withColumn('publication id', monotonically_increasing_id())

# Adding the foreign key to the papers
exploded_conferences = df_conferences.select(explode('_id'), 'publication id')
#exploded_conferences.show(truncate = False)

df_papers_in_conferences = exploded_conferences.join(dfPaper, exploded_conferences.col == dfPaper.paperID)
df_papers_in_conferences = df_papers_in_conferences.drop('col')

df_conferences = df_conferences.drop(df_conferences._id)

# Visualizing the data
#print('Papers')
#df_papers_in_conferences.show(truncate = False)
print('Schema of the conferences')
df_conferences.printSchema()
print('Conferences')
df_conferences.show(truncate=False)

Schema of the conferences
root
 |-- venue: string (nullable = true)
 |-- location: string (nullable = true)
 |-- publication id: long (nullable = false)

Conferences
+----------------------------------------------------------------------------------------+--------------------------+--------------+
|venue                                                                                   |location                  |publication id|
+----------------------------------------------------------------------------------------+--------------------------+--------------+
|"EDUCON                                                                                 |Moscow, Russia            |0             |
|2012 50TH ANNUAL ALLERTON CONFERENCE ON COMMUNICATION, CONTROL, AND COMPUTING (ALLERTON)|Dublin, Ireland           |1             |
|2985415099                                                                              |Mexico City, Mexico       |2             |
|2985532720                         

                                                                                

In [120]:
# Merging the 3 dataframe which one contains the papers published in a specific media
df_papers = df_papers_in_books.union(df_papers_in_journals).union(df_papers_in_conferences)

# Visualizing the data
print('Papers schema')
df_papers.printSchema()
print('Papers data')
df_papers.show()

Papers schema
root
 |-- publication id: long (nullable = false)
 |-- paperID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- page_start: integer (nullable = true)
 |-- page_end: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- url: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- abstract: string (nullable = true)
 |-- publication_type: string (nullable = true)
 |-- date: timestamp (nullable = true)

Papers data


[Stage 1005:>               (0 + 1) / 1][Stage 1007:>               (0 + 0) / 1]

+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|publication id|             paperID|               title|            keywords|                 fos|          references|page_start|page_end|lang|                 doi|                 url|            abstract|publication_type|               date|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------+----+--------------------+--------------------+--------------------+----------------+-------------------+
|          1775|53e997d1b7602d970...|A fault diagnosis...|[stuck at defects...|[computer testing...|[53e9bca6b7602d97...|       494|     494|  en|10.1109/EDTC.1997...|[http://dx.doi.or...|In this paper we ...|            Book|1950-09-08 12:23:35|
|          5

                                                                                

In [121]:
# For checking the result
print('Papers published in books')
df_papers.filter(col('publication_type') == 'Book').select('paperID', 'title', 'publication_type', 'publication id').show()
print('Papers published in journals')
df_papers.filter(col('publication_type') == 'Journal').select('paperID', 'title', 'publication_type', 'publication id').show()
print('Papers published in conferences')
df_papers.filter(col('publication_type') == 'Conference').select('paperID', 'title', 'publication_type', 'publication id').show()


Papers published in books


                                                                                

+--------------------+--------------------+----------------+--------------+
|             paperID|               title|publication_type|publication id|
+--------------------+--------------------+----------------+--------------+
|53e997d1b7602d970...|A fault diagnosis...|            Book|          1775|
|53e997e4b7602d970...|Problem Decomposi...|            Book|          5443|
|53e997e8b7602d970...|X-tract: Structur...|            Book|           204|
|53e997e8b7602d970...|Cognitive agent p...|            Book|          3210|
|53e997e9b7602d970...|Constraint based ...|            Book|          2086|
|53e997ecb7602d970...|Automatic input r...|            Book|          7645|
|53e997ecb7602d970...|Acceptability-ori...|            Book|          2962|
|53e997ecb7602d970...|Anomalous Neighbo...|            Book|          7756|
|53e997f1b7602d970...|A Digital Watermark.|            Book|          1635|
|53e997f4b7602d970...|Independent Tree ...|            Book|          5856|
|53e997f5b76

                                                                                

+--------------------+--------------------+----------------+--------------+
|             paperID|               title|publication_type|publication id|
+--------------------+--------------------+----------------+--------------+
|53e99915b7602d970...|A note on robust ...|         Journal|             0|
|53e99984b7602d970...|A new approach fo...|         Journal|             1|
|53e998b0b7602d970...|Two-machine flow ...|         Journal|             2|
|53e9994cb7602d970...|Stochastic semide...|         Journal|             3|
|53e9990db7602d970...|Models and algori...|         Journal|             4|
|53e99952b7602d970...|Maximizing the mi...|         Journal|             5|
|53e99800b7602d970...|Integer extended ...|         Journal|             6|
|53e99858b7602d970...|Attraction probab...|         Journal|             7|
|53e9998bb7602d970...|Strategy vs risk ...|         Journal|             8|
|53e99915b7602d970...|A necessary 4-cyc...|         Journal|             9|
|53e997f1b76

[Stage 1045:> (0 + 1) / 1][Stage 1047:> (0 + 0) / 1][Stage 1049:> (0 + 0) / 1]]

+--------------------+--------------------+----------------+--------------+
|             paperID|               title|publication_type|publication id|
+--------------------+--------------------+----------------+--------------+
|53e99854b7602d970...|  The EOLES project.|      Conference|             0|
|53e9989bb7602d970...|Life is engineeri...|      Conference|             0|
|53e998bfb7602d970...|Gaining and maint...|      Conference|             0|
|53e998c0b7602d970...|From manuals towa...|      Conference|             0|
|53e998e9b7602d970...|Learning with com...|      Conference|             0|
|53e99976b7602d970...|Cloud E-learning ...|      Conference|             0|
|53e9997eb7602d970...|Motivating progra...|      Conference|             0|
|53e99991b7602d970...|Monitoring studen...|      Conference|             0|
|53e99998b7602d970...|OLAREX project: O...|      Conference|             0|
|53e99859b7602d970...|Studying dynamic ...|      Conference|             1|
|53e99860b76

                                                                                

COMMANDS

In [122]:
#Command 4: delete a group of rows

# Use the function year to extract the year from the timestamp
from pyspark.sql.functions import year

# Drop rows with conditions – where clause
# From 37626 to 37175 -> delete all the rows that represent papers published before 1950, because obsolete
df_papers = df_papers.where(year('date') > '1950')
df_papers.select('title', 'publication_type', 'date').orderBy('date').show()

                                                                                

+--------------------+----------------+-------------------+
|               title|publication_type|               date|
+--------------------+----------------+-------------------+
|Generalized one-u...|      Conference|1951-01-03 03:43:07|
|Predicting PDZ do...|         Journal|1951-01-03 03:43:07|
|A New EDI-based D...|         Journal|1951-01-03 03:43:07|
|Search-based Exec...|      Conference|1951-01-03 03:43:07|
|Multi-structural ...|         Journal|1951-01-03 03:43:07|
|Corpus-based ling...|            Book|1951-01-03 03:43:07|
|A comparative stu...|         Journal|1951-01-03 03:43:07|
|Fast Solution of ...|         Journal|1951-01-03 03:43:07|
|Local Hausdorff D...|         Journal|1951-01-03 03:43:07|
|Grammatical Evolu...|            Book|1951-01-03 03:43:07|
|Global optimizati...|         Journal|1951-01-03 03:43:07|
|Processing UML Mo...|            Book|1951-01-03 03:43:07|
|Balancing buffer ...|         Journal|1951-01-03 03:43:07|
|The Animation of ...|            Book|1

In [123]:
#Command 5 create a new column with the length of the paper(number of total pages)
dfPaper_total_pages = dfPaper \
    .filter((col('page_start') >= 0) & (col('page_end') >= 0) & (col('page_start') <= col('page_end'))) \
    .withColumn('total_pages', col('page_end') - col('page_start'))

dfPaper_total_pages \
    .select(col('title'), col('page_start'), col('page_end'), col('total_pages')) \
    .show(5, truncate=False)

+-------------------------------------------------------------+----------+--------+-----------+
|title                                                        |page_start|page_end|total_pages|
+-------------------------------------------------------------+----------+--------+-----------+
|Using XML to Integrate Existing Software Systems into the Web|167       |172     |5          |
|FCLOS                                                        |192       |220     |28         |
|Bhoomi                                                       |20        |31      |11         |
|Laps                                                         |962       |976     |14         |
|Mindful                                                      |3253      |3274    |21         |
+-------------------------------------------------------------+----------+--------+-----------+
only showing top 5 rows



QUERIES

In [124]:
#Query 4: GROUP BY, JOIN, AS
from pyspark.sql.functions import collect_set, concat, size

df_journals_venue_rename = df_journals.withColumnRenamed('venue', 'venueJournals')
df_books_venue_rename = df_books.withColumnRenamed('venue', 'venueBooks')
df = df_books_venue_rename\
    .join(df_journals_venue_rename,
          df_books_venue_rename.publisher == df_journals_venue_rename.publisher,
          "inner")\
    .drop(df_journals.publisher)\
    .select('venueBooks', 'venueJournals', 'publisher')\
    .dropDuplicates(['venueBooks', 'venueJournals', 'publisher'])\
    .groupBy('publisher')\
    .agg(collect_set('venueBooks').alias('books'),
         collect_set('venueJournals').alias('journals'))\
    .withColumn("total_publications_per_publisher",
                concat(col("books"), col("journals")))\
    .filter(size(col("total_publications_per_publisher")) > '500')\
    .select('publisher', "total_publications_per_publisher")\
    .show(truncate = 50)

[Stage 1076:>                                                       (0 + 1) / 1]

+-----------------------------------------+--------------------------------------------------+
|                                publisher|                  total_publications_per_publisher|
+-----------------------------------------+--------------------------------------------------+
|                  Taylor and Francis Ltd.|[SAS, Special Interest Group on Software Engine...|
|                                 Elsevier|[SAS, VAST, Focus on Scientific Visualization, ...|
|Association for Computing Machinery (ACM)|[SRDS, SAS, Special Interest Group on Software ...|
|                          Springer Verlag|[ECP, Special Interest Group on Software Engine...|
+-----------------------------------------+--------------------------------------------------+



                                                                                

In [125]:
#Query 9: WHERE, GROUP BY, HAVING, 1 JOIN


