In [0]:
# DBFS locations of the two input datasets.
Large_file_path = "dbfs:/FileStore/tables/large.json.gz"
Journal_file_path = "dbfs:/FileStore/tables/journal_information.csv"

# Papers dataset, read with Spark's JSON reader.
large_df = spark.read.json(Large_file_path)

# Journal metadata: the first CSV line supplies the column names. No schema
# inference is requested, so every column is loaded as a string.
journal_df = spark.read.csv(Journal_file_path, header=True)

In [0]:
# Expose both DataFrames through the RDD API; each RDD element is one row of
# the corresponding DataFrame.
large_rdd, journal_rdd = large_df.rdd, journal_df.rdd


In [0]:
"""1. Programmatically confirm that all papers have unique IDs and output the number of papers in the file."""
def analyze_rdd_papers(rdd):
    """Print the number of papers in ``rdd`` and the number of distinct IDs.

    The two printed figures are equal exactly when no ``corpusid`` value
    occurs more than once.

    Args:
        rdd: RDD whose rows can be indexed with the ``"corpusid"`` key.
    """
    ids = rdd.map(lambda paper: paper["corpusid"])

    # Total first, then the de-duplicated total.
    n_total = ids.count()
    n_unique = ids.distinct().count()

    report = (
        ("Total number of papers:", n_total),
        ("Number of unique papers:", n_unique),
    )
    for label, value in report:
        print(label, value)
    
analyze_rdd_papers(large_rdd)


Total number of papers: 150000
Number of unique papers: 150000


In [0]:
"""2. What is the average number of authors per paper?"""
def calculate_avg_authors_per_paper_rdd(rdd):
    """Print the mean number of authors per paper in ``rdd``.

    A paper whose ``authors`` field is null still counts as a paper and
    contributes zero authors. When ``rdd`` holds no papers the average is
    undefined; a message saying so is printed instead of dividing by zero.

    Args:
        rdd: RDD of paper rows exposing an ``authors`` attribute that is
            either a sequence of authors or None.
    """
    def count_authors(row):
        authors = row.authors
        return 0 if authors is None else len(authors)

    # Single pass over the data: carry (papers seen, authors seen) together
    # rather than launching separate count() and sum() jobs. fold() yields
    # the zero value for an empty RDD, so no special-casing is needed here.
    paper_count, author_total = (
        rdd.map(lambda row: (1, count_authors(row)))
           .fold((0, 0), lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
    )

    if paper_count == 0:
        print("Average number of authors per paper: undefined (no papers)")
        return

    avg_authors_per_paper = author_total / paper_count

    print("Average number of authors per paper:", avg_authors_per_paper)

calculate_avg_authors_per_paper_rdd(large_rdd)


Average number of authors per paper: 2.81628


In [0]:
""" 3. How many different journals were the papers published in?"""
def count_different_journals(rdd):
    """Print how many distinct, non-empty journal names occur in ``rdd``.

    Rows with no ``journal`` value, or whose journal ``name`` is null or an
    empty string, are left out of the count.

    Args:
        rdd: RDD of rows indexable by ``"journal"``; a non-null journal is
            itself indexable by ``"name"``.
    """
    def journal_name(row):
        # None stands in for "no usable name" when the journal is missing.
        journal = row["journal"]
        return None if journal is None else journal["name"]

    names = rdd.map(journal_name).filter(
        lambda name: name is not None and name != ""
    )

    n_journals = names.distinct().count()

    print("Number of different journals:", n_journals)

count_different_journals(large_rdd)


Number of different journals: 33916


In [0]:
"""4. Find the 5 authors with the highest number of publications. 
    Give their names along with the number of publications they contributed to."""
def top_authors_by_publications(rdd, n=5):
    """Print the ``n`` authors who appear on the most papers in ``rdd``.

    Authors are identified by the pair ``(authorId, name)``. Papers whose
    ``authors`` field is null, and null entries inside an author list, are
    skipped. Ties on publication count are broken by name and then by ID so
    the output is the same on every run.

    NOTE(review): authors with a null ``authorId`` are grouped by name alone,
    so unrelated people sharing a placeholder name are counted as one author.
    Confirm whether such entries should be excluded from the ranking.

    Args:
        rdd: RDD of paper rows indexable by ``"authors"``; each author is
            indexable by ``"authorId"`` and ``"name"``.
        n: Number of authors to print. Defaults to 5.
    """
    def author_pairs(row):
        # One ((authorId, name), 1) record per author listed on the paper.
        authors = row["authors"]
        if authors is None:
            return []
        return [
            ((author["authorId"], author["name"]), 1)
            for author in authors
            if author is not None
        ]

    # Number of publications per (authorId, name) key.
    publication_counts = rdd.flatMap(author_pairs).reduceByKey(lambda x, y: x + y)

    # takeOrdered keeps only the best n per partition, avoiding the global
    # shuffle-sort of the whole RDD that sortByKey would require. IDs and
    # names go through str() so that null values remain comparable.
    top_authors = publication_counts.takeOrdered(
        n, key=lambda kv: (-kv[1], str(kv[0][1]), str(kv[0][0]))
    )

    for (author_id, name), count in top_authors:
        print("Author:", name, "(", author_id, ") - Number of Publications:", count)

top_authors_by_publications(large_rdd)


Author: B. Noble ( 2149377746 ) - Number of Publications: 23
Author: S. Sukhoruchkin ( 90537224 ) - Number of Publications: 16
Author: Z. Soroko ( 88842366 ) - Number of Publications: 16
Author: M. Kumar ( 49898687 ) - Number of Publications: 15
Author: Anonymous ( None ) - Number of Publications: 10


In [0]:
# Inspect the column layout of both inputs before joining them. The
# DataFrames are referenced by the names they were loaded under, so this
# cell also runs on a fresh kernel.
journal_df.printSchema()
large_df.printSchema()

root
 |-- Journal Name: string (nullable = true)
 |-- ISSN: string (nullable = true)
 |-- EISSN: string (nullable = true)
 |-- Category & Journal Quartiles: string (nullable = true)
 |-- Citations: string (nullable = true)
 |-- JCI: string (nullable = true)
 |-- percentageOAGold: string (nullable = true)
 |-- IF: string (nullable = true)

root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authorId: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- citationcount: long (nullable = true)
 |-- corpusid: long (nullable = true)
 |-- externalids: struct (nullable = true)
 |    |-- ACL: string (nullable = true)
 |    |-- ArXiv: string (nullable = true)
 |    |-- CorpusId: string (nullable = true)
 |    |-- DBLP: string (nullable = true)
 |    |-- DOI: string (nullable = true)
 |    |-- MAG: string (nullable = true)
 |    |-- PubMed: string (nullable = true)
 |    |-- PubMedCentral: string (nullable = true)
 |-- in

In [0]:
"""Created a function for joining of rdds for simplicity of further analysis"""
def joined_rdd(rdd, journal_rdd):
    """Inner-join papers with journal metadata on the journal name.

    Journal rows are looked up by their ``'Journal Name'`` column, so they
    already carry column names and contain no header record. Every journal
    row therefore takes part in the join; none is skipped.

    Args:
        rdd: RDD of paper rows exposing ``journal`` (nullable), which in turn
            exposes ``name`` (nullable).
        journal_rdd: RDD of journal rows indexable by ``'Journal Name'``.

    Returns:
        RDD of ``(journal_name, (paper_row, journal_row))`` pairs, one per
        matching paper/journal combination.
    """
    # Papers without a journal name can never match; drop them before the join.
    papers_by_journal = (
        rdd.filter(lambda row: row.journal is not None and row.journal.name is not None)
           .map(lambda row: (row.journal.name, row))
    )

    # Key the journal metadata by name, ignoring rows that have no name.
    journals_by_name = (
        journal_rdd.filter(lambda row: row['Journal Name'] is not None)
                   .map(lambda row: (row['Journal Name'], row))
    )

    joined = papers_by_journal.join(journals_by_name)

    return joined

joined = joined_rdd(large_rdd, journal_rdd)



In [0]:
"""5. Find the top 5 authors with the highest cumulative impact factor.
       Output both the author information and the cumulative impact factor."""
# Grouping and aggregating
def _author_impacts(record):
    """Return one ((authorId, name), impact factor) pair per author of a joined paper.

    ``record`` is a ``(journal_name, (paper_row, journal_row))`` element of
    ``joined``. Authors yield nothing when the journal's ``IF`` is null.
    """
    paper, journal = record[1]
    return [
        ((author['authorId'], author['name']), float(journal['IF']))
        for author in paper['authors']
        if journal['IF'] is not None
    ]


# Sum the impact factors per author, then flatten to (authorId, name, total).
_impact_by_author = joined.flatMap(_author_impacts).reduceByKey(lambda x, y: x + y)
author_impact_rdd = _impact_by_author.map(lambda kv: (kv[0][0], kv[0][1], kv[1]))

# Sorting and selecting top authors: negating the total makes the largest
# cumulative impact factor come first. The result is a plain list.
top_5_authors_rdd = author_impact_rdd.takeOrdered(5, key=lambda x: -x[2])

# Displaying results
for author in top_5_authors_rdd:
    for label, value in zip(
        ("Author ID:", "Author Name:", "Cumulative Impact Factor:"), author
    ):
        print(label, value)
    print()


Author ID: 2155504929
Author Name: Ying Li
Cumulative Impact Factor: 93.832

Author ID: 5152451
Author Name: L. Andrade
Cumulative Impact Factor: 92.238

Author ID: 144797099
Author Name: M. Viana
Cumulative Impact Factor: 92.238

Author ID: 49900836
Author Name: H. Wood
Cumulative Impact Factor: 90.422

Author ID: 7695437
Author Name: A. M. Ruscio
Cumulative Impact Factor: 87.899



In [0]:
"""6. You’d like some additional information about publication trends. 
        How many publications with impact factor > 1 were published in each of the years between 2010-2020?"""

def _is_high_impact_2010_to_2020(record):
    """Tell whether a joined record has impact factor > 1 and a year in 2010-2020.

    ``record`` is a ``(journal_name, (paper_row, journal_row))`` element of
    ``joined``. Records lacking an ``IF`` or a ``year`` are rejected before
    any numeric conversion is attempted.
    """
    paper, journal = record[1]
    if journal['IF'] is None:
        return False
    if paper['year'] is None:
        return False
    if not float(journal['IF']) > 1:
        return False
    return 2010 <= int(paper['year']) <= 2020


filtered_rdd = joined.filter(_is_high_impact_2010_to_2020)

# One (year, 1) record per qualifying publication.
year_count_rdd = filtered_rdd.map(lambda rec: (int(rec[1][0]['year']), 1))

# Total per year, ordered by ascending year.
publications_per_year_rdd = (
    year_count_rdd
    .reduceByKey(lambda x, y: x + y)
    .sortByKey()
)

result = publications_per_year_rdd.collect()

for year, count in result:
    print("Year:", year, "| Number of Publications:", count)


Year: 2010 | Number of Publications: 112
Year: 2011 | Number of Publications: 139
Year: 2012 | Number of Publications: 165
Year: 2013 | Number of Publications: 178
Year: 2014 | Number of Publications: 241
Year: 2015 | Number of Publications: 243
Year: 2016 | Number of Publications: 283
Year: 2017 | Number of Publications: 329
Year: 2018 | Number of Publications: 365
Year: 2019 | Number of Publications: 396
Year: 2020 | Number of Publications: 444
