# 08. Conference Top Referenced Venues     

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Notebook configuration.
# Root of the input dataset in Azure blob storage; table file names are appended to it.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# Directory prefix under which every CSV result of this notebook is written.
outputDir = "/output/jiaxin/pyspark/"
# Short (normalized) name of the conference series being analyzed.
conferenceShortName = "WWW"
# Not referenced by the cells visible in this notebook; kept in case other code reads it.
conferenceAnalyticsBaseDir = "/output/conferenceAnalytics/"

In [None]:
# Reuse the SparkContext that is already running (or start one), then wrap
# it in a SparkSession so the DataFrame reader/writer API is available.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)

In [None]:
# Build a single venue lookup table (conference series + journals).
# The visualization can join against it to resolve a VenueId to its names.

# Conference series, projected to the common (VenueId, VenueShortName, VenueName) shape.
ConferenceSeries = (
    spark.read.format("csv").option("sep", "\t").load(rootpath + "ConferenceSeries.txt")
    .toDF("ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName",
          "PaperCount", "CitationCount", "CreatedDate")
    .select(
        sf.col("ConferenceSeriesId").alias("VenueId"),
        sf.col("NormalizedName").alias("VenueShortName"),
        sf.col("DisplayName").alias("VenueName"),
    )
)

# Journals, projected to the same three columns so the two tables can be unioned.
Journals = (
    spark.read.format("csv").option("sep", "\t").load(rootpath + "Journals.txt")
    .toDF("JournalId", "Rank", "NormalizedName", "DisplayName", "Issn",
          "Publisher", "Webpage", "PaperCount", "CitationCount", "CreatedDate")
    .select(
        sf.col("JournalId").alias("VenueId"),
        sf.col("NormalizedName").alias("VenueShortName"),
        sf.col("DisplayName").alias("VenueName"),
    )
)

# union() matches columns by position, which is why both projections use the same order.
venues = ConferenceSeries.union(Journals)

venues.write.mode("overwrite").option("header", "true").csv(outputDir + "venues.csv")

In [None]:
# Count, for the target conference, how often its papers reference each venue.
#
# This cell previously held U-SQL statements, which are not valid Python.
# The same steps are expressed here with the PySpark DataFrame API.

# Load the two paper tables the joins below need; only the columns used are kept.
# NOTE(review): the Papers column list assumes the 21-column layout of the
# 2018-09-27 MAG dump that rootpath points at - confirm against the dataset schema.
# toDF() raises if the number of names does not match the file, so a schema
# mismatch fails loudly instead of silently selecting the wrong columns.
Papers = (
    spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t")
    .toDF("PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
          "BookTitle", "Year", "Date", "Publisher", "JournalId",
          "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
          "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
          "EstimatedCitation", "CreatedDate")
    .select("PaperId", "Year", "JournalId", "ConferenceSeriesId")
)

PaperReferences = (
    spark.read.load(rootpath + "PaperReferences.txt", format="csv", sep="\t")
    .toDF("PaperId", "PaperReferenceId")
)

# First find the conference series id by matching conference short name.
# ConferenceSeries was already projected to (VenueId, VenueShortName, VenueName)
# by the venue cell, so the match is on VenueShortName and the id is renamed
# back to ConferenceSeriesId for the join against Papers.
targetConferenceSeriesId = (
    ConferenceSeries
    .where(sf.col("VenueShortName") == conferenceShortName)
    .select(sf.col("VenueId").alias("ConferenceSeriesId"))
)

# Get all conference papers by conference series id.
# Joining on the column name keeps a single ConferenceSeriesId column.
conferencePapers = (
    Papers
    .join(targetConferenceSeriesId, "ConferenceSeriesId", "inner")
    .select("PaperId", "Year", sf.col("ConferenceSeriesId").alias("VenueId"))
)

# Referenced-paper side of the join, with columns renamed up front so the
# self-join on Papers has no ambiguous column names.
# A single paper may be published in a conference and in a journal later:
# use the conference as its venue if present, otherwise the journal.
referencedPapers = Papers.select(
    sf.col("PaperId").alias("ReferenceId"),
    sf.coalesce(sf.col("ConferenceSeriesId"), sf.col("JournalId")).alias("ReferenceVenueId"),
)

# Get all references from [Conference Paper] -> [Other Paper].
conferencePaperReferences = (
    conferencePapers
    .join(PaperReferences.withColumnRenamed("PaperReferenceId", "ReferenceId"), "PaperId", "inner")
    .join(referencedPapers, "ReferenceId", "inner")
    .select("PaperId", "Year", "VenueId", "ReferenceId", "ReferenceVenueId")
)

# Count number of references between the conference and each referenced venue.
conferencePaperReferenceByVenues = (
    conferencePaperReferences
    .groupBy("VenueId", "ReferenceVenueId")
    .agg(sf.count("*").alias("ReferenceCount"))
)

conferencePaperReferenceByVenues.write.csv(outputDir + "conferencePaperReferenceByVenues.csv", mode='overwrite', header='true')

In [None]:
# For each year of the conference, count the number of references from its
# papers to each referenced venue.
#
# This cell previously held a U-SQL statement, which is not valid Python; the
# same grouping is expressed here with the PySpark DataFrame API.
# Relies on conferencePaperReferences (columns Year and ReferenceVenueId)
# having been built by an earlier cell.
conferencePaperReferenceByYearsReferenceVenues = (
    conferencePaperReferences
    .groupBy("Year", "ReferenceVenueId")
    .agg(sf.count("*").alias("ReferenceCount"))
)

# Persist the per-year aggregate. The original cell computed it but wrote a
# different DataFrame, so this result was never saved.
conferencePaperReferenceByYearsReferenceVenues.write.csv(
    outputDir + "conferencePaperReferenceByYearsReferenceVenues.csv", mode='overwrite', header='true')

# Retained from the original cell in case anything downstream reads the raw
# reference rows from this file.
conferencePaperReferences.write.csv(outputDir + "conferencePaperReferences.csv", mode='overwrite', header='true')

In [None]:
sc.stop()