# 07. Conference Memory of References    

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Base location of the input tables; the file names "ConferenceSeries.txt",
# "Papers.txt" and "PaperReferences.txt" are appended to this prefix when loading.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# Directory prefix under which the resulting CSV output is written.
outputDir = "/output/jiaxin/pyspark/"
# Value matched against the NormalizedName column of the conference series table.
conferenceShortName = "WWW"

In [None]:
# Reuse the running SparkContext if there is one, otherwise start a new one,
# then wrap it in a SparkSession for the DataFrame API used below.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)

In [None]:
# Resolve the conference series id: load the tab-separated conference series
# table, name its columns, and keep only the ConferenceSeriesId of rows whose
# NormalizedName matches conferenceShortName.
targetConferenceSeriesId = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "ConferenceSeries.txt")
    .toDF(
        "ConferenceSeriesId",
        "Rank",
        "NormalizedName",
        "DisplayName",
        "PaperCount",
        "CitationCount",
        "CreatedDate",
    )
    .filter(sf.col("NormalizedName").isin(conferenceShortName))
    .select("ConferenceSeriesId")
)


# Load the complete tab-separated papers table and assign its column names.
Papers = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "Papers.txt")
    .toDF(
        "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
        "BookTitle", "Year", "Date", "Publisher", "JournalId",
        "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
        "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
        "EstimatedCitationCount", "CreatedDate",
    )
)

# Restrict to papers of the target conference series. The inner join on
# ConferenceSeriesId drops every paper that belongs to another (or no) series.
conferencePapers = Papers.join(
    targetConferenceSeriesId, on="ConferenceSeriesId", how="inner"
).select("PaperId", "Year", "CitationCount")


# Get all references from [conference paper] -> [referenced paper].
# PaperReferences.txt is tab-separated with two columns: the citing paper id
# and the id of the paper it references.
PaperReference = spark.read.load(rootpath + "PaperReferences.txt", format="csv", sep="\t") \
.toDF("PaperId", "PaperReferenceId")

# Build one row per (conference paper, referenced paper) pair together with the
# publication year of each side.
#
# conferencePapers is itself derived from Papers, so this is effectively a
# self-join. Each input gets an explicit alias and every column is referenced
# through its alias; otherwise "PaperId" and "Year" would be ambiguous.
#   citing -> the conference paper
#   ref    -> the reference edge (PaperId -> PaperReferenceId)
#   cited  -> the referenced paper, looked up in the full Papers table
conferencePaperReferences = (
    conferencePapers.alias("citing")
    .join(
        PaperReference.alias("ref"),
        sf.col("citing.PaperId") == sf.col("ref.PaperId"),
        "inner",
    )
    .join(
        Papers.alias("cited"),
        sf.col("cited.PaperId") == sf.col("ref.PaperReferenceId"),
        "inner",
    )
    .select(
        sf.col("citing.PaperId").alias("PaperId"),
        sf.col("citing.Year").alias("Year"),
        sf.col("cited.PaperId").alias("ReferenceId"),
        sf.col("cited.Year").alias("ReferenceYear"),
    )
)


# Aggregate the reference count per (conference year, referenced paper year)
# to create the year matrix of reference counts.
#
# The filter keeps only references whose year is at most one year after the
# citing paper's year (Year + 1 >= ReferenceYear). The comparison has to be
# built from Column expressions; plain Python strings/names would be evaluated
# by Python instead of Spark. The source tables are loaded from CSV without a
# schema, so the year columns are cast to int explicitly for the arithmetic.
conferencePaperReferenceByYears = (
    conferencePaperReferences
    .where((sf.col("Year").cast("int") + 1) >= sf.col("ReferenceYear").cast("int"))
    .groupBy("Year", "ReferenceYear")
    # Count the rows in each group and name the result column directly.
    .agg(sf.count("*").alias("ReferenceCount"))
)


# Save the per-year reference counts as CSV with a header row, overwriting any
# output already present at the target path.
(
    conferencePaperReferenceByYears.write
    .mode("overwrite")
    .option("header", "true")
    .csv(outputDir + "conferencePaperReferenceByYears.csv")
)

In [None]:
# Shut down the SparkContext obtained at the top of the notebook.
sc.stop()