# 07. Conference Memory of References    

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Notebook parameters.
# rootpath: base location that the input .txt table names are appended to.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# outputDir: base directory for results produced by this notebook.
outputDir = "/output/jiaxin/pyspark/"
# conferenceShortName: value matched against ConferenceSeries.NormalizedName.
conferenceShortName = "WWW"

In [None]:
# Obtain the session through the builder (the documented entry point) so an
# already-running session is reused instead of calling the SparkSession
# constructor directly.
spark = SparkSession.builder.getOrCreate()
# Keep the SparkContext handle under its original name; it is stopped in the
# last cell of the notebook.
sc = spark.sparkContext

In [None]:
# Look up the ConferenceSeriesId of the conference whose NormalizedName
# matches conferenceShortName. The file is loaded as tab-separated CSV and the
# columns are named positionally.
conferenceSeriesColumns = [
    "ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName", "PaperCount",
    "CitationCount", "CreatedDate",
]
targetConferenceSeriesId = (
    spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t")
    .toDF(*conferenceSeriesColumns)
    .filter(sf.col("NormalizedName").isin(conferenceShortName))
    .select("ConferenceSeriesId")
)


# Load the Papers table (tab-separated, no header) and name its columns
# positionally.
Papers = (
    spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t")
    .toDF(
        "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle",
        "Year", "Date", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId",
        "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
        "EstimatedCitationCount", "CreatedDate",
    )
)

# Keep only the papers that belong to the target conference series.
# The DataFrame defined above is `Papers`; the previous code joined on the
# undefined name `Paper`, which raised NameError.
conferencePapers = (
    Papers.join(targetConferenceSeriesId, "ConferenceSeriesId", "inner")
    .select("PaperId", "Year", "CitationCount")
)


# Load the citation edges: PaperId cites PaperReferenceId.
References = (
    spark.read.load(rootpath + "PaperReferences.txt", format="csv", sep="\t")
    .toDF("PaperId", "PaperReferenceId")
)

# Project the full paper table to the two fields needed for the cited side and
# rename them up front, so the joined result has no duplicate column names
# (both the citing and the cited side originate from Papers).
referencedPapers = Papers.select(
    sf.col("PaperId").alias("ReferenceId"),
    sf.col("Year").alias("ReferenceYear"),
)

# [conference paper] -> [referenced paper]
#   conferencePapers.PaperId     == References.PaperId
#   References.PaperReferenceId  == referencedPapers.ReferenceId
# Result columns: PaperId, Year (citing paper), ReferenceId, ReferenceYear
# (cited paper).
conferencePaperReferences = (
    conferencePapers.select("PaperId", "Year")
    .join(References, "PaperId", "inner")
    .join(
        referencedPapers,
        sf.col("PaperReferenceId") == sf.col("ReferenceId"),
        "inner",
    )
    .select("PaperId", "Year", "ReferenceId", "ReferenceYear")
)




# Aggregate the reference count per (conference paper year, referenced paper
# year) pair to build the year matrix of reference counts.
#
# The source tables are loaded from headerless CSV without a schema, so the
# year columns are cast to int explicitly before doing arithmetic on them.
#
# Papers should not cite papers published more than one year after them.
# Microsoft Academic Graph does contain such "future citations" (newer book
# versions, noise); they are excluded here for visualization. Rows where
# either year is missing are dropped by the same comparison.
conferencePaperReferenceByYears = (
    conferencePaperReferences
    .select(
        sf.col("Year").cast("int").alias("Year"),
        sf.col("ReferenceYear").cast("int").alias("ReferenceYear"),
    )
    .where(sf.col("Year") + 1 >= sf.col("ReferenceYear"))
    .groupBy("Year", "ReferenceYear")
    .agg(sf.count("*").alias("ReferenceCount"))
)





# Write the year matrix as tab-separated text with a header row.
# NOTE(review): the output path was never defined in this notebook; the file
# name below is a placeholder under outputDir -- confirm the intended location.
conferenceReferenceMemoryOutPath = outputDir + "ConferenceMemoryOfReferences.tsv"

# Spark writes a directory of part files at this path. "overwrite" lets the
# cell be re-run without failing on an existing output directory.
conferencePaperReferenceByYears.write.mode("overwrite").csv(
    conferenceReferenceMemoryOutPath,
    sep="\t",
    header=True,
)

In [None]:
# Shut down the SparkContext obtained at the top of the notebook.
sc.stop()