# 12. Organization Insight

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Input location: every MAG table file name is appended to this prefix.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# Output location: every result CSV name is appended to this prefix.
outputDir = "/output/user99/pyspark/"

# Normalized name of the organization to analyze, and the earliest
# publication year a paper must have to be kept.
organizationName = "microsoft"
organizationPaperMinYear = 1991

# Prefix that a PaperId is appended to in order to form a paper's detail-page URL.
maDetailPagePrefix = "https://academic.microsoft.com/#/detail/"

In [None]:
# Reuse the running context/session when one already exists (as it usually
# does in a notebook) rather than constructing a second SparkSession by hand;
# the builder is the documented entry point for obtaining a session.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the affiliation table (tab-separated, no header row) and name its columns
affiliationColumns = [
    "AffiliationId", "Rank", "NormalizedName", "DisplayName", "GridId",
    "OfficialPage", "WikiPage", "PaperCount", "CitationCount", "CreatedDate",
]
Affiliations = spark.read.load(
    rootpath + "Affiliations.txt", format="csv", sep="\t"
).toDF(*affiliationColumns)

# Keep only the affiliation whose normalized name matches the input organization
isTargetOrganization = sf.col("NormalizedName").isin(organizationName)
targetAffiliation = Affiliations.where(isTargetOrganization).select(
    "AffiliationId",
    sf.col("DisplayName").alias("AffiliationName"),
)

targetAffiliation.write.csv(outputDir + "targetAffiliation.csv", mode='overwrite', header='true')

# Only the id column is needed by the joins in the following cells
targetAffiliationId = targetAffiliation.select("AffiliationId")

In [None]:
# Get all [Paper]->[Author]->[Affiliation (input organization)] relationships
PaperAuthorAffiliations = spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t") \
.toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")

# The inner join keeps only rows whose affiliation is the target organization.
# AuthorId has to be part of the projection: selecting "PaperId" twice would
# drop the author ids and leave two identically named columns, which makes any
# later reference to "PaperId" ambiguous.
orgPaperAuthorAffiliation = PaperAuthorAffiliations.join(targetAffiliationId, "AffiliationId", 'inner') \
.select("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber")

orgPaperAuthorAffiliation.write.csv(outputDir + "orgPaperAuthorAffiliation.csv", mode='overwrite', header='true')

In [None]:
# Get all org author details
Authors = spark.read.load(rootpath + "Authors.txt", format="csv", sep="\t") \
.toDF("AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId", "PaperCount", "CitationCount", "CreatedDate")

# Distinct ids of the authors that have at least one paper under the target
# affiliation. Taken directly from the relationship table joined with the
# target affiliation id, so it only needs columns that table is known to have.
orgAuthorIds = PaperAuthorAffiliations.join(targetAffiliationId, "AffiliationId", 'inner') \
.select("AuthorId").distinct()

# Attach the display name to each of those author ids
orgAuthors = Authors.join(orgAuthorIds, "AuthorId", 'inner') \
.selectExpr("AuthorId", "DisplayName as AuthorName")

orgAuthors.write.csv(outputDir + "orgAuthors.csv", mode='overwrite', header='true')

In [None]:
# Get all paper Ids for the input organization
orgPaperIds = orgPaperAuthorAffiliation.select("PaperId").distinct()


# Get all org paper details
Papers = spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t") \
.toDF("PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle", "Year", "Date", "Publisher", "JournalId", 
      "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount", 
      "EstimatedCitationCount", "CreatedDate")

# The file is read without a schema, so every column is a string; cast the
# numeric ones explicitly before comparing or doing arithmetic on them.
paperYear = sf.col("Year").cast("int")
paperRank = sf.col("Rank").cast("double")
docTypeMissing = sf.col("DocType").isNull() | (sf.col("DocType") == "")

# Output columns:
#   PaperId, Title, CitationCount (the estimated citation count), Date
#   PublicationType - DocType, or "Not available" when it is unknown
#   LogProb         - exp(Rank * -0.001)
#   Url             - detail-page prefix followed by the PaperId
#   VId             - venue id: ConferenceSeriesId, falling back to JournalId
#   Year
# Only papers published in or after organizationPaperMinYear are kept.
orgPapers = Papers.join(orgPaperIds, "PaperId", 'inner') \
.where(paperYear >= organizationPaperMinYear) \
.select(
    "PaperId",
    sf.col("PaperTitle").alias("Title"),
    sf.col("EstimatedCitationCount").alias("CitationCount"),
    "Date",
    sf.when(docTypeMissing, "Not available").otherwise(sf.col("DocType")).alias("PublicationType"),
    sf.exp(paperRank * -0.001).alias("LogProb"),
    sf.concat(sf.lit(maDetailPagePrefix), sf.col("PaperId")).alias("Url"),
    sf.coalesce(sf.col("ConferenceSeriesId"), sf.col("JournalId")).alias("VId"),
    "Year",
)

orgPapers.write.csv(outputDir + "orgPapers.csv", mode='overwrite', header='true')

In [None]:
# Get all [Paper]->[Author/Affiliation] relationships for the org
# (every author/affiliation row on a paper the org contributed to)
relationshipColumns = ["PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber"]

orgAllPaperAuthorAffiliations = PaperAuthorAffiliations.join(orgPapers, "PaperId", 'inner') \
.select(*relationshipColumns)


# Partner relationships are the rows on the org's papers that do NOT belong to
# the org itself, i.e. "all rows" minus "the org's own rows" (not the reverse).
# An anti join on the target affiliation id removes exactly the org's own rows;
# rows with a null AffiliationId never match and are therefore kept.
# The explicit select restores the column order after the join.
orgPartnerPaperAuthorAffiliations = orgAllPaperAuthorAffiliations \
.join(targetAffiliationId, "AffiliationId", 'left_anti') \
.select(*relationshipColumns)

orgPartnerPaperAuthorAffiliations.write.csv(outputDir + "orgPartnerPaperAuthorAffiliations.csv", mode='overwrite', header='true')

In [None]:
# Get all partner orgs' affiliation Ids
# Relationship rows without an affiliation are skipped. The Column method is
# spelled isNotNull(); Column has no isnotnull() method.
orgPartnerAffiliationIds = orgPartnerPaperAuthorAffiliations.where(sf.col("AffiliationId").isNotNull()) \
.select("AffiliationId").distinct()


# Get all partner orgs' affiliation details
orgPartnerAffiliations = Affiliations.join(orgPartnerAffiliationIds, "AffiliationId", 'inner') \
.selectExpr("AffiliationId", "DisplayName as AffiliationName")

orgPartnerAffiliations.write.csv(outputDir + "orgPartnerAffiliations.csv", mode='overwrite', header='true')

In [None]:
# Collect the distinct author ids that appear in the partner relationships
partnerRelationAuthorIds = orgPartnerPaperAuthorAffiliations.select("AuthorId")
orgPartnerAuthorIds = partnerRelationAuthorIds.distinct()


# Look up the display name of each partner author
orgPartnerAuthors = (
    Authors.join(orgPartnerAuthorIds, "AuthorId", 'inner')
    .select("AuthorId", sf.col("DisplayName").alias("AuthorName"))
)

orgPartnerAuthors.write.csv(outputDir + "orgPartnerAuthors.csv", mode='overwrite', header='true')

In [None]:
# Load the [Paper]->[Field of Study] links (tab-separated, columns named below)
PaperFieldsOfStudy = spark.read.load(
    rootpath + "PaperFieldsOfStudy.txt", format="csv", sep="\t"
).toDF("PaperId", "FieldOfStudyId", "Score")

# Restrict the links to the organization's papers; Score is not carried forward
linksOnOrgPapers = PaperFieldsOfStudy.join(orgPapers, "PaperId", 'inner')
orgPaperFieldOfStudy = linksOnOrgPapers.select("PaperId", "FieldOfStudyId")

orgPaperFieldOfStudy.write.csv(outputDir + "orgPaperFieldOfStudy.csv", mode='overwrite', header='true')

In [None]:
# Distinct field-of-study ids referenced by the organization's papers
orgFieldOfStudyIds = orgPaperFieldOfStudy.select("FieldOfStudyId").distinct()


# Load the field-of-study table (tab-separated) and name its columns
fieldOfStudyColumns = [
    "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName", "MainType",
    "Level", "PaperCount", "CitationCount", "CreatedDate",
]
FieldsOfStudy = spark.read.load(
    rootpath + "FieldsOfStudy.txt", format="csv", sep="\t"
).toDF(*fieldOfStudyColumns)

# Keep the id, level and display name of every field the organization touches
fieldOfStudyOut = (
    FieldsOfStudy.join(orgFieldOfStudyIds, "FieldOfStudyId", 'inner')
    .select(
        "FieldOfStudyId",
        sf.col("Level").alias("FieldLevel"),
        sf.col("DisplayName").alias("FieldName"),
    )
)

fieldOfStudyOut.write.csv(outputDir + "fieldOfStudyOut.csv", mode='overwrite', header='true')

In [None]:
# Load the journal and conference-series tables; neither is loaded anywhere
# earlier, so referencing them without these reads raises NameError.
# NOTE(review): column lists follow the published MAG schema for these two
# files - confirm they match the 2018-09-27 snapshot, since toDF() fails when
# the number of names differs from the number of columns in the file.
Journals = spark.read.load(rootpath + "Journals.txt", format="csv", sep="\t") \
.toDF("JournalId", "Rank", "NormalizedName", "DisplayName", "Issn", "Publisher", "Webpage", "PaperCount", "CitationCount",
      "CreatedDate")

ConferenceSeries = spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t") \
.toDF("ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName", "PaperCount", "CitationCount", "CreatedDate")

# Get all Conference/Journal details as Venue details
# union() matches columns by position, so both sides project the same three
# columns in the same order.
venue = Journals.selectExpr("JournalId as VId", "DisplayName as VenueName", "NormalizedName as VenueShortName") \
.union(ConferenceSeries.selectExpr("ConferenceSeriesId as VId", "DisplayName as VenueName", "NormalizedName as VenueShortName"))

venue.write.csv(outputDir + "venue.csv", mode='overwrite', header='true')

In [None]:
# Shut down the SparkContext obtained at the top of the notebook
sc.stop()