# 12. Organization Insight   
1. //Find the affiliation for the input organization  
1. //Get all [Paper]->[Author]->[Affiliation (input organization)] relationships  
1. //Get all org author Ids  
1. //Get all org author details  
1. //Get all paper Ids for the input organization  
1. //Get all org paper details  
1. //Get all [Paper]->[Author/Affiliation] relationships for the org  
1. //All distinct affiliation Ids from the [Paper]->[Author/Affiliation] relationships are the partner affiliations  
1. //Don't include the target org's own affiliation in the distinct set  
1. //Get all partner orgs' affiliation Ids  
1. //Get all partner orgs' affiliation details  
1. //Get all partner authors' Ids  
1. //Get all partner authors' details  
1. //Get all [Paper]->[Field of Study] relationships for the input organization  
1. //Get all Field of Study Ids for the input organization  
1. //Get all fields of study details for the input organization  
1. //Get all Conference/Journal details as Venue details  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# ---------------------------------------------------------------------------
# Notebook parameters
# ---------------------------------------------------------------------------

# Base location the tab-separated input files (e.g. Papers.txt) are read from.
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'

# Prefix prepended to every output file name when results are written.
outputDir = '/output/user99/pyspark/'

# NOTE(review): organizationName / organizationPaperMinYear are not referenced
# by the queries visible in this notebook; they are kept in case other cells
# rely on them.
organizationName = 'microsoft'
organizationPaperMinYear = 1991

# Conference short name compared against ConferenceSeries.NormalizedName.
# The query that follows reads this name, but it was never assigned, which
# made the notebook fail with a NameError.
# NOTE(review): 'WWW' is a placeholder -- set it to the conference series you
# actually want to analyse.
conferenceShortName = 'WWW'

# URL prefix concatenated with an entity id to form a details-page link.
maDetailPagePrefix = 'https://academic.microsoft.com/#/detail/'

In [None]:
# Attach to the SparkContext that is already running, or create one if none
# exists, then wrap it in a SparkSession so the DataFrame reader is available.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)


# Resolve the conference series id(s) for the requested short name.
# ConferenceSeries.txt is read as header-less, tab-separated text, its columns
# are named positionally, and only rows whose NormalizedName matches
# conferenceShortName (a notebook parameter that must be assigned before this
# statement runs) are kept. Only the id column is carried forward.
targetConferenceSeriesId = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "ConferenceSeries.txt")
    .toDF(
        "ConferenceSeriesId",
        "Rank",
        "NormalizedName",
        "DisplayName",
        "PaperCount",
        "CitationCount",
        "CreatedDate",
    )
    .filter(sf.col("NormalizedName").isin(conferenceShortName))
    .select("ConferenceSeriesId")
)


# Collect every paper that belongs to the selected conference series.
# Papers.txt is read as header-less, tab-separated text and its columns are
# named positionally; the inner join on ConferenceSeriesId drops papers from
# any other series. EstimatedCitationCount is kept because it is the column
# used afterwards to rank the papers.
conferencePapers = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "Papers.txt")
    .toDF(
        "PaperId",
        "Rank",
        "Doi",
        "DocType",
        "PaperTitle",
        "OriginalTitle",
        "BookTitle",
        "Year",
        "Date",
        "Publisher",
        "JournalId",
        "ConferenceSeriesId",
        "ConferenceInstanceId",
        "Volume",
        "Issue",
        "FirstPage",
        "LastPage",
        "ReferenceCount",
        "CitationCount",
        "EstimatedCitationCount",
        "CreatedDate",
    )
    .join(targetConferenceSeriesId, on="ConferenceSeriesId", how="inner")
    .select("PaperId", "OriginalTitle", "Year", "CitationCount", "EstimatedCitationCount")
)


# Rank the conference papers by estimated citation count, highest first, and
# attach a details-page URL (maDetailPagePrefix + PaperId) to each paper.
#
# conferencePapers is loaded from CSV without a schema, so every column --
# including EstimatedCitationCount -- is a string. Sorting the raw column would
# therefore be lexicographic ("99" would rank above "1000"). The sort key is
# cast to a 64-bit integer so the ordering is numeric; the selected output
# columns themselves are left untouched.
conferenceTopPapers = (
    conferencePapers
    .withColumn("DetailsUrl", sf.concat(sf.lit(maDetailPagePrefix), sf.col("PaperId")))
    .orderBy(sf.col("EstimatedCitationCount").cast("long").desc())
    .select("DetailsUrl", "OriginalTitle", "Year", "CitationCount", "EstimatedCitationCount")
)


# Write the ranked papers as CSV with a header row, replacing any output that
# already exists at the target path.
(
    conferenceTopPapers.write
    .mode('overwrite')
    .option('header', 'true')
    .csv(outputDir + "conferenceTopPapers.csv")
)


# Shut the Spark context down now that the results have been written.
sc.stop()