In [None]:
# Generate_Html_and_Analysis_Data
#
# This IPython notebook analyzes the phi and theta output of the Author Topic Model (C++)
# and generates an HTML file for every researcher.
#
# Dependency: json, pickle, numpy, scipy, csv, sklearn, string
# 
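# Input file: corpus_uchicago_small.tsv_phi (generated by the C++ code)
#             corpus_uchicago_small.tsv_theta (generated by the C++ code)
#             author_scopus_matches_hi.csv (used to find the authors in the UChicago scope)
#             authorIdMapUChicago.p (pickled author id map)
#             template.txt (template of the personal HTML page)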
#
# Output file: network json file
#              personal html for every researcher
#
# Usage: Just go through this file
#
# Author: Cha Chen
# Email: jamworld@uchicago.edu
#

In [108]:
# import all the dependency file
import json
import pickle
import numpy as np
import scipy as scp
from scipy import stats
import csv
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import DistanceMetric
from sklearn.neighbors.ball_tree import BallTree
from sklearn.cluster import KMeans
from string import Template

In [9]:
# Read the phi and theta files into TopicList and AuthorTopicList.
# Each non-empty line becomes one row: the line split on tab characters.
# Downstream cells read the rows as [topic, word, probability] (phi) and
# [author, topic, probability] (theta).
with open("corpus_uchicago_small.tsv_phi", 'r') as f:
    test = f.read()
# Empty lines are skipped while splitting, so a file without a trailing newline
# (or with several blank lines) is handled instead of raising or leaving [''] rows.
TopicList = [x.split('\t') for x in test.split('\n') if x]

with open("corpus_uchicago_small.tsv_theta", 'r') as f:
    test = f.read()
AuthorTopicList = [x.split('\t') for x in test.split('\n') if x]

In [11]:
# Generate the graph json data for the whole corpus:
#   - one node per author, per topic and per word,
#   - one author -> topic link for every theta row with probability above 0.1,
#   - one topic -> word link for every phi row.
node = []
link = []
authorList = {}  # author id -> index in order of first appearance
for author in AuthorTopicList:
    # Membership test instead of a truthiness check on .get(): the first author is
    # stored with index 0, which is falsy and would get a second, duplicate node.
    if author[0] not in authorList:
        authorList[author[0]] = len(authorList)
        node.append({
                "id": "author_" + author[0],
                "name": "author: " + author[0],
                "type": "author",
                "link": author[0] + ".html"
            })
    if float(author[2]) > 0.1:
        link.append({
                "source": "author_" + author[0],
                "target": "topic_" + author[1],
                "prob": float(author[2])
            })
topicList = {}  # topic id -> index in order of first appearance
wordList = {}   # word -> 1, used as a "seen" marker
for topic in TopicList:
    # Same membership test as for the authors, so topic index 0 is not added twice.
    if topic[0] not in topicList:
        topicList[topic[0]] = len(topicList)
        node.append({
                "id": "topic_" + topic[0],
                "name": "topic: " + topic[0],
                "type": "topic",
                "link": topic[0] + ".html"
            })
    link.append({
            "source": "topic_" + topic[0],
            "target": "word_" + topic[1],
            "prob": float(topic[2])
        })
    if topic[1] not in wordList:
        wordList[topic[1]] = 1
        node.append({
                "id": "word_" + topic[1],
                "name": "word: " + topic[1],
                "type": "words",
                "link": topic[1] + ".html"
            })

AuthorTopicData = {"nodes": node, "links": link}

In [12]:
# Write the network data to the json file.
# Serialize before opening the file, and let the with-block close the handle,
# so a failure cannot leave a truncated file or an open handle behind.
networkJson = json.dumps(AuthorTopicData, sort_keys=True, indent=4)
with open('AT-network.json','w') as f:
    f.write(networkJson)

In [13]:
# Build the author distribution matrix here; it is used for the k-means clustering and the KL divergence

In [14]:
# One row per author and 100 columns, every cell pre-filled with 10**(-16) instead of 0.
# The next cell creates this matrix again before filling it.
DisOfAuthorMatrix = np.full((len(authorList), 100), 10**(-16))

In [15]:
# Give every author a matrix row (in order of first appearance) and copy the theta
# probabilities into that row; the column is the integer topic id of the theta row.
authorMap = {}         # row index -> author id
authorReverseMap = {}  # author id -> row index
DisOfAuthorMatrix = np.full((len(authorList), 100), 10**(-16))
for thetaRow in AuthorTopicList:
    authorId = thetaRow[0]
    if authorId not in authorReverseMap:
        rowIndex = len(authorReverseMap)
        authorMap[rowIndex] = authorId
        authorReverseMap[authorId] = rowIndex
    DisOfAuthorMatrix[authorReverseMap[authorId], int(thetaRow[1])] = float(thetaRow[2])

In [18]:
# Load the author id map list from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use a
# pickle file that comes from a trusted source.
# The with-block closes the file handle, which the inline open() never did.
with open('authorIdMapUChicago.p', 'rb') as pickleFile:
    authorIdMap = pickle.load(pickleFile)

In [42]:
# Read the UChicago author table, keyed by column 6 of each csv row.
# The whole row is kept as the value; a later row with the same key replaces an earlier one.
with open('author_scopus_matches_hi.csv') as csvfile:
    UChicagoScopus = {
        record[6]: record
        for record in csv.reader(csvfile, delimiter=',')
    }

In [70]:
# Keep only the theta rows whose author is in the UChicago table.
# Each kept row becomes [csv column 6, csv column 8, topic, probability].
aIdList = []
for thetaRow in AuthorTopicList:
    scopusRecord = UChicagoScopus.get(thetaRow[0])
    if scopusRecord:
        aIdList.append([scopusRecord[6], scopusRecord[8], thetaRow[1], thetaRow[2]])

In [132]:
# Build the network data for the UChicago authors a first time.
# Only some of the intermediate variables are used afterwards (for example authorList,
# whose length sizes the distribution matrix in the following cells).
node = []
link = []
authorList = {}  # author id -> index in order of first appearance
for author in aIdList:
    # Membership test instead of a truthiness check on .get(): the first author is
    # stored with index 0, which is falsy and would get a second, duplicate node.
    if author[0] not in authorList:
        authorList[author[0]] = len(authorList)
        node.append({
                "id": "author_" + author[0],
                "name": "author: " + author[1],
                "type": "author",
                "link": author[1] + ".html"
            })
    if float(author[3]) > 0.1:
        link.append({
                "source": "author_" + author[0],
                "target": "topic_" + author[2],
                "prob": float(author[3])
            })
topicList = {}  # topic id -> index in order of first appearance
wordList = {}   # word -> 1, used as a "seen" marker
for topic in TopicList:
    # Same membership test as for the authors, so topic index 0 is not added twice.
    if topic[0] not in topicList:
        topicList[topic[0]] = len(topicList)
        node.append({
                "id": "topic_" + topic[0],
                "name": "topic: " + topic[0],
                "type": "topic",
                "link": topic[0] + ".html"
            })
    link.append({
            "source": "topic_" + topic[0],
            "target": "word_" + topic[1],
            "prob": float(topic[2])
        })
    if topic[1] not in wordList:
        wordList[topic[1]] = 1
        node.append({
                "id": "word_" + topic[1],
                "name": "word: " + topic[1],
                "type": "words",
                "link": topic[1] + ".html"
            })

AuthorTopicData = {"nodes": node, "links": link}

In [90]:
# Allocate the distribution matrix for the UChicago authors:
# one row per author and 100 columns, every cell pre-filled with 10**(-16) instead of 0.
# The next cell creates this matrix again before filling it.
DisOfAuthorMatrix = np.full((len(authorList), 100), 10**(-16))

In [92]:
# Give every UChicago author a matrix row (in order of first appearance in aIdList) and
# copy the probabilities into that row; the column is the integer topic id of the row.
authorMap = {}         # row index -> author id
authorReverseMap = {}  # author id -> row index
DisOfAuthorMatrix = np.full((len(authorList), 100), 10**(-16))
for authorRow in aIdList:
    authorId = authorRow[0]
    if authorId not in authorReverseMap:
        rowIndex = len(authorReverseMap)
        authorMap[rowIndex] = authorId
        authorReverseMap[authorId] = rowIndex
    DisOfAuthorMatrix[authorReverseMap[authorId], int(authorRow[2])] = float(authorRow[3])

In [93]:
# Calculate the symmetric KL divergence between all the UChicago researchers:
# entropyMatAuthor[i][j] = entropy(row i, row j) + entropy(row j, row i).
entropyMatAuthor = [
    [scp.stats.entropy(rowP, rowQ) + scp.stats.entropy(rowQ, rowP) for rowQ in DisOfAuthorMatrix]
    for rowP in DisOfAuthorMatrix
]
# For every author, the indices of all authors ordered by ascending divergence
# (sorted() is stable, so equal divergences keep their index order).
simMatAuthor = [
    sorted(range(len(divergences)), key=divergences.__getitem__)
    for divergences in entropyMatAuthor
]

In [94]:
# Initialize a k-means clustering model (KMeans, not a k-nearest-neighbours model)
# with the cluster number set to 100.
# precompute_distances and n_jobs are not passed: both were only given their default
# values, and scikit-learn 1.0 removed them, so passing them raises a TypeError there.
# NOTE(review): random_state=None means the cluster labels can differ between runs.
# NOTE(review): the original comment mentioned "total variation (TV) -> l1 norm";
# nothing in this call selects an l1 distance -- confirm whether that was intended.
kMN = KMeans(n_clusters=100, init='k-means++', n_init=10, max_iter=3000, tol=0.0001, verbose=0, random_state=None, copy_x=True)

In [95]:
# Fit the KMeans model (clustering, not k-nearest-neighbours) on the author
# distribution matrix; the per-row cluster labels are read later from kMN.labels_.
kMN.fit(DisOfAuthorMatrix)

KMeans(copy_x=True, init='k-means++', max_iter=3000, n_clusters=100,
    n_init=10, n_jobs=1, precompute_distances='auto', random_state=None,
    tol=0.0001, verbose=0)

In [96]:
# Build the topic-word distribution matrix: the row is the integer topic id, the column
# is the index of the word in order of first appearance. Cells start at 10**(-16).
# The row count is the integer ceiling of len(TopicList) / 10, so the dimension is an
# int (np.ceil returns a float, which numpy rejects as a shape).
# NOTE(review): the divisor 10 is presumably the number of words listed per topic -- confirm.
numTopicRows = -(-len(TopicList) // 10)
DisOfTopicMatrix = np.zeros([numTopicRows, len(wordList)]) + 10**(-16)
wordMap = {}         # word -> column index
wordReverseMap = {}  # column index -> word
count = 0
for topic in TopicList:
    # Membership test instead of a truthiness check on .get(): the first word is stored
    # with index 0, which is falsy; it would be re-indexed on its next occurrence and
    # push the column count past len(wordList).
    if topic[1] not in wordMap:
        wordMap[topic[1]] = count
        wordReverseMap[count] = topic[1]
        count = count + 1
    DisOfTopicMatrix[int(topic[0]),wordMap[topic[1]]] = float(topic[2])


In [133]:
# Build the real json data for the network.
# Every author node carries the cluster label that the fitted kMN model assigned
# to that author's row of the distribution matrix.
node, link = [], []
authorList = {}  # author id -> 1, used as a "seen" marker
for authorRow in aIdList:
    authorKey = authorRow[0]
    if authorKey not in authorList:
        authorList[authorKey] = 1

        clusterLabel = str(kMN.labels_[authorReverseMap[authorKey]])
        node.append({
                "id": "author_" + authorKey,
                "name": "author: " + authorRow[1],
                "type": "author",
                "link": authorRow[1] + ".html",
                "cluster": clusterLabel
            })
    # Only author -> topic probabilities above 0.1 become links.
    weight = float(authorRow[3])
    if weight > 0.1:
        link.append({
                "source": "author_" + authorKey,
                "target": "topic_" + authorRow[2],
                "prob": weight
            })
topicList = {}  # topic id -> 1, used as a "seen" marker
wordList = {}   # word -> 1, used as a "seen" marker
for phiRow in TopicList:
    topicKey = phiRow[0]
    if topicKey not in topicList:
        topicList[topicKey] = 1
        node.append({
                "id": "topic_" + topicKey,
                "name": "topic: " + topicKey,
                "type": "topic",
                "link": topicKey + ".html"
            })
    wordKey = phiRow[1]
    link.append({
            "source": "topic_" + topicKey,
            "target": "word_" + wordKey,
            "prob": float(phiRow[2])
        })
    if wordKey not in wordList:
        wordList[wordKey] = 1
        node.append({
                "id": "word_" + wordKey,
                "name": "word: " + wordKey,
                "type": "words",
                "link": wordKey + ".html"
            })

AuthorTopicData = {"nodes": node, "links": link}

In [134]:
# Write the UChicago network data to the json file.
# Serialize before opening the file, and let the with-block close the handle,
# so a failure cannot leave a truncated file or an open handle behind.
networkJson = json.dumps(AuthorTopicData, sort_keys=True, indent=4)
with open('AT-network-UChicago.json','w') as f:
    f.write(networkJson)

In [106]:
# Read in the author fingerprint page template.
# The with-block closes the file even if the read fails.
with open("template.txt",'r') as f:
    template = f.read()

In [109]:
# Wrap the template text in string.Template; the page for each author is later
# produced with s.substitute(<dict of placeholder values>).
s = Template(template)

In [110]:
# Build the reverse author hash table: stored value -> author key.
# NOTE(review): when the cells run top to bottom, authorList was last filled with the
# value 1 for every author, so this table ends up with the single key 1.
authorReverseList = {storedValue: authorKey for authorKey, storedValue in authorList.items()}

In [141]:
# Build, for every author, the dictionary of values for the fingerprint page template:
#   author                        -> display name (csv column 8)
#   Topic<k> / Value<k>           -> the author's own topics and probabilities
#   simAuthor<i>                  -> display name of the i-th most similar author, i = 1..4
#   Sim<i>Topic<k> / Sim<i>Value<k> -> that similar author's topics and probabilities
def _topicRowsOf(authorId):
    """Return the aIdList rows at positions [10 * row index, 10 * row index + 10) of ``authorId``.

    Assumes every author owns exactly 10 consecutive rows of aIdList, in the same
    order as the row indices in authorReverseMap -- TODO confirm.
    """
    firstRow = authorReverseMap[authorId] * 10
    return aIdList[firstRow:firstRow + 10]


simAuthorSet = []
for authorId in authorList:
    ownRow = authorReverseMap[authorId]

    # author's information
    page = {'author': UChicagoScopus.get(authorId)[8]}
    for slot, topicRow in enumerate(_topicRowsOf(authorId)):
        page['Topic' + str(slot)] = '"Topic ' + topicRow[2] + '"'
        page['Value' + str(slot)] = topicRow[3]

    # similar authors: ranks 1..4 of the divergence ordering (rank 0 is skipped,
    # presumably because it is the author itself with divergence 0)
    for rank in range(1, 5):
        neighbourId = authorMap[simMatAuthor[ownRow][rank]]
        page['simAuthor' + str(rank)] = UChicagoScopus.get(neighbourId)[8]
        prefix = 'Sim' + str(rank)
        for slot, topicRow in enumerate(_topicRowsOf(neighbourId)):
            page[prefix + 'Topic' + str(slot)] = '"Topic ' + topicRow[2] + '"'
            page[prefix + 'Value' + str(slot)] = topicRow[3]

    simAuthorSet.append(page)

In [136]:
# Generate one html file per author from the page template.
# NOTE(review): the author name is used verbatim as the file name; a name containing
# a path separator would point outside newHtml/ or fail to open.
for authorDataSet in simAuthorSet:
    # Render before opening the file: if substitute() raises (a placeholder without a
    # value gives KeyError), no empty or unclosed file is left behind.
    html = s.substitute(authorDataSet)
    with open('newHtml/'+ authorDataSet['author']+'.html','w') as f:
        f.write(html)