In [None]:
# This python script is uesd to generate the tsv file for the c++ version of AT-Model
#
# original format: pickle file of the corpus
# NOTE: The corpus pickle file should be put in the corpus directory
#       Or you can change the directory in the walk function yourself
#
# output file:
#              corpus.tsv  (Input of the c++ version of AT-Model)
#              authorIdMap.p  (Used to find the author information with the scorpus ID)
#
# Author: Cha Chen
#
# Usage: 
#         Put all the corpus file into the corpus directory (pickle)
#         Go

In [None]:
# Import all the necessary package
# dependency: pickle, logging, numpy, matplotlib, re, nltk, ptm, json, os

In [1]:
import pickle
import logging
from os import walk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop = stopwords.words('english')

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')

from ptm import AuthorTopicModel
from ptm.utils import convert_cnt_to_list, get_top_words

logger = logging.getLogger('AuthorTopicModel')
logger.propagate=False
import json

%matplotlib inline

In [None]:
# read the file

In [2]:
# read all the corpus file from the corpus directory
# You can change the directory here
files = []
for (dirpath, dirnames, filenames) in walk('corpus'): # Change in diretory name in walk function
    files.extend(filenames)
    break

In [3]:
# define a function to extract all the necessary information from the raw text file 
def getDataFromDocument(document_raw,doc,doc_author,authorList,wordList,authorMap,authorIdMap):
    count = len(wordList)
    count_author = len(authorList)
    # get the word list result
    abstractExistFlag = False
    if document_raw.get('abstract',-1) != -1:
        mayBeSentence = document_raw.get('abstract')
        if type(mayBeSentence) != str:
            if type(mayBeSentence) == list:
                sentence = ''
                for tempSentence in mayBeSentence:
                    if type(tempSentence) == dict:
                        sentence = tempSentence.get('#text')
                    else:
                        sentence = sentence + tempSentence
                    docWordList = tokenizer.tokenize(sentence)
            else:
                if mayBeSentence.get('#text'):
                    sentence = mayBeSentence.get('#text')
                    docWordList = tokenizer.tokenize(sentence)
        else:
            sentence = mayBeSentence
            docWordList = tokenizer.tokenize(sentence)
        abstractExistFlag = True
    else:
        docWordList = []

    local_doc_ids = {}

    # remove the stop words
    docWordList = [x.lower() for x in docWordList if x.lower() not in stop and not x.isdigit()]
    for word in docWordList:
        wordId = wordList.get(word, -1)
        if wordId == -1:
            wordList[word] = count
            wordId = count
            count = count + 1
        if local_doc_ids.get(wordId,-1) == -1:
            local_doc_ids[wordId] = 1
        else:
            local_doc_ids[wordId] = local_doc_ids[wordId] + 1

    # get the author list result
    local_doc_author = []
    if document_raw.get('authors',-1) != -1:
        docAuthorList = document_raw.get('authors')
        abstractExistFlag = True
    else:
        abstractExistFlag = False
        docAuthorList = []


    for author in docAuthorList:
        auid = author.get('@auid')
        authorId = authorList.get(auid,-1)
        if authorId == -1:
            authorList[auid] = count_author
            authorId = count_author
            count_author = count_author + 1
            authorMap[auid] = author
            authorIdMap[auid] = author
        local_doc_author.append(authorId)


    if abstractExistFlag and local_doc_ids != {} and local_doc_author != []:
        doc.append(local_doc_ids)
        doc_author.append(local_doc_author)
    return doc,doc_author,authorList,wordList,authorMap,authorIdMap       

In [4]:
# define a function to extract the necessary information from the pickle file
def getDataFromCorpus(corpus_raw,doc,doc_author,authorList,wordList,authorMap,authorIdMap):
    testCorpus = corpus_raw
    print(len(testCorpus))
    for i in range(len(testCorpus)):
        # test if the corpus is str
        if type(testCorpus[i]) == str:
            testCorpus[i] = testCorpus[i].replace('}{','}\r\n{').split('\r\n')
            for document in testCorpus[i]:
                document = json.loads(document)
                doc,doc_author,authorList,wordList,authorMap,authorIdMap = getDataFromDocument(document,doc,doc_author,authorList,wordList,authorMap,authorIdMap)
        else:
            doc,doc_author,authorList,wordList,authorMap,authorIdMap = getDataFromDocument(testCorpus[i],doc,doc_author,authorList,wordList,authorMap,authorIdMap)
    return doc,doc_author,authorList,wordList,authorMap,authorIdMap 

In [8]:
# write author Id map into the pickle file 
pickle.dump(authorIdMap,open("authorIdMapUChicago.p","wb"))

In [5]:
# read all the corpus file into the local variable
doc = []
doc_author = []
authorList = {}
authorMap = {}
authorIdMap = {}
wordList = {}
count = 0
for file in files[-7:]: # you can change the files range here
    print('start: '+ file)
    count = count + 1
    testCorpus = pickle.load(open('corpus/'+file, 'rb'))
    doc,doc_author,authorList,wordList,authorMap,authorIdMap = getDataFromCorpus(testCorpus,doc,doc_author,authorList,wordList,authorMap,authorIdMap)

start: 2010_detailed_scopus_uchicago_entry.p
3407
start: 2011_detailed_scopus_uchicago_entry.p
3270
start: 2012_detailed_scopus_uchicago_entry.p
3298
start: 2013_detailed_scopus_uchicago_entry.p
3100
start: 2014_detailed_scopus_uchicago_entry.p
2999
start: 2015_detailed_scopus_uchicago_entry.p
2473
start: 2016_detailed_scopus_uchicago_entry.p
597


In [6]:
## get the corpus format

In [7]:
doc_ids = []
doc_cnt = []
for i in range(len(doc)):
    local_doc_ids = []
    local_doc_cnt = []
    for j in doc[i]:
        local_doc_ids.append(j)
        local_doc_cnt.append(doc[i][j])
    doc_ids.append(local_doc_ids)
    doc_cnt.append(local_doc_cnt)
corpus = convert_cnt_to_list(doc_ids,doc_cnt)

In [8]:
n_doc = len(corpus)
n_topic = 10
n_author = len(authorList)
n_voca = len(wordList)
max_iter = 50

In [9]:
voca = []
for i in range(len(wordList)):
    voca.append('')
for i in wordList:
    voca[wordList[i]] = i

In [10]:
reverseAuthorList = {}
for i in authorList:
    reverseAuthorList[authorList[i]] = i

In [27]:
# write the tsv file
f = open('corpus_uchicago_small.tsv', 'w')
for i in range(len(corpus)):
    # write the author
    f.write(':'.join([reverseAuthorList[k] for k in doc_author[i]]))
    f.write('\t')
    f.write(':'.join([voca[k] for k in corpus[i]]))
    f.write('\n')
f.close()

In [82]:
# test the url function
# NOTE: To be completed. Get the h-index from the url

62293

In [52]:
import urllib.request
import xml.etree.ElementTree as ET

In [53]:
test = urllib.request.urlopen(testCorpus[0]['authors'][0]['author-url']).read().decode("utf-8") 

In [54]:
root = ET.fromstring(test)

In [55]:
root.tag

'author-retrieval-response'

In [49]:
for url in e.findall('prism:url'):
    print(url)

In [64]:
for x in root[0]:
    print(x)

<Element '{http://purl.org/dc/elements/1.1/}identifier' at 0x1201ef598>
<Element 'eid' at 0x1201ef3b8>
<Element '{http://prismstandard.org/namespaces/basic/2.0/}url' at 0x1201efef8>
<Element 'link' at 0x1201eff48>
<Element 'link' at 0x1201eff98>


In [72]:
root[0][3].attrib['href']

'https://www.scopus.com/authid/detail.url?partnerID=HzOxMe3b&authorId=57060814600&origin=inward'

In [76]:
page = urllib.request.urlopen('https://www.google.com/#q=sf').read()

In [77]:
page

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2016/lotte-reinigers-117th-birthday-5079873255112704-hp.gif" itemprop="image"><meta content="Lotte Reiniger&#8217;s 117th birthday! #GoogleDoodle" property="og:description"><meta content="http://www.google.com/logos/doodles/2016/lotte-reinigers-117th-birthday-5079873255112704.5-thp.png" property="og:image"><meta content="391" property="og:image:width"><meta content="220" property="og:image:height"><title>Google</title><script>(function(){window.google={kEI:\'dtdPV8jyL8PWyQL0pa2YDg\',kEXPI:\'1350654,1350947,3700276,3700389,4028875,4029370,4029815,4031109,4032677,4036509,40365