<h1 align = "center">DAC Network Construction</h1>

In [1]:
import json

## Author Object

In [2]:
class Author():
    """An author record: display name, assigned id, aliases and paper ids."""

    def __init__(self, name, aid):
        self.name = name
        self.aid = aid
        # Alternative names recorded for this author (filled by add_nickname).
        self.nicknames = []
        # Ids of the papers attributed to this author (filled by add_paper).
        self.paper_ids = []

    def add_nickname(self, name):
        """Remember `name` as an alias; a name already recorded is ignored."""
        if name in self.nicknames:
            return
        self.nicknames.append(name)

    def add_paper(self, pid):
        """Attach paper id `pid`; an id already recorded is ignored."""
        if pid in self.paper_ids:
            return
        self.paper_ids.append(pid)
        

## Paper Object

In [3]:
class Paper():
    """A paper record together with link fields that are populated later."""

    def __init__(self, title, abstract, year, author_names, b_topic, topics, pid, detc, url):
        # Values handed in by the caller, stored unchanged.
        self.title, self.abstract, self.year = title, abstract, year
        self.author_names = author_names
        self.broad_topic, self.topics = b_topic, topics
        self.pid, self.detc, self.url = pid, detc, url

        # Link fields start out empty; author_ids grows through add_author_id.
        self.author_ids, self.citations, self.cited_by = [], [], []

    def add_author_id(self, aid):
        """Append author id `aid` unless the paper already lists it."""
        if aid in self.author_ids:
            return
        self.author_ids.append(aid)
        

## Procedure 1. Read papers 

In [4]:
# Path of the raw paper database (JSON), relative to the notebook.
file_path = "../2016_data/DAC_Entire_DataBase_2016.json"

with open(file_path, "r") as json_file:
    database = json.load(json_file)

# DETC -> Year lookup; when a DETC occurs more than once the last record wins.
stats = dict((record['DETC'], record['Year']) for record in database)

In [5]:
# Sanity check: number of records loaded from the JSON database.
len(database)

1668

In [6]:
# Build one Paper per database record, keyed by its PaperID.
papers = {}
for record in database:
    # Prefer the year stored in the DETC -> year table; fall back to the
    # record's own 'Year' when the DETC is not listed there.
    year = stats.get(record['DETC'], record['Year'])
    paper = Paper(
        record['Title'], record['Abstract'], year, record['Authors'],
        record['Broad_Topic'], record['Topics'], record['PaperID'],
        record['DETC'], record['URL'],
    )
    papers[paper.pid] = paper

In [7]:
# Count papers per publication year.
# NOTE: this rebinds `stats`, which the cell that builds `papers` reads as a
# DETC -> year table; re-running that cell after this one would consult these
# counts instead.
stats = {}

for paper in papers.values():
    stats[paper.year] = stats.get(paper.year, 0) + 1
stats

{2002: 117,
 2003: 143,
 2004: 115,
 2005: 128,
 2006: 118,
 2007: 125,
 2008: 119,
 2009: 122,
 2010: 112,
 2011: 110,
 2012: 123,
 2013: 114,
 2014: 108,
 2015: 114}

## Procedure 2. Read authors

In [8]:
## Collect every distinct author name that appears on any paper.
## The value 1 is only a placeholder; a later cell replaces it with an id.
author_names = {}

for paper in papers.values():
    for author_name in paper.author_names:
        author_names.setdefault(author_name, 1)

In [9]:
# Assign IDs to each author
#
# Ids are the strings "0", "1", ... handed out in the iteration order of
# `author_names`. The disambiguation file read further down refers to authors
# by these ids, so the order must not be changed (e.g. by sorting) without
# regenerating that file.
# `enumerate` replaces the former hand-maintained counter named `id`, which
# shadowed the builtin id() for every later cell.
for next_id, n in enumerate(author_names.keys()):
    author_names[n] = str(next_id)

In [10]:
# Author objects keyed by their string id.
authors = {}

for name, aid in author_names.items():
    authors[aid] = Author(name, aid)

In [11]:
def make_name_to_author_dict(authors):
    """Return a dict mapping each author's `name` to the author object.

    `authors` is a mapping whose values expose a `name` attribute. If two
    values share a name, the one visited last is kept.
    """
    return dict((person.name, person) for person in authors.values())

## 3. Build Connection (between author and paper)

In [12]:
# Lookup table from author name to Author object, used by the linking cells below.
name2author = make_name_to_author_dict(authors)

In [13]:
# Number of distinct author names before disambiguation.
len(name2author)

2528

### Give each author a list of paper ids

In [14]:
# Register every paper on each of its authors.
for paper in papers.values():
    for author_name in paper.author_names:
        name2author[author_name].add_paper(paper.pid)

### Give each paper a list of author ids

In [15]:
# Register each author's id on every paper that lists that author's name.
for paper in papers.values():
    for author_name in paper.author_names:
        matching_aid = name2author[author_name].aid
        paper.add_author_id(matching_aid)

## 4. Name Disambiguation

### Detect similar name pairs

Running the following cell prints pairs of similar names, one pair per line. Each line is tab-separated and formatted as "author_id, name, author_id, name". For each line, if the two names do indeed refer to the same person, then copy and paste the line into the disambiguation file (read further below from `../process_data/dis.txt`).

In [16]:
from fuzzywuzzy import fuzz
import Levenshtein
keys = name2author.keys()
candidates = [name2author[k] for k in keys]
total = len(candidates)

# Compare every unordered pair of authors once and print the pairs whose
# names look alike under any of the three similarity measures.
for i in range(total):
    p1 = candidates[i]
    for j in range(i + 1, total):
        p2 = candidates[j]

        first, second = p1.name, p2.name

        pdist = fuzz.partial_ratio(first, second)
        dist = Levenshtein.distance(first, second)
        lv_ra = Levenshtein.ratio(first, second)

        looks_similar = pdist > 90 or dist <= 2 or lv_ra > 0.8
        if looks_similar:
            # Output columns: id, name, id, name (tab-separated).
            print(p1.aid+"\t"+first+"\t"+p2.aid+"\t"+second)

3	James Allison	463	James T. Allison
5	David J. Gorsich	1769	David Gorsich
8	Q. Cheng	609	Heidi Q. Chen
11	Steve C. Wang	257	C. Wang
12	Niclas Stromberg	220	Niclas Strömberg
12	Niclas Stromberg	679	Niclas Strömberg
13	Yu Gu	18	P. Gu
13	Yu Gu	934	J. Gu
13	Yu Gu	989	Yu Liu
13	Yu Gu	1605	Xu Guo
13	Yu Gu	2400	Y. Fu
18	P. Gu	97	C. Yu
18	P. Gu	934	J. Gu
18	P. Gu	942	Ashwin P. Gurnani
18	P. Gu	1870	W. Hu
18	P. Gu	2400	Y. Fu
23	Le Chen	503	Jie Chen
23	Le Chen	691	Ken Chen
23	Le Chen	1414	Wei Chen
23	Le Chen	2018	Li Chen
23	Le Chen	2291	Wen Chen
28	Tucker J. Marion	837	Tucker Marion
32	J.-C. Léon	2456	J. C. Léon
42	Ashraf Nassef	1404	Ashraf O. Nassef
49	Zhe Zhang	1560	Jie Zhang
52	John Ziegert	1898	John C. Ziegert
53	Shen Lu	2255	Zhen Hu
60	Mian Li	118	Xiang Li
60	Mian Li	458	Jia Li
60	Mian Li	930	Meifang Li
60	Mian Li	1313	Yan Li
60	Mian Li	1339	Ming Li
530	Matthew Watkins	1965	Matthew Parkinson
76	V. Krishnamurthy	771	Vivek Krishnamurthy
79	James L. Mathieson	2364	James J. Mason
93	Chao Qi	16

### Function for merging name1 and name2

In [17]:
def merge(id1, id2, authors, papers):
    """Fold author `id2` into author `id1` and remove `id2` from `authors`.

    Parameters:
        id1, id2: keys into `authors` (the surviving and the absorbed author).
        authors:  dict of author id -> Author; mutated in place.
        papers:   dict of paper id -> Paper; the author_ids of author2's
                  papers are rewritten in place.

    After the call author1 owns all of author2's paper ids, has author2's
    name and every nickname author2 had collected as its own nicknames, and
    every paper of author2 lists `id1` (once) in place of `id2`.

    Raises KeyError if either id is missing from `authors`, or if one of
    author2's paper ids is missing from `papers`.
    """
    if id1 == id2:
        # Nothing to merge; continuing would pop the author out of `authors`.
        return

    author1 = authors[id1]
    author2 = authors[id2]

    # 1. On Author level

    # let 1 has 2's all paper_ids
    for pid in author2.paper_ids:
        author1.add_paper(pid)

    # make 2's name as 1's nickname
    author1.add_nickname(author2.name)
    # Also carry over the names author2 absorbed in earlier merges; otherwise
    # a chain of merges (c -> b, then b -> a) silently loses c's name.
    for nickname in author2.nicknames:
        author1.add_nickname(nickname)

    # 2. On Papers level
    # Make author2's papers that contain author2.id now contain author1.id
    for pid in author2.paper_ids:
        paper = papers[pid]
        rewritten = []
        for x in paper.author_ids:
            x = id1 if x == id2 else x
            # A paper that listed both ids must not end up with id1 twice.
            if x not in rewritten:
                rewritten.append(x)
        paper.author_ids = rewritten

    # remove id2
    authors.pop(id2)

    # Prints the same text as the former print statement with comma-separated
    # items (items joined by single spaces).
    print(" ".join([author1.name, " AND ", author2.name, "ARE MERGED!"]))

### Read from disambiguation file

Think of these name pairs as edges in a graph. We need to find the connected components of that graph; each component is the set of names referring to one person.

In [18]:
import networkx as nx
# Graph whose nodes are author ids and whose edges join ids confirmed to be
# the same person.
G=nx.Graph()
disamb_file_path = "../process_data/dis.txt"

dependency = []  # not read by any cell visible in this notebook
with open(disamb_file_path, "rb") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line); without this the
            # segs[2] lookup below raises IndexError.
            continue
        # Expected columns: id, name, id, name (tab-separated).
        segs = line.split("\t")
        id1 = segs[0]
        id2 = segs[2]
        G.add_edge(int(id1), int(id2))

# One sorted id list per connected component, largest component first.
names = [sorted(list(c)) for c in sorted(nx.connected_components(G), key=len, reverse=True)]

### Perform merging

In [19]:
# Collapse each component onto its smallest id: walk the sorted id list from
# the tail, folding every id into the one just before it.
for name_list in names:
    for idx in range(len(name_list) - 1, 0, -1):
        keep_id = str(name_list[idx - 1])
        drop_id = str(name_list[idx])
        merge(keep_id, drop_id, authors, papers)

Gül E. Okudan  AND  Gül E. Okudan ARE MERGED!
Gül Okudan  AND  Gül E. Okudan ARE MERGED!
Gül E. Okudan Kremer  AND  Gül Okudan ARE MERGED!
Richard J. Malak, Jr.  AND  Richard J. Malak ARE MERGED!
Richard Malak  AND  Richard J. Malak, Jr. ARE MERGED!
Nam H. Kim  AND  Nam Ho Kim ARE MERGED!
Nam-Ho Kim  AND  Nam H. Kim ARE MERGED!
George Fadel  AND  Georges M. Fadel ARE MERGED!
Georges Fadel  AND  George Fadel ARE MERGED!
Niclas Strömberg  AND  Niclas Strömberg ARE MERGED!
Niclas Stromberg  AND  Niclas Strömberg ARE MERGED!
Matthew B. Parkinson  AND  Matthew Parkinson ARE MERGED!
Matthew Watkins  AND  Matthew B. Parkinson ARE MERGED!
Kenneth Chase  AND  Ken W. Chase ARE MERGED!
Kenneth W. Chase  AND  Kenneth Chase ARE MERGED!
Ali Farhang-Mehr  AND  Ali Farhang Mehr ARE MERGED!
A. Farhang-Mehr  AND  Ali Farhang-Mehr ARE MERGED!
Horea T. Ilieş  AND  Horea Ilies ARE MERGED!
Horea T. Ilies  AND  Horea T. Ilieş ARE MERGED!
Jami J. Shah  AND  Jami Shah ARE MERGED!
J. J. Shah  AND  Jami J. Shah 

In [20]:
# Author ids that remain after merging.
authors.keys()

['344',
 '345',
 '346',
 '347',
 '340',
 '341',
 '342',
 '343',
 '348',
 '349',
 '1653',
 '2318',
 '2319',
 '2316',
 '2317',
 '2314',
 '2315',
 '2312',
 '2313',
 '2310',
 '2311',
 '298',
 '299',
 '296',
 '297',
 '294',
 '295',
 '292',
 '293',
 '290',
 '291',
 '2147',
 '270',
 '271',
 '273',
 '274',
 '275',
 '276',
 '277',
 '278',
 '279',
 '1780',
 '2146',
 '2268',
 '2269',
 '2262',
 '2263',
 '2260',
 '2261',
 '2266',
 '2267',
 '2264',
 '2265',
 '2442',
 '2443',
 '2440',
 '2441',
 '1781',
 '2447',
 '2444',
 '2445',
 '2448',
 '2449',
 '2189',
 '108',
 '109',
 '102',
 '103',
 '100',
 '101',
 '106',
 '107',
 '104',
 '105',
 '2046',
 '2047',
 '2044',
 '2045',
 '2042',
 '2043',
 '2041',
 '2048',
 '1372',
 '2038',
 '1001',
 '1210',
 '1375',
 '1655',
 '1374',
 '99',
 '98',
 '91',
 '90',
 '93',
 '92',
 '95',
 '94',
 '97',
 '96',
 '1623',
 '1622',
 '1621',
 '1620',
 '1627',
 '1626',
 '1624',
 '1998',
 '1629',
 '1377',
 '559',
 '558',
 '555',
 '554',
 '557',
 '556',
 '551',
 '550',
 '553',
 '552'

## 5. Network Construction

In [21]:
import networkx as nx

In [22]:
def make_pairs(input_list):
    """Return all index-ordered pairs (input_list[i], input_list[j]), i < j.

    A one-element list yields a single self-pair [(x, x)]; an empty list
    yields []. Note that a later cell in this notebook redefines make_pairs.
    """
    size = len(input_list)
    if size == 1:
        sole = input_list[0]
        return [(sole, sole)]
    return [(input_list[i], input_list[j])
            for i in range(size - 1)
            for j in range(i + 1, size)]

In [23]:
def papers_by_year(papers, inf, sup):
    """Return the papers whose `year` lies in the inclusive range [inf, sup].

    `papers` is a mapping; only its values are inspected, in the mapping's
    iteration order.
    """
    return [paper for paper in papers.values() if inf <= paper.year <= sup]

In [24]:
def make_edges(papers_selected):
    """Concatenate make_pairs(p.author_ids) over the given papers, in order."""
    return [pair
            for paper in papers_selected
            for pair in make_pairs(paper.author_ids)]

In [25]:
def author_network(papers, inf_year, sup_year):
    """Build an undirected networkx graph of co-authorship.

    Papers with inf_year <= year <= sup_year are selected via papers_by_year
    and every author-id pair produced by make_edges becomes an edge.
    Note that a later cell in this notebook redefines author_network with a
    different signature.
    """
    G = nx.Graph()
    selected = papers_by_year(papers, inf_year, sup_year)
    for left_id, right_id in make_edges(selected):
        G.add_edge(left_id, right_id)
    return G

In [26]:
def print_author_node(author_list):
    """Return tab-separated node rows for authors, preceded by a header row.

    Despite the name nothing is printed: the result is a list of strings,
    "id\\tname" first, then "<aid>\\t<name>" for each author in order.
    """
    rows = ["\t".join([author.aid, author.name]) for author in author_list]
    return ["id\tname"] + rows

In [27]:
def print_paper_node(paper_list):
    """Return tab-separated node rows for papers, preceded by a header row.

    Despite the name nothing is printed: the result is a list of strings,
    "id\\ttitle" first, then "<pid>\\t<title>" for each paper in order.
    """
    rows = ["\t".join([paper.pid, paper.title]) for paper in paper_list]
    return ["id\ttitle"] + rows

In [28]:
def print_to_file(info_list, filename):
    """Write each entry of `info_list` to `filename`, one entry per line.

    The file is opened in binary mode, so text entries are encoded as UTF-8
    before writing (the same encoding network_output uses). Entries that are
    already byte strings are written unchanged. Previously a text entry with
    a non-ASCII character (e.g. an accented author name) raised
    UnicodeEncodeError on write.

    Prints "<filename> DONE!" once the file has been written.
    """
    with open(filename, "wb") as f:
        for line in info_list:
            if not isinstance(line, bytes):
                line = line.encode('utf8')
            f.write(line)
            f.write(b"\n")
    # Same text as the former `print filename, "DONE!"` statement.
    print(" ".join([filename, "DONE!"]))

### 5.1 Preparation

In [29]:
class Interpreter():
    """Two-way lookup table between integer keys and their paired values."""

    def __init__(self):
        self.digit_holder = {}    # key -> value, as passed to add()
        self.string_holder = {}   # value -> key, the reverse direction

    def add(self, key, value):
        """Record the pair in both directions."""
        self.digit_holder[key] = value
        self.string_holder[value] = key

    def lookup(self, key):
        """Translate `key` to its counterpart.

        A key whose type is exactly int is looked up in the forward table;
        anything else is looked up in the reverse table. Raises KeyError
        when the key is unknown.
        """
        table = self.digit_holder if type(key) is int else self.string_holder
        return table[key]

In [30]:
# Dense integer indices (0, 1, 2, ...) paired with the real author / paper
# ids, in the iteration order of the respective dict.
fake_aid = Interpreter()
fake_pid = Interpreter()

# enumerate() walks the keys once; indexing authors.keys()[i] inside the loop
# rebuilt the whole key list on every iteration.
for i, aid in enumerate(authors.keys()):
    fake_aid.add(i, aid)

for i, pid in enumerate(papers.keys()):
    fake_pid.add(i, pid)

In [31]:
def make_pairs(input_list):
    """Return all index-ordered pairs (input_list[a], input_list[b]), a < b.

    Lists with fewer than two elements yield []. This definition replaces
    the make_pairs defined in an earlier cell, which returned a self-pair
    for a one-element list.
    """
    count = len(input_list)
    # For count <= 1 the outer range is empty, so the result is [].
    return [
        (input_list[a], input_list[b])
        for a in range(count - 1)
        for b in range(a + 1, count)
    ]

## 6. Author Network Construction

In [32]:
def author_network(authors, papers, start_year, end_year):
    """Return co-authorship edges as tab-separated text rows.

    For every paper with start_year <= year <= end_year, each author-id pair
    from make_pairs(p.author_ids) produces one row
    "<id>\\t<id>\\t10\\t<title>" (the third field is the constant "10").
    `authors` is accepted but not used here.
    """
    edge_list = []
    for p in papers.values():
        if start_year <= p.year <= end_year:
            for first_id, second_id in make_pairs(p.author_ids):
                edge_list.append("\t".join([first_id, second_id, "10", p.title]))
    return edge_list

In [33]:
def author_from_edgelist(edge_list):
    """Return the distinct endpoint ids found in tab-separated edge rows.

    The first two tab-separated fields of each row are taken as the two
    endpoints. The result is a list with no duplicates; its order is that
    of the underlying set and should not be relied on.
    """
    endpoints = set()
    for edge in edge_list:
        fields = edge.split("\t")
        endpoints.update((fields[0], fields[1]))
    return list(endpoints)

In [34]:
def network_output(authors, papers, start_year, end_year, edge_file, node_file,
                   max_edges=50, max_nodes=50):
    """Write a co-authorship edge file and the matching node file.

    Parameters:
        authors:    dict of author id -> Author (used to look up names).
        papers:     dict of paper id -> Paper.
        start_year, end_year: inclusive year range passed to author_network.
        edge_file:  path for the edge table (columns from/to/weight/paper).
        node_file:  path for the node table (columns ID/Name/Type).
        max_edges:  write at most this many edges; None writes all of them.
                    Defaults to 50, the limit that used to be hard-coded.
        max_nodes:  write at most this many nodes; None writes all of them.
                    Defaults to 50, the limit that used to be hard-coded.

    Nodes are derived from the edges that were actually written. Because
    each edge can introduce two authors, a `max_nodes` smaller than the
    number of distinct endpoints leaves some edges pointing at ids missing
    from the node file; pass max_nodes=None to avoid that.

    Both files are written as UTF-8 bytes. Returns None.
    """
    edge_list = author_network(authors, papers, start_year, end_year)
    # Slicing with None as the upper bound keeps the whole list.
    selected_edges = edge_list[0:max_edges]

    with open(edge_file, "wb") as f:
        f.write(b"from\tto\tweight\tpaper\n")
        for line in selected_edges:
            f.write(line.encode('utf8'))
            f.write(b"\n")
    node_list = author_from_edgelist(selected_edges)

    with open(node_file, "wb") as f:
        f.write(b"ID\tName\tType\n")
        for au in node_list[0:max_nodes]:
            # Every node gets the constant type tag "P".
            line = "\t".join([au, authors[au].name, "P"])
            f.write(line.encode('utf8'))
            f.write(b"\n")
    return

## 10. Export

In [35]:
import pickle

In [36]:
# Dump every author as a plain dict of its attributes.
out_au = [au.__dict__ for au in authors.values()]

with open("../process_data/Author_Data.json", "wb") as json_out:
    json.dump(out_au, json_out)

In [37]:
# Dump every paper as a plain dict of its attributes.
out_pp = [pp.__dict__ for pp in papers.values()]

with open("../process_data/Paper_Data.json", "wb") as json_out:
    json.dump(out_pp, json_out)

In [38]:
# Pickle the whole id -> Author dict; loading it back requires the Author
# class to be defined in the loading session.
with open("../process_data/Author_Data.pickle", "wb") as pickle_out:
    pickle.dump(authors, pickle_out)

In [39]:
# Pickle the whole id -> Paper dict; loading it back requires the Paper
# class to be defined in the loading session.
with open("../process_data/Paper_Data.pickle", "wb") as pickle_out:
    pickle.dump(papers, pickle_out)