# Title 

#### Imports

In [6]:
from typing import Dict, List, Tuple, Optional
from collections import namedtuple
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functional import seq, pseq
from functional.streams import Sequence
from IPython.core.display import HTML

def display_seq(sequence:Sequence,rows:int)-> None:
        sequence._repr_html_= lambda :sequence.tabulate(rows,tablefmt='html')
        display(sequence)
        sequence._repr_html_= lambda :sequence.tabulate(10,tablefmt='html')

In [7]:
import functools
def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

## Section 1

In [8]:
from util import get_envs, cached_load
base_dir = get_envs()["base_dir"]
base_dir

'/home/owner/blob/data/data_mining/dblp/csv/'

In [9]:
label_name = 'infomap_minor'

In [10]:
subdir = base_dir  + label_name + "/"
os.mkdir(subdir) if not os.path.exists(subdir) else None

infomap_author_community_fn = subdir + "author_community.csv"
infomap_coauthor_community_edges_fn =   subdir + "coauthor_community_edges.csv"

In [15]:
def infomap_minor_author_community():
    infomap_author_community = pd.read_csv(infomap_author_community_fn,header=None)
    infomap_author_community.columns = ['major','minor','key']
    max_major = infomap_author_community.major.max()
    infomap_author_community['label'] = infomap_author_community.major * max_major + infomap_author_community.minor
    infomap_author_community = infomap_author_community.drop(['minor','major'],axis=1)
    return infomap_author_community

In [16]:
infomap= infomap_minor_author_community()
infomap.head()

Unnamed: 0,key,label
0,qiang wang_pers,57628
1,philip h. s. torr_pers,57628
2,jiri matas_pers,57628
3,haibin ling_pers,57628
4,zheng zhang_pers,57628


In [18]:
pers_infomap_minor_dict = seq(infomap).to_dict()
coauthorships = pd.read_csv(base_dir + "/coauthorships.csv",header=None)
coauthorships.columns= ['a','b','count']
coauthorships.head()

Unnamed: 0,a,b,count
0,jurgen annevelink_pers,rafiul ahad_pers,1
1,jurgen annevelink_pers,amelia carlson_pers,1
2,jurgen annevelink_pers,daniel h. fishman_pers,1
3,jurgen annevelink_pers,michael l. heytens_pers,1
4,jurgen annevelink_pers,william kent_pers,3


In [21]:
info_coauthorships = coauthorships.copy()
info_coauthorships['a_label'] = coauthorships.a.apply(lambda x: pers_infomap_minor_dict[x])
info_coauthorships['b_label'] = coauthorships.b.apply(lambda x: pers_infomap_minor_dict[x])
info_coauthorships = info_coauthorships.drop(['a','b'],axis=1)
info_coauthorships = info_coauthorships[info_coauthorships.a_label != info_coauthorships.b_label].dropna()

In [46]:
info_coauthorships.head()

Unnamed: 0,count,a_label,b_label
15,5,518710,518663
17,4,518663,518710
18,1,518710,2708560
20,1,1037289,518937
21,1,1037289,518663


In [36]:
coauthorships['count'].sum()

22617544

In [33]:
from typing import Tuple
def ordered_labels(a,b):
    return min(a,b), max(a,b)
def fix(x:Tuple[int,int],counts):
    a = x
    return (a[0],a[1],seq(counts).map(lambda x: x[0]).sum())

In [49]:
info_coauthorships.head()

Unnamed: 0,count,a_label,b_label
15,5,518710,518663
17,4,518663,518710
18,1,518710,2708560
20,1,1037289,518937
21,1,1037289,518663


In [50]:
info_community_edges = seq(info_coauthorships)\
.group_by(lambda x: ordered_labels(x[1],x[2]))\
.map(lambda a: fix(a[0],a[1]) )\
.to_pandas(columns=['c1','c2','count'])


In [55]:
len(info_community_edges)

2419475

Let's just take edges with at least 10 count

In [59]:
len(info_community_edges)

4.02300003099846

In [65]:
trimmed_info_community_edges = info_community_edges.where(info_community_edges['count'] > 7).dropna()

In [66]:
len(trimmed_info_community_edges) * 100 / len(info_community_edges)

10.989078209115615

In [67]:
np.mean(trimmed_info_community_edges['count'])

19.49700990679936

In [69]:
trimmed_info_community_edges.to_csv(infomap_coauthor_community_edges_fn,index=False)

In [None]:
trimmed_info_community_edges = pd.read_csv(infomap_coauthor_community_edges_fn)

In [71]:
trimmed_info_community_edges.head()

Unnamed: 0,c1,c2,count
0,518663.0,518710.0,40.0
3,518663.0,1037289.0,21.0
13,231022.0,576408.0,9.0
18,518670.0,11928798.0,18.0
19,60100.0,518675.0,12.0


In [70]:
len(trimmed_info_community_edges)

265878

In [76]:
info_community_nodes = seq(trimmed_info_community_edges.c1,trimmed_info_community_edges.c2).flat_map(lambda x: x).distinct().map(lambda x: int(x))

In [83]:
from py2neo import Graph,Node
graph = Graph("bolt://127.0.0.1:7687",auth=("neo4j", "admin"))

In [84]:
graph.run("""
unwind {nodes} as node_id
CREATE (a:community_infomap_minor { key: node_id })
""",nodes=info_community_nodes.to_list())

<py2neo.database.Cursor at 0x7fe2ab76ee50>

In [85]:
info_community_nodes.size()

47056

In [86]:
chunk_nb = int(len(trimmed_info_community_edges) *1.0 / 1000)
chunk_nb

265

In [88]:
import tqdm
for i,chunk in tqdm.tqdm(enumerate(np.array_split(trimmed_info_community_edges,chunk_nb)),total=chunk_nb):
    entries = seq(chunk).map(lambda x : {'a':x[0],'b':x[1],'cnt':x[2]}).to_list()
    tx= graph.begin()
    tx.run("""
unwind {entries} as e
MATCH (a:community_infomap_minor{key:e.a})
with a,e
MATCH (b:community_infomap_minor{key:e.b})
merge (a)-[:COMMUNITY_COAUTHORS_WITH{count:e.cnt}]-(b)
""",entries=entries)
    tx.commit()


  0%|          | 0/265 [00:00<?, ?it/s][A
  0%|          | 1/265 [00:00<02:34,  1.70it/s][A
  1%|          | 2/265 [00:00<02:00,  2.18it/s][A
  1%|          | 3/265 [00:00<01:36,  2.73it/s][A
  2%|▏         | 4/265 [00:01<01:16,  3.41it/s][A
  2%|▏         | 6/265 [00:01<01:00,  4.26it/s][A
  3%|▎         | 7/265 [00:01<00:50,  5.13it/s][A
  3%|▎         | 8/265 [00:01<00:43,  5.91it/s][A
  4%|▍         | 10/265 [00:01<00:36,  7.07it/s][A
  5%|▍         | 12/265 [00:01<00:31,  8.09it/s][A
  5%|▌         | 14/265 [00:01<00:28,  8.76it/s][A
  6%|▌         | 16/265 [00:02<00:25,  9.91it/s][A
  7%|▋         | 18/265 [00:02<00:24, 10.27it/s][A
  8%|▊         | 20/265 [00:02<00:21, 11.43it/s][A
  8%|▊         | 22/265 [00:02<00:20, 11.94it/s][A
  9%|▉         | 24/265 [00:02<00:19, 12.22it/s][A
 10%|▉         | 26/265 [00:02<00:19, 12.46it/s][A
 11%|█         | 28/265 [00:02<00:18, 12.70it/s][A
 11%|█▏        | 30/265 [00:03<00:18, 12.62it/s][A
 12%|█▏        | 32/265 [00