In [3]:
import numpy as np
import pandas as pd

# Load "cleaned_data.csv" from the current working directory.
df = pd.read_csv("cleaned_data.csv")
# Preview the raw author-ID column: each value is a ";"-delimited string of IDs.
df["Author(s) ID"].head()

0    57208326453;57221875424;57221865054;5722100955...
1                                          6603844230;
2                             52763405400;56082843200;
3    57205603988;57215379404;57190290261;5721692686...
4                 57223043474;46661903700;57218503046;
Name: Author(s) ID, dtype: object

In [4]:
def break_to_list(author_ids):
    """Split a ";"-delimited author-ID value into a list of ID strings.

    Parameters
    ----------
    author_ids : str, list, tuple, float or None
        Raw cell value such as ``"52763405400;56082843200;"``. The trailing
        ";" is optional. A list or tuple is treated as already parsed.
        ``None`` and NaN (a missing cell) are treated as "no authors".

    Returns
    -------
    list
        The non-empty, whitespace-stripped IDs in their original order;
        ``[]`` for missing or empty input.
    """
    # Already parsed (e.g. the cell applying this function was run twice):
    # hand back a copy instead of stringifying the list.
    if isinstance(author_ids, (list, tuple)):
        return list(author_ids)

    # NaN is the only float that is not equal to itself.
    if author_ids is None or (isinstance(author_ids, float) and author_ids != author_ids):
        return []

    # Filter empty tokens rather than blindly popping the last element, so an
    # ID is never lost when the string does not end with ";".
    tokens = (token.strip() for token in str(author_ids).split(";"))
    return [token for token in tokens if token]

In [5]:
# Replace each raw ";"-delimited string with the list returned by break_to_list.
# NOTE: this overwrites the raw column in place, so what a re-run of this cell
# produces depends on how break_to_list treats values that are already lists.
df["Author(s) ID"] = df["Author(s) ID"].apply(break_to_list)
# Preview the parsed column.
df["Author(s) ID"].head()

0    [57208326453, 57221875424, 57221865054, 572210...
1                                         [6603844230]
2                           [52763405400, 56082843200]
3    [57205603988, 57215379404, 57190290261, 572169...
4              [57223043474, 46661903700, 57218503046]
Name: Author(s) ID, dtype: object

In [6]:
# Tally, over every paper's author list:
#   author_paper_count : author ID -> number of papers listing that author
#   co_author          : (author ID, author ID) -> number of papers shared
#   author_set         : distinct author IDs in order of first appearance
author_paper_count = {}
co_author = {}
for author_list in df["Author(s) ID"]:
    # Collapse IDs repeated within one paper (order preserved) so the paper is
    # counted once per author and no (a, a) self-pair is ever created.
    paper_authors = list(dict.fromkeys(author_list))
    for i, first in enumerate(paper_authors):
        author_paper_count[first] = author_paper_count.get(first, 0) + 1
        for second in paper_authors[i + 1:]:
            # Pairs are unordered: reuse whichever orientation was stored
            # first, otherwise store the pair as (earlier, later) in this list.
            pair = (second, first) if (second, first) in co_author else (first, second)
            co_author[pair] = co_author.get(pair, 0) + 1

# Dicts keep insertion order, so the keys are already in first-seen order;
# deriving the list here avoids an O(n) list-membership test per author.
author_set = list(author_paper_count)

In [7]:
# Peek at the first ten ((author, author), count) entries in insertion order.
list(co_author.items())[:10]

[(('57208326453', '57221875424'), 1),
 (('57208326453', '57221865054'), 1),
 (('57208326453', '57221009557'), 1),
 (('57208326453', '56974102100'), 1),
 (('57221875424', '57221865054'), 1),
 (('57221875424', '57221009557'), 1),
 (('57221875424', '56974102100'), 1),
 (('57221865054', '57221009557'), 1),
 (('57221865054', '56974102100'), 1),
 (('57221009557', '56974102100'), 1)]

In [8]:
# (author, paper count) pairs ordered from most to fewest papers.
sorted_author_count = sorted(
    author_paper_count.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
# Number of entries, i.e. of distinct keys in author_paper_count.
len(sorted_author_count)

14342

In [9]:
import networkx as nx

# Undirected graph: one node per entry of author_set, one edge per key of
# co_author, with the pair's count stored in the edge's "weight" attribute.
G = nx.Graph()
G.add_nodes_from(author_set)

for (first, second), shared in co_author.items():
    G.add_edge(first, second, weight=shared)

# Ten heaviest edges.
sorted(
    G.edges(data=True),
    key=lambda link: link[2]["weight"],
    reverse=True,
)[:10]

[('57208709138', '57221673353', {'weight': 10}),
 ('57212091944', '57220181337', {'weight': 6}),
 ('57221738247', '57191408081', {'weight': 6}),
 ('57191408081', '6507247374', {'weight': 6}),
 ('7402612526', '57218701161', {'weight': 5}),
 ('7402612526', '57218701087', {'weight': 5}),
 ('57218701161', '57218701087', {'weight': 5}),
 ('57015320300', '7004398913', {'weight': 5}),
 ('26638963500', '57221558760', {'weight': 5}),
 ('36730450600', '57221738247', {'weight': 5})]

In [10]:
# Export the graph (all nodes, including isolated ones) as GEXF for Gephi.
nx.write_gexf(G, "co_author.gexf")

In [11]:
# Nodes that have no edges at all.
isolated_nodes = [node for node in nx.isolates(G)]
print(len(isolated_nodes))

# Drop them from G (in place) and export the trimmed graph for Gephi.
G.remove_nodes_from(isolated_nodes)
nx.write_gexf(G, "co_author_remove_iso.gexf")

426


In [12]:
# Node -> degree (number of incident edges) for every node left in G.
deg = {node: degree for node, degree in G.degree()}

# Ten highest-degree nodes.
sorted(
    deg.items(),
    key=lambda entry: entry[1],
    reverse=True,
)[:10]

[('35234686300', 103),
 ('21833343000', 86),
 ('7402212831', 86),
 ('57208464039', 86),
 ('23972137300', 86),
 ('12141507000', 86),
 ('7004284732', 86),
 ('7006337145', 86),
 ('6603246496', 86),
 ('23005132600', 86)]

In [13]:
# Paper count of the author that tops the degree ranking above.
author_paper_count['35234686300']

2

In [14]:
# Keep only authors with at least two papers. The dict preserves the order of
# sorted_author_count, which it is built from.
author_paper_count_filter = {
    author: count
    for author, count in sorted_author_count
    if count >= 2
}
author_paper_count_filter

{'57208709138': 10,
 '57221673353': 10,
 '36057015500': 8,
 '57212091944': 8,
 '57191408081': 8,
 '7402612526': 7,
 '15127615500': 7,
 '16031087300': 6,
 '57192178208': 6,
 '57221558760': 6,
 '57220181337': 6,
 '57221738247': 6,
 '6507247374': 6,
 '55194163800': 6,
 '37561894100': 6,
 '57218701161': 5,
 '57218701087': 5,
 '57208540261': 5,
 '57015320300': 5,
 '7004398913': 5,
 '26638963500': 5,
 '36730450600': 5,
 '57215274134': 5,
 '36550158900': 5,
 '55540790300': 5,
 '57203396952': 4,
 '57193346253': 4,
 '7102275804': 4,
 '6507058092': 4,
 '57209738143': 4,
 '57195940928': 4,
 '57203666754': 4,
 '57220128905': 4,
 '6506195756': 4,
 '56347687400': 4,
 '7401738392': 4,
 '55918811400': 4,
 '57203958756': 4,
 '7004890572': 4,
 '7006103457': 4,
 '56605566900': 4,
 '57215096761': 4,
 '57216195234': 4,
 '56234283900': 4,
 '36667966200': 4,
 '56891499700': 4,
 '57215379404': 3,
 '8671009200': 3,
 '56115682600': 3,
 '56845382800': 3,
 '57219758386': 3,
 '57193049810': 3,
 '55928575100': 3,
 

In [15]:
import networkx as nx

# Rebuild G restricted to the authors kept in author_paper_count_filter:
# an edge is added only when both of its endpoints are in the filter.
G = nx.Graph()
G.add_nodes_from(author_paper_count_filter)  # iterating a dict yields its keys

for (first, second), shared in co_author.items():
    if first not in author_paper_count_filter or second not in author_paper_count_filter:
        continue
    G.add_edge(first, second, weight=shared)

# Ten heaviest edges of the filtered graph.
sorted(
    G.edges(data=True),
    key=lambda link: link[2]["weight"],
    reverse=True,
)[:10]

[('57208709138', '57221673353', {'weight': 10}),
 ('57212091944', '57220181337', {'weight': 6}),
 ('57191408081', '57221738247', {'weight': 6}),
 ('57191408081', '6507247374', {'weight': 6}),
 ('57191408081', '36730450600', {'weight': 5}),
 ('7402612526', '57218701161', {'weight': 5}),
 ('7402612526', '57218701087', {'weight': 5}),
 ('15127615500', '37561894100', {'weight': 5}),
 ('57221558760', '26638963500', {'weight': 5}),
 ('57221738247', '36730450600', {'weight': 5})]

In [16]:
# Export the current graph G as GEXF so it can be opened in Gephi.
nx.write_gexf(G, "co_author_filter.gexf")

In [17]:
# TODO: label each node with the author's name instead of the raw author ID

In [19]:
# Every edge of G (with its attribute dict) that has author "57197854295" as
# one of its two endpoints; edge[:2] is the (u, v) part of the (u, v, data) tuple.
sub_graph = [
    edge
    for edge in G.edges(data=True)
    if "57197854295" in edge[:2]
]
sub_graph

[('55918811400', '57197854295', {'weight': 1}),
 ('57203958756', '57197854295', {'weight': 1}),
 ('35732954700', '57197854295', {'weight': 2}),
 ('57197854295', '57201013684', {'weight': 1}),
 ('57197854295', '36639415000', {'weight': 1}),
 ('57197854295', '35070838500', {'weight': 1}),
 ('57197854295', '57201009814', {'weight': 1}),
 ('57197854295', '35070872100', {'weight': 1}),
 ('57197854295', '57205435311', {'weight': 1}),
 ('57197854295', '57188925513', {'weight': 2}),
 ('57197854295', '57188923612', {'weight': 2}),
 ('57197854295', '57192089894', {'weight': 2}),
 ('57197854295', '37036085800', {'weight': 2}),
 ('57197854295', '57216199758', {'weight': 2}),
 ('57197854295', '57208696388', {'weight': 1})]

In [21]:
# Distinct endpoints of the selected edges, in order of first appearance
# (dict.fromkeys de-duplicates while keeping insertion order).
sub_graph_node = list(
    dict.fromkeys(endpoint for edge in sub_graph for endpoint in edge[:2])
)
sub_graph_node

['55918811400',
 '57197854295',
 '57203958756',
 '35732954700',
 '57201013684',
 '36639415000',
 '35070838500',
 '57201009814',
 '35070872100',
 '57205435311',
 '57188925513',
 '57188923612',
 '57192089894',
 '37036085800',
 '57216199758',
 '57208696388']

In [32]:
# Map each node of the sub-graph to the titles of the papers that list it as an
# author, in the row order of df.
node_article = {}
for x in sub_graph_node:
    # Boolean mask: True for rows whose author list contains this ID.
    mask = df["Author(s) ID"].apply(lambda y: x in y)
    df_filter = df[mask]
    # Collect the titles directly instead of calling Series.apply purely for
    # its append side effect.
    node_article.setdefault(x, []).extend(df_filter["Title"].tolist())
node_article
    

{'55918811400': ['A framework of computational model for predicting the spread of COVID-19 pandemic in Saudi Arabia',
  'Predicting Hospitals Hygiene Rate during COVID-19 Pandemic',
  'Fuzzy based decision making approach for evaluating the severity of COVID-19 pandemic in cities of Kingdom of Saudi Arabia',
  'Benchmarking Methodology for Selection of Optimal COVID-19 Diagnostic Model Based on Entropy and TOPSIS Methods'],
 '57197854295': ['Helping doctors hasten COVID-19 treatment: Towards a rescue framework for the transfusion of best convalescent plasma to the most critical patients based on biological requirements via ml and novel MCDM methods',
  'COVID-CheXNet: hybrid deep learning framework for identifying COVID-19 virus in chest X-rays images',
  'Benchmarking Methodology for Selection of Optimal COVID-19 Diagnostic Model Based on Entropy and TOPSIS Methods'],
 '57203958756': ['A framework of computational model for predicting the spread of COVID-19 pandemic in Saudi Arabia',


In [36]:
import json
# Persist the author -> titles mapping as JSON. The context manager closes the
# file on exit, so the data is flushed to disk before anything else reads it.
with open("co_author_exception.json", "w") as f:
    json.dump(node_article, f)

In [41]:
# Flatten the per-author title lists into one list (duplicates included) ...
total_papers = [title for titles in node_article.values() for title in titles]
# ... and show the distinct titles.
set(total_papers)


{'A framework of computational model for predicting the spread of COVID-19 pandemic in Saudi Arabia',
 'Automated medical diagnosis of COVID-19 through EfficientNet convolutional neural network',
 'Benchmarking Methodology for Selection of Optimal COVID-19 Diagnostic Model Based on Entropy and TOPSIS Methods',
 'COVID-CheXNet: hybrid deep learning framework for identifying COVID-19 virus in chest X-rays images',
 'Fuzzy based decision making approach for evaluating the severity of COVID-19 pandemic in cities of Kingdom of Saudi Arabia',
 'Helping doctors hasten COVID-19 treatment: Towards a rescue framework for the transfusion of best convalescent plasma to the most critical patients based on biological requirements via ml and novel MCDM methods',
 'Machine Learning Algorithms for Forecasting COVID 19 Confirmed Cases in America',
 'Multi-Biological Laboratory Examination Framework for the Prioritization of Patients with COVID-19 Based on Integrated AHP and Group VIKOR Methods',
 'Predi

In [43]:
# Scratch cell: lower-cases a literal string; it reads no notebook state.
"ANWESH REDDY PADURI".lower()

'anwesh reddy paduri'