In [1]:
import pandas as pd
import numpy as np
import json

# Load the per-paper reference lists. The context manager guarantees the file
# handle is closed even if parsing fails (the bare open() leaked it).
with open("ref_json.json", 'r') as f:
    json_ref = json.load(f)
# Peek at the first reference of entry "0" to show the record layout.
json_ref["0"][0]

{'arxivId': None,
 'authors': [{'authorId': '1902236', 'name': 'B. Neuman'},
  {'authorId': '34855576', 'name': 'B. Adair'},
  {'authorId': '39730804', 'name': 'C. Yoshioka'},
  {'authorId': '3058907', 'name': 'J. Quispe'},
  {'authorId': '8955493', 'name': 'Gretchen Orca'},
  {'authorId': '144823036', 'name': 'P. Kuhn'},
  {'authorId': '2163999', 'name': 'R. Milligan'},
  {'authorId': '122319052', 'name': 'M. Yeager'},
  {'authorId': '4833612', 'name': 'M. Buchmeier'}],
 'doi': '10.1128/JVI.00645-06',
 'intent': ['background'],
 'isInfluential': True,
 'paperId': 'b9d42f32ff34e005655c4a17b3f19e02e6500a15',
 'title': 'Supramolecular Architecture of Severe Acute Respiratory Syndrome Coronavirus Revealed by Electron Cryomicroscopy',
 'url': 'https://www.semanticscholar.org/paper/b9d42f32ff34e005655c4a17b3f19e02e6500a15',
 'venue': 'Journal of Virology',
 'year': 2006}

In [2]:
def find_co_reference(json_ref):
    """Invert the citation mapping.

    Args:
        json_ref: dict mapping a citing-paper key to a list of reference
            records, each a dict carrying a 'paperId' entry.

    Returns:
        dict mapping each referenced 'paperId' to the list of citing-paper
        keys, in the order they are encountered.
    """
    ref_paper = {}
    for citing_key, references in json_ref.items():
        for reference in references:
            # setdefault creates the list on first sight of this paperId
            ref_paper.setdefault(reference['paperId'], []).append(citing_key)
    return ref_paper


In [3]:
# Map every referenced paper to the papers citing it, then show one entry
# followed by the number of distinct referenced papers.
ref_paper = find_co_reference(json_ref)
print(list(ref_paper.items())[0], len(ref_paper), sep="\n")

('b9d42f32ff34e005655c4a17b3f19e02e6500a15', ['0', '3590'])
52704


In [4]:
def cal_edge_weight(co_refs=None):
    """Count, for every pair of citing papers, how many references they share.

    Args:
        co_refs: mapping of referenced-paper id -> list of citing-paper keys
            (the structure returned by ``find_co_reference``). When omitted,
            the notebook-level ``ref_paper`` is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        dict mapping ``(paper_a, paper_b)`` to the number of referenced papers
        whose citing list contains both. Inside a pair, papers keep the order
        they have in the citing list.
    """
    from itertools import combinations  # local import keeps the cell self-contained

    if co_refs is None:
        co_refs = ref_paper  # fall back to the global built in an earlier cell

    edge_dict = {}
    for citing in co_refs.values():
        # combinations() yields the same (i < j) pairs, in the same order,
        # as a nested index loop over the list.
        for pair in combinations(citing, 2):
            edge_dict[pair] = edge_dict.get(pair, 0) + 1
    return edge_dict

In [5]:
import networkx as nx

# Build the bibliographic-coupling graph: one node per paper and one weighted
# edge per pair of papers that share at least one reference.
paper_num = len(json_ref)
G = nx.Graph()
G.add_nodes_from(np.arange(0, paper_num))
edge_weight = cal_edge_weight()
# Paper keys are strings in the JSON; node ids are integers.
for (src, dst), shared in edge_weight.items():
    G.add_edge(int(src), int(dst), weight=shared)
# Rank edges by weight and show the ten most strongly coupled pairs.
sorted_weight_list = sorted(
    G.edges(data=True),
    key=lambda edge: edge[2]['weight'],
    reverse=True,
)
sorted_weight_list[:10]

[(1737, 3060, {'weight': 54}),
 (3138, 3301, {'weight': 54}),
 (1759, 2354, {'weight': 45}),
 (2879, 3450, {'weight': 40}),
 (239, 440, {'weight': 39}),
 (562, 3450, {'weight': 38}),
 (562, 2879, {'weight': 37}),
 (2702, 3128, {'weight': 36}),
 (440, 490, {'weight': 35}),
 (239, 490, {'weight': 34})]

In [6]:
# Ten nodes with the highest weighted degree (sum of incident edge weights).
sorted(
    G.degree(weight="weight"),
    key=lambda node_deg: node_deg[1],
    reverse=True,
)[:10]

[(3842, 1589),
 (833, 1573),
 (799, 1538),
 (3304, 1464),
 (1249, 1322),
 (2951, 1318),
 (1695, 1198),
 (3837, 1170),
 (906, 1119),
 (3461, 1098)]

In [7]:
# Export the coupling graph as GEXF so it can be opened in Gephi.
# At this point G still contains the nodes without any edge.
nx.write_gexf(G, "bibliographic_coupling.gexf")

## Detect topics from this graph

In [8]:
# clustering 
import community as community_louvain

# First compute the best partition. Louvain is randomised, so pin
# random_state (same seed as the later run on the pruned graph) to make the
# result reproducible across kernel restarts.
partition = community_louvain.best_partition(G, weight='weight', random_state=100)

In [9]:
# Inspect the node -> community id mapping returned by best_partition.
partition

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 428,
 6: 0,
 7: 1,
 8: 428,
 9: 6,
 10: 4,
 11: 0,
 12: 1,
 13: 7,
 14: 1,
 15: 1,
 16: 8,
 17: 1,
 18: 9,
 19: 10,
 20: 11,
 21: 1,
 22: 1,
 23: 1,
 24: 1,
 25: 4,
 26: 0,
 27: 12,
 28: 4,
 29: 1,
 30: 13,
 31: 14,
 32: 0,
 33: 0,
 34: 0,
 35: 0,
 36: 15,
 37: 0,
 38: 1,
 39: 12,
 40: 4,
 41: 13,
 42: 13,
 43: 428,
 44: 428,
 45: 1,
 46: 1,
 47: 12,
 48: 428,
 49: 4,
 50: 0,
 51: 16,
 52: 4,
 53: 1,
 54: 428,
 55: 0,
 56: 17,
 57: 18,
 58: 19,
 59: 4,
 60: 12,
 61: 1,
 62: 0,
 63: 0,
 64: 4,
 65: 0,
 66: 0,
 67: 1,
 68: 20,
 69: 21,
 70: 428,
 71: 22,
 72: 428,
 73: 428,
 74: 0,
 75: 12,
 76: 4,
 77: 23,
 78: 0,
 79: 0,
 80: 4,
 81: 24,
 82: 25,
 83: 13,
 84: 428,
 85: 12,
 86: 12,
 87: 428,
 88: 4,
 89: 0,
 90: 4,
 91: 13,
 92: 23,
 93: 12,
 94: 26,
 95: 0,
 96: 0,
 97: 1,
 98: 4,
 99: 1,
 100: 4,
 101: 1,
 102: 1,
 103: 27,
 104: 1,
 105: 1,
 106: 28,
 107: 29,
 108: 4,
 109: 1,
 110: 13,
 111: 30,
 112: 1,
 113: 1,
 114: 0,
 115: 0,
 116: 13,


In [10]:
# Nodes assigned to community 1.
{node: community for node, community in partition.items() if community == 1}

{1: 1,
 7: 1,
 12: 1,
 14: 1,
 15: 1,
 17: 1,
 21: 1,
 22: 1,
 23: 1,
 24: 1,
 29: 1,
 38: 1,
 45: 1,
 46: 1,
 53: 1,
 61: 1,
 67: 1,
 97: 1,
 99: 1,
 101: 1,
 102: 1,
 104: 1,
 105: 1,
 109: 1,
 112: 1,
 113: 1,
 126: 1,
 132: 1,
 151: 1,
 188: 1,
 189: 1,
 191: 1,
 196: 1,
 201: 1,
 210: 1,
 212: 1,
 227: 1,
 271: 1,
 276: 1,
 280: 1,
 281: 1,
 296: 1,
 393: 1,
 398: 1,
 405: 1,
 414: 1,
 415: 1,
 422: 1,
 462: 1,
 465: 1,
 479: 1,
 480: 1,
 484: 1,
 504: 1,
 515: 1,
 535: 1,
 558: 1,
 560: 1,
 561: 1,
 565: 1,
 571: 1,
 572: 1,
 573: 1,
 574: 1,
 582: 1,
 590: 1,
 593: 1,
 600: 1,
 601: 1,
 602: 1,
 609: 1,
 615: 1,
 620: 1,
 623: 1,
 626: 1,
 630: 1,
 637: 1,
 642: 1,
 644: 1,
 645: 1,
 649: 1,
 651: 1,
 652: 1,
 657: 1,
 658: 1,
 659: 1,
 660: 1,
 661: 1,
 662: 1,
 679: 1,
 690: 1,
 703: 1,
 719: 1,
 759: 1,
 838: 1,
 845: 1,
 871: 1,
 946: 1,
 947: 1,
 968: 1,
 1005: 1,
 1006: 1,
 1007: 1,
 1024: 1,
 1030: 1,
 1041: 1,
 1058: 1,
 1061: 1,
 1109: 1,
 1110: 1,
 1117: 1,
 1130: 1,
 

In [11]:
# Distinct community ids, highest first.
sorted(set(partition.values()), reverse=True)

[1389,
 1388,
 1387,
 1386,
 1385,
 1384,
 1383,
 1382,
 1381,
 1380,
 1379,
 1378,
 1377,
 1376,
 1375,
 1374,
 1373,
 1372,
 1371,
 1370,
 1369,
 1368,
 1367,
 1366,
 1365,
 1364,
 1363,
 1362,
 1361,
 1360,
 1359,
 1358,
 1357,
 1356,
 1355,
 1354,
 1353,
 1352,
 1351,
 1350,
 1349,
 1348,
 1347,
 1346,
 1345,
 1344,
 1343,
 1342,
 1341,
 1340,
 1339,
 1338,
 1337,
 1336,
 1335,
 1334,
 1333,
 1332,
 1331,
 1330,
 1329,
 1328,
 1327,
 1326,
 1325,
 1324,
 1323,
 1322,
 1321,
 1320,
 1319,
 1318,
 1317,
 1316,
 1315,
 1314,
 1313,
 1312,
 1311,
 1310,
 1309,
 1308,
 1307,
 1306,
 1305,
 1304,
 1303,
 1302,
 1301,
 1300,
 1299,
 1298,
 1297,
 1296,
 1295,
 1294,
 1293,
 1292,
 1291,
 1290,
 1289,
 1288,
 1287,
 1286,
 1285,
 1284,
 1283,
 1282,
 1281,
 1280,
 1279,
 1278,
 1277,
 1276,
 1275,
 1274,
 1273,
 1272,
 1271,
 1270,
 1269,
 1268,
 1267,
 1266,
 1265,
 1264,
 1263,
 1262,
 1261,
 1260,
 1259,
 1258,
 1257,
 1256,
 1255,
 1254,
 1253,
 1252,
 1251,
 1250,
 1249,
 1248,
 1247,

In [12]:
# Collect the nodes that have no edge at all, then drop them from G in place.
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)
# Remaining node count, followed by the number of removed nodes.
print(G.number_of_nodes(), len(isolated_nodes))

2485 1367


In [13]:
# Re-run Louvain on the graph without isolated nodes; random_state is fixed
# so the partition is reproducible.
partition = community_louvain.best_partition(G, weight='weight',random_state=100)

In [14]:
# Distinct community ids after pruning, highest first.
sorted(set(partition.values()), reverse=True)

[21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [15]:
# Number of distinct communities in the current partition.
len(set(partition.values()))

22

In [16]:
# Store each node's community id on the node under the "cluster" attribute.
nx.set_node_attributes(G, partition, "cluster")

In [17]:
# Spot-check: community id stored on node 0.
G.nodes[0]["cluster"]

0

In [18]:
# Export the pruned graph (isolated nodes removed, "cluster" attribute set).
nx.write_gexf(G, "bibliographic_coupling_remove_isolated.gexf")

In [19]:
# Number of nodes that were removed as isolated.
len(isolated_nodes)

1367

In [20]:
# Load the topic-modelling results (per-paper 1st/2nd topic and their percentages).
df = pd.read_csv("topic_modelling_res.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Title,1st_topic,1st_topic_percentage,2nd_topic,2nd_topic_percentage
0,0,MCNN: A deep learning based rapid diagnosis me...,15.0,29.78,17.0,7.51
1,1,French-language COVID-19 terminology Internati...,5.0,13.18,17.0,12.42
2,2,Exploring working group's psychological subjec...,16.0,16.42,14.0,13.02
3,3,A novel virtual screening procedure identifies...,13.0,41.29,2.0,12.33
4,4,"Geospatial mapping, Epidemiological modelling,...",4.0,25.83,11.0,13.6


In [21]:
# Ids of the nodes still present in G (isolated nodes were removed earlier).
node_list = list(G.nodes)

In [22]:
# Rename the columns positionally; the first column is called "Index" from here on.
# NOTE(review): "Index" is later matched against graph node ids — confirm it holds the paper index.
df.columns = ["Index","Title","1st_topic","1st_topic_percentage","2nd_topic","2nd_topic_percentage"]

In [23]:
# Check the renamed columns.
df.head()

Unnamed: 0,Index,Title,1st_topic,1st_topic_percentage,2nd_topic,2nd_topic_percentage
0,0,MCNN: A deep learning based rapid diagnosis me...,15.0,29.78,17.0,7.51
1,1,French-language COVID-19 terminology Internati...,5.0,13.18,17.0,12.42
2,2,Exploring working group's psychological subjec...,16.0,16.42,14.0,13.02
3,3,A novel virtual screening procedure identifies...,13.0,41.29,2.0,12.33
4,4,"Geospatial mapping, Epidemiological modelling,...",4.0,25.83,11.0,13.6


In [24]:
# Keep only the rows whose "Index" is a node of the current graph.
df_filter = df[df["Index"].isin(node_list)]

In [25]:
# Attach each paper's first topic to its node under the "topics" attribute.
# Passing the Series directly keys the values by the DataFrame's row labels,
# which only works while those labels happen to equal the node ids. Build the
# node -> topic mapping from the "Index" column explicitly instead.
topic_by_node = dict(zip(df_filter["Index"], df_filter["1st_topic"]))
nx.set_node_attributes(G, topic_by_node, "topics")

In [26]:
# Export the graph once more, now carrying the "topics" node attribute as well.
nx.write_gexf(G, "bibliographic_coupling_add_topics.gexf")

In [27]:
def find_topic(cluster_num):
    """Tally the "1st_topic" labels of the papers assigned to one cluster.

    Reads the notebook-level ``partition`` (node -> cluster id) and ``df``
    (table with "Index" and "1st_topic" columns).

    Args:
        cluster_num: cluster id to look up in ``partition``.

    Returns:
        (topic_label_count, total): a dict of topic label -> number of rows,
        built from ``value_counts()``, and the sum of those counts.
    """
    members = [node for node, cluster in partition.items() if cluster == cluster_num]
    member_rows = df.loc[df["Index"].isin(members)]
    topic_label_count = dict(member_rows["1st_topic"].value_counts())
    return topic_label_count, sum(topic_label_count.values())

In [28]:
# Summarise, for every cluster, how its papers are spread over the
# topic-model labels.
cluster_acc = {}
cluster_count = {}
# Iterate over the cluster ids actually present in the partition instead of a
# hard-coded range(22), so the cell keeps working if the clustering changes.
for x in sorted(set(partition.values())):
    topic_label_count, total = find_topic(x)
    cluster_count[x] = total
    if total == 0:
        # No row of df belongs to this cluster: there is no first topic to
        # index and the share would divide by zero.
        cluster_acc[x] = {"dominant_topic": [], "percentage": None}
        continue
    cluster_acc[x] = {
        # Every topic label seen in the cluster, in value_counts() order.
        "dominant_topic": list(topic_label_count.keys()),
        # Share of the first (most frequent) label, formatted as a percentage.
        "percentage": str(round(list(topic_label_count.values())[0] / total * 100, 2)) + "%",
    }
print(cluster_acc)
print(cluster_count)

{0: {'dominant_topic': [15.0, 8.0, 20.0, 4.0, 12.0, 9.0, 7.0, 5.0, 11.0, 1.0, 0.0, 14.0, 19.0, 18.0, 13.0, 17.0, 2.0, 6.0, 10.0], 'percentage': '76.23%'}, 1: {'dominant_topic': [13.0, 15.0, 20.0, 4.0, 12.0, 9.0, 11.0, 7.0, 17.0, 21.0, 0.0, 18.0, 6.0, 5.0, 8.0, 10.0, 3.0, 14.0], 'percentage': '54.12%'}, 2: {'dominant_topic': [4.0, 11.0, 18.0, 7.0, 12.0, 20.0, 21.0, 16.0, 15.0, 10.0, 6.0, 0.0, 14.0, 8.0, 9.0, 1.0, 17.0, 13.0, 2.0, 19.0, 5.0, 3.0], 'percentage': '41.42%'}, 3: {'dominant_topic': [0.0, 12.0, 6.0, 18.0, 14.0, 10.0, 20.0, 16.0, 8.0, 7.0, 3.0, 15.0, 1.0, 19.0, 5.0, 21.0, 11.0, 17.0, 2.0, 9.0, 4.0, 13.0], 'percentage': '33.1%'}, 4: {'dominant_topic': [16.0, 0.0], 'percentage': '50.0%'}, 5: {'dominant_topic': [15.0, 13.0, 21.0, 20.0], 'percentage': '40.0%'}, 6: {'dominant_topic': [8.0, 12.0, 11.0, 15.0, 18.0, 20.0, 16.0, 0.0, 5.0, 1.0, 17.0, 19.0, 4.0, 14.0, 2.0, 21.0], 'percentage': '41.18%'}, 7: {'dominant_topic': [11.0, 18.0], 'percentage': '50.0%'}, 8: {'dominant_topic': [5.

In [29]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html 
# https://complexnetworks.org/

In [30]:
# Invert the partition: cluster id -> list of papers assigned to it.
cluster_result = {}
for paper, cluster in partition.items():
    cluster_result.setdefault(cluster, []).append(paper)
cluster_result

{0: [0,
  6,
  11,
  26,
  32,
  33,
  34,
  35,
  37,
  50,
  55,
  62,
  63,
  65,
  66,
  74,
  78,
  79,
  89,
  95,
  96,
  114,
  115,
  127,
  128,
  129,
  134,
  156,
  161,
  168,
  174,
  176,
  178,
  179,
  217,
  240,
  241,
  249,
  250,
  253,
  254,
  256,
  257,
  259,
  262,
  265,
  291,
  339,
  345,
  346,
  352,
  375,
  384,
  386,
  404,
  406,
  408,
  437,
  466,
  472,
  473,
  476,
  477,
  487,
  513,
  527,
  528,
  548,
  553,
  555,
  557,
  581,
  604,
  611,
  616,
  617,
  619,
  628,
  633,
  635,
  643,
  648,
  655,
  665,
  684,
  698,
  723,
  737,
  752,
  753,
  757,
  798,
  799,
  803,
  822,
  826,
  827,
  828,
  829,
  831,
  832,
  833,
  834,
  835,
  837,
  839,
  840,
  841,
  877,
  902,
  904,
  905,
  906,
  969,
  989,
  992,
  996,
  1018,
  1040,
  1045,
  1050,
  1052,
  1100,
  1160,
  1161,
  1162,
  1212,
  1225,
  1247,
  1249,
  1251,
  1258,
  1291,
  1293,
  1300,
  1332,
  1340,
  1351,
  1376,
  1378,
  1379,
  1381,
 

In [77]:
from nltk.corpus import stopwords
# NLTK's English stop-word list, extended with a few common function words.
stop_words = stopwords.words('english') + ["the", "of", "and", "to", "in"]


In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tf_idf_compute(corpus):
    """Fit a TF-IDF vectorizer on ``corpus``.

    Uses the notebook-level ``stop_words`` list as the vectorizer's stop words.

    Args:
        corpus: iterable of text documents.

    Returns:
        (X, vectorizer): the result of ``fit_transform(corpus)`` and the
        fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(stop_words=stop_words)
    return tfidf.fit_transform(corpus), tfidf

In [81]:
# Load the cleaned paper table and expose its unnamed first column as "Index".
df_clean = pd.read_csv("cleaned_data.csv").rename(columns={'Unnamed: 0': 'Index'})

In [82]:
def get_cluster_corpus(cluster_no):
    """Run TF-IDF over the title + abstract text of one cluster's papers.

    Reads the notebook-level ``df_clean`` and ``cluster_result``.

    Args:
        cluster_no: key into ``cluster_result`` selecting the cluster.

    Returns:
        Whatever ``tf_idf_compute`` returns for the cluster's documents
        (the TF-IDF matrix and the fitted vectorizer).
    """
    in_cluster = df_clean["Index"].isin(cluster_result[cluster_no])
    members = df_clean[in_cluster]
    # One document per paper: title and abstract joined by a single space.
    documents = members['Title'] + ' ' + members['Abstract']
    return tf_idf_compute(documents)

In [89]:
import matplotlib.pyplot as plt
def draw_bar_graph(val,topic):
    """Draw, save and show a horizontal bar chart of keyword counts.

    Args:
        val: dict mapping keyword -> count; keys become the bar labels and
            values the bar lengths.
        topic: cluster identifier, used in the plot title and in the output
            file name ``c<topic>.png``.
    """
    words = list(val.keys())
    portion = list(val.values())

    plt.barh(words, portion)
    plt.title(f"Cluster {topic}")
    plt.xlabel('Frequency')
    plt.ylabel('Keyword')
    # Save before show(), as in the original ordering.
    out_path = f"c{topic}.png"
    plt.savefig(out_path, bbox_inches="tight")
    plt.show()

In [91]:
# For every cluster, take each paper's five highest TF-IDF terms and count how
# often each term shows up among those per-paper top-5 lists.
# Iterate over the clusters that exist instead of a hard-coded range(22).
for i in sorted(cluster_result):
    keyword = {}
    X, vectorizer = get_cluster_corpus(i)
    # The vocabulary is identical for every row, so fetch it once per cluster.
    # get_feature_names() is gone from recent scikit-learn releases; use its
    # replacement when the installed version provides it.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    for j in range(X.shape[0]):
        # Use a dedicated name: `df` is the topic-modelling table that
        # find_topic() reads, and must not be overwritten here.
        row_scores = pd.DataFrame(X[j].T.todense(), index=terms, columns=["TF-IDF"])
        row_scores = row_scores.sort_values('TF-IDF', ascending=False)
        for word in row_scores.head(5).index.tolist():
            keyword[word] = keyword.get(word, 0) + 1
    # Five most frequent terms of the cluster.
    top_keywords = dict(sorted(keyword.items(), key=lambda x: x[1], reverse=True)[:5])
    print(top_keywords)
    # draw_bar_graph(top_keywords,i)


{'covid': 21, 'ct': 21, '19': 15, 'segmentation': 14, 'ray': 12}
{'ace2': 17, 'cov': 15, 'drugs': 12, 'compounds': 12, 'sars': 12}
{'model': 24, 'epidemic': 17, 'data': 16, 'india': 15, 'cases': 14}
{'learning': 68, 'online': 48, 'students': 32, 'digital': 29, 'tracing': 26}
{'fully': 1, 'online': 1, 'team': 1, 'icdec': 1, 'conference': 1}
{'algorithm': 3, 'ray': 2, 'sma': 1, 'images': 1, 'infected': 1}
{'face': 8, 'data': 6, 'framework': 5, 'masks': 5, 'distancing': 5}
{'news': 1, 'data': 1, 'sources': 1, 'model': 1, 'web': 1}
{'news': 17, 'sentiment': 17, 'tweets': 13, 'public': 12, 'social': 11}
{'chapter': 1, 'pandemic': 1, 'online': 1, 'teaching': 1, 'schools': 1}
{'cryptocurrencies': 2, '2016': 2, 'mid': 1, 'study': 1, 'market': 1}
{'passengers': 3, 'boarding': 3, 'airplane': 2, 'solution': 1, 'seats': 1}
{'behavior': 1, 'privacy': 1, 'mobile': 1, 'contextual': 1, 'usage': 1}
{'robot': 2, 'cloud': 1, 'service': 1, 'world': 1, 'computing': 1}
{'level': 1, 'anxiety': 1, 'people': 1