# Web scraping, Stage 11

Making the initial graph

Author csv structure:
- name: string (full name)
- scholar_id: string (Scholar id)
- gender: M/F
- role: string (role in affiliation)
- affiliation: string
- citedby: int
- citedby5y: int
- hindex: int
- hindex5y: int
- i10index: int
- i10index5y: int
- num_publications: int
- cites_per_year: string, e.g. ... 2018-954 2019-1048 ...
- interests: string, e.g. Wireless_Networks IoT

Coauthor csv structure:
- name: string (full name)
- scholar_id: string (Scholar id)
+ affiliation: string
+ citedby: int
+ citedby5y: int
+ hindex: int
+ hindex5y: int
+ i10index: int
+ i10index5y: int
+ num_publications: int

Edges csv structure:
- node1: string (author full name)
- node2: string (coauthor full name)

In [1]:
import json
import pandas as pd

In [2]:
# Load the outputs of the previous stages. Context managers make sure both
# file handles are closed as soon as the JSON has been parsed.
with open('../stage10/uni_authors.json') as authors_file:
    authors = json.load(authors_file)
with open('../stage9/coauthors.json') as coauthors_file:
    coauthors = json.load(coauthors_file)

Set custom affiliation to authors

In [3]:
# Give every author record an affiliation derived from the university group
# it is listed under.
for group in authors:
    for person in group['authors']:
        person.update(affiliation='University of ' + group['university'].capitalize())

In [4]:
# Collapse the per-university groups into one flat list of author records.
authors_flat = [person for group in authors for person in group['authors']]

In [5]:
# Sanity check: total number of author records across all university groups.
len(authors_flat)

340

In [6]:
# One row per author record; the columns come from the keys of the author records.
author_df = pd.DataFrame(authors_flat)
author_df

Unnamed: 0,scholar_id,name,affiliation,gender,role,url_picture,coauthors,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,interests,cites_per_year
0,72e5VYEAAAAJ,Steven M. LaValle,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[1NyT9gQAAAAJ, CPze844AAAAJ, jkRa2LEAAAAJ, -JP...",158,91,59,34,30593,12679,297,"[Virtual reality, robotics, sensor fusion, mot...","{'1999': 100, '2000': 132, '2001': 196, '2002'..."
1,9P2jyr8AAAAJ,Timo Ojala,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[bjEpXBoAAAAJ, sxK1vQwAAAAJ, JnWGV4kAAAAJ, xFj...",110,60,41,28,33500,16240,208,"[Virtual reality, hybrid reality, ubiquitous c...","{'2001': 92, '2002': 132, '2003': 145, '2004':..."
2,8aM6EeEAAAAJ,Jukka Riekki,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[f-jBRoIAAAAJ, Ij_W2wsAAAAJ, GA4D9f8AAAAJ, 6Dd...",113,46,32,22,4011,1767,299,"[edge computing, distributed artificial intell...","{'2000': 15, '2001': 19, '2002': 7, '2003': 23..."
3,d4rhcDAAAAAJ,Denzil Ferreira,University of Oulu,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[JnWGV4kAAAAJ, SkQ6OisAAAAJ, ydA8Q5AAAAAJ, Qzl...",49,47,28,27,2981,2484,110,"[Sensors and Instrumentation, Mobile and Ubiqu...","{'2012': 36, '2013': 65, '2014': 135, '2015': ..."
4,lwakBbMAAAAJ,Georgi V. Georgiev,University of Oulu,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[nguLAo0AAAAJ, GA4D9f8AAAAJ, 8aM6EeEAAAAJ, J13...",10,7,10,8,317,208,101,"[design creativity, digital fabrication, idea ...","{'2007': 3, '2008': 1, '2009': 9, '2010': 11, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,AiqUHA8AAAAJ,Athanasios Kehagias,University of Thessaloniki,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[IxCZDBQAAAAJ, 0SQP4bwAAAAJ, 3RiPf3wAAAAJ, bo7...",48,21,29,15,2249,647,120,"[Applied Mathematics, mathematical models, pro...","{'1995': 11, '1996': 17, '1997': 24, '1998': 3..."
336,TmGjsAoAAAAJ,George Sergiadis,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,[],36,22,19,13,1867,921,108,[Biomedical Engineering],"{'2004': 7, '2005': 6, '2006': 9, '2007': 43, ..."
337,YWalTEUAAAAJ,Dimitrios Chrissoulidis,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,[],7,3,8,5,216,73,32,[],"{'2000': 1, '2001': 3, '2002': 4, '2003': 7, '..."
338,FWPjh-AAAAAJ,Charis Demoulias,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,"[LIuIdj0AAAAJ, y6UkvosAAAAJ, bitSbpYAAAAJ, jY_...",33,25,19,18,1232,872,78,[Electrical engineering],"{'2001': 5, '2002': 6, '2003': 7, '2004': 9, '..."


In [7]:
# Only the entries stored under the 'successful' key are used.
# NOTE(review): presumably these are the coauthor profiles whose scrape succeeded
# in stage 9 -- confirm against that stage.
coauthor_df = pd.DataFrame(coauthors['successful'])
coauthor_df

Unnamed: 0,scholar_id,name,affiliation,url_picture,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,coauthors
0,stT2bnoAAAAJ,Gunther Eggeler,"Institute for Materials, Ruhr-University Bochu...",https://scholar.googleusercontent.com/citation...,310,225,72,49,21876,12366,742,"[W-52j5AAAAAJ, FrOUKW4AAAAJ, EGDi4BMAAAAJ, Qav..."
1,_bs1TE0AAAAJ,Lidija Petkovska,"Professor of Electrical Machines and Drives, S...",https://scholar.googleusercontent.com/citation...,15,5,11,7,689,243,148,[S-4AR8AAAAAJ]
2,WWKtZZYAAAAJ,James Dedrick,University of York,,8,6,9,8,246,193,56,"[y_qoKAoAAAAJ, CcrxVa8AAAAJ, 9DnjXXkAAAAJ, xmt..."
3,C8Ir-n4AAAAJ,Finn Kuusisto,FANTM,https://scholar.googleusercontent.com/citation...,7,7,7,7,143,127,32,"[okf5bmQAAAAJ, UMGysigAAAAJ, tFdKEgEAAAAJ, 0fA..."
4,NlBmV2QAAAAJ,João Saraiva,HASLab / INESC TEC and Universidade do Minho,https://scholar.googleusercontent.com/citation...,62,27,27,18,2435,1085,205,"[b-zzik0AAAAJ, fYCE5-sAAAAJ, 7VPNqiEAAAAJ, EEe..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2315,2vrAze8AAAAJ,Marco Jacobs,Marco Jacobs Consulting,https://scholar.googleusercontent.com/citation...,4,2,5,3,232,72,27,"[1FQwYvoAAAAJ, ni6owYQAAAAJ, 1Us1N7AAAAAJ]"
2316,KLE-AyEAAAAJ,Jacques Robin,Université Paris 1 Panthéon-Sorbonne,https://scholar.googleusercontent.com/citation...,30,7,18,9,1387,231,69,"[ujDhg2sAAAAJ, Is0pLz0AAAAJ, _ZkpywYAAAAJ, 2Px..."
2317,J8tTJP8AAAAJ,Irène Durand,University of Bordeaux,,13,2,11,4,419,75,68,[]
2318,E1gfeiMAAAAJ,Themistoklis Diamantopoulos,Research Fellow at Aristotle University of The...,,8,8,9,9,227,208,44,"[339uVZQAAAAJ, zVNCDMsAAAAJ, KRG5Mg8AAAAJ, S3C..."


Do we still have authors in coauthors?

In [8]:
# Names that occur both among the coauthors and among the authors.
common_names = set(coauthor_df['name']) & set(author_df['name'])
common_names

{'Adam Wojciechowski'}

In [9]:
# Scholar IDs that occur both among the coauthors and among the authors.
common_ids = set(coauthor_df['scholar_id']) & set(author_df['scholar_id'])
common_ids

set()

Let's investigate.

In [10]:
# Author rows carrying the name that also shows up among the coauthors.
author_df.loc[author_df['name'] == 'Adam Wojciechowski']

Unnamed: 0,scholar_id,name,affiliation,gender,role,url_picture,coauthors,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,interests,cites_per_year
229,mRiW0csAAAAJ,Adam Wojciechowski,University of Lodz,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,[],8,5,9,8,269,196,74,"[human-computer interaction, computer graphics...","{'2006': 4, '2007': 5, '2008': 4, '2009': 3, '..."


In [11]:
# Coauthor rows carrying the same name as the author above.
coauthor_df.loc[coauthor_df['name'] == 'Adam Wojciechowski']

Unnamed: 0,scholar_id,name,affiliation,url_picture,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,coauthors
105,WBhGYE8AAAAJ,Adam Wojciechowski,Poznan University of Technology,,13,7,12,7,764,203,46,[]
368,g628b44AAAAJ,Adam Wojciechowski,Instytut Logistyki i Magazynowania,,3,3,5,5,208,107,214,[]


We will rename the Adam Wojciechowski coauthors to Adam Wojciechowski(2) and Adam Wojciechowski(3).

In [12]:
# Disambiguate the two coauthors who share their name with an author.
# Rows are selected by Scholar ID instead of by hard-coded row label, so the
# rename still hits the right people if the coauthor data is reordered or re-scraped.
coauthor_df.loc[coauthor_df['scholar_id'] == 'WBhGYE8AAAAJ', 'name'] = 'Adam Wojciechowski(2)'
coauthor_df.loc[coauthor_df['scholar_id'] == 'g628b44AAAAJ', 'name'] = 'Adam Wojciechowski(3)'

In [13]:
# Show the names now stored for the two renamed Scholar IDs.
tuple(
    coauthor_df.loc[coauthor_df['scholar_id'] == scholar_id, 'name']
    for scholar_id in ('WBhGYE8AAAAJ', 'g628b44AAAAJ')
)

(105    Adam Wojciechowski(2)
 Name: name, dtype: object,
 368    Adam Wojciechowski(3)
 Name: name, dtype: object)

## Create author csv

In [14]:
def format_interests(l):
    """Flatten a list of interest strings into one space-separated string.

    Spaces inside an individual interest are replaced by underscores so that
    each interest stays a single token, e.g.
    ['Wireless Networks', 'IoT'] -> 'Wireless_Networks IoT'.
    An empty list yields an empty string.
    """
    return ' '.join(topic.replace(' ', '_') for topic in l)

def format_cites(d):
    """Serialize a {year: citation count} mapping into one string.

    Every entry becomes a 'year-count' token and the tokens are joined with
    single spaces in the mapping's iteration order, e.g.
    {'2018': 954, '2019': 1048} -> '2018-954 2019-1048'.
    Keys must be strings; an empty mapping yields an empty string.
    """
    tokens = []
    for year, count in d.items():
        tokens.append(year + '-' + str(count))
    return ' '.join(tokens)

In [15]:
# Turn the list-valued / dict-valued columns into plain strings for the CSV.
# Run this cell once per fresh author_df: the formatters expect the raw
# list / dict values, not already formatted strings.
for column, formatter in (('interests', format_interests),
                          ('cites_per_year', format_cites)):
    author_df[column] = author_df[column].map(formatter)
author_df

Unnamed: 0,scholar_id,name,affiliation,gender,role,url_picture,coauthors,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,interests,cites_per_year
0,72e5VYEAAAAJ,Steven M. LaValle,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[1NyT9gQAAAAJ, CPze844AAAAJ, jkRa2LEAAAAJ, -JP...",158,91,59,34,30593,12679,297,Virtual_reality robotics sensor_fusion motion_...,1999-100 2000-132 2001-196 2002-236 2003-300 2...
1,9P2jyr8AAAAJ,Timo Ojala,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[bjEpXBoAAAAJ, sxK1vQwAAAAJ, JnWGV4kAAAAJ, xFj...",110,60,41,28,33500,16240,208,Virtual_reality hybrid_reality ubiquitous_comp...,2001-92 2002-132 2003-145 2004-206 2005-326 20...
2,8aM6EeEAAAAJ,Jukka Riekki,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[f-jBRoIAAAAJ, Ij_W2wsAAAAJ, GA4D9f8AAAAJ, 6Dd...",113,46,32,22,4011,1767,299,edge_computing distributed_artificial_intellig...,2000-15 2001-19 2002-7 2003-23 2004-35 2005-55...
3,d4rhcDAAAAAJ,Denzil Ferreira,University of Oulu,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[JnWGV4kAAAAJ, SkQ6OisAAAAJ, ydA8Q5AAAAAJ, Qzl...",49,47,28,27,2981,2484,110,Sensors_and_Instrumentation Mobile_and_Ubiquit...,2012-36 2013-65 2014-135 2015-218 2016-312 201...
4,lwakBbMAAAAJ,Georgi V. Georgiev,University of Oulu,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[nguLAo0AAAAJ, GA4D9f8AAAAJ, 8aM6EeEAAAAJ, J13...",10,7,10,8,317,208,101,design_creativity digital_fabrication idea_gen...,2007-3 2008-1 2009-9 2010-11 2011-19 2012-17 2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,AiqUHA8AAAAJ,Athanasios Kehagias,University of Thessaloniki,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[IxCZDBQAAAAJ, 0SQP4bwAAAAJ, 3RiPf3wAAAAJ, bo7...",48,21,29,15,2249,647,120,Applied_Mathematics mathematical_models probab...,1995-11 1996-17 1997-24 1998-38 1999-22 2000-1...
336,TmGjsAoAAAAJ,George Sergiadis,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,[],36,22,19,13,1867,921,108,Biomedical_Engineering,2004-7 2005-6 2006-9 2007-43 2008-46 2009-78 2...
337,YWalTEUAAAAJ,Dimitrios Chrissoulidis,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,[],7,3,8,5,216,73,32,,2000-1 2001-3 2002-4 2003-7 2004-11 2005-4 200...
338,FWPjh-AAAAAJ,Charis Demoulias,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,"[LIuIdj0AAAAJ, y6UkvosAAAAJ, bitSbpYAAAAJ, jY_...",33,25,19,18,1232,872,78,Electrical_engineering,2001-5 2002-6 2003-7 2004-9 2005-3 2006-9 2007...


Filter duplicates

In [16]:
# Keep only the first row for every Scholar ID.
author_df = author_df.drop_duplicates(subset='scholar_id')
author_df

Unnamed: 0,scholar_id,name,affiliation,gender,role,url_picture,coauthors,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,interests,cites_per_year
0,72e5VYEAAAAJ,Steven M. LaValle,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[1NyT9gQAAAAJ, CPze844AAAAJ, jkRa2LEAAAAJ, -JP...",158,91,59,34,30593,12679,297,Virtual_reality robotics sensor_fusion motion_...,1999-100 2000-132 2001-196 2002-236 2003-300 2...
1,9P2jyr8AAAAJ,Timo Ojala,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[bjEpXBoAAAAJ, sxK1vQwAAAAJ, JnWGV4kAAAAJ, xFj...",110,60,41,28,33500,16240,208,Virtual_reality hybrid_reality ubiquitous_comp...,2001-92 2002-132 2003-145 2004-206 2005-326 20...
2,8aM6EeEAAAAJ,Jukka Riekki,University of Oulu,M,Professor,https://scholar.google.com/citations?view_op=m...,"[f-jBRoIAAAAJ, Ij_W2wsAAAAJ, GA4D9f8AAAAJ, 6Dd...",113,46,32,22,4011,1767,299,edge_computing distributed_artificial_intellig...,2000-15 2001-19 2002-7 2003-23 2004-35 2005-55...
3,d4rhcDAAAAAJ,Denzil Ferreira,University of Oulu,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[JnWGV4kAAAAJ, SkQ6OisAAAAJ, ydA8Q5AAAAAJ, Qzl...",49,47,28,27,2981,2484,110,Sensors_and_Instrumentation Mobile_and_Ubiquit...,2012-36 2013-65 2014-135 2015-218 2016-312 201...
4,lwakBbMAAAAJ,Georgi V. Georgiev,University of Oulu,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[nguLAo0AAAAJ, GA4D9f8AAAAJ, 8aM6EeEAAAAJ, J13...",10,7,10,8,317,208,101,design_creativity digital_fabrication idea_gen...,2007-3 2008-1 2009-9 2010-11 2011-19 2012-17 2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,AiqUHA8AAAAJ,Athanasios Kehagias,University of Thessaloniki,M,Associate Professor,https://scholar.google.com/citations?view_op=m...,"[IxCZDBQAAAAJ, 0SQP4bwAAAAJ, 3RiPf3wAAAAJ, bo7...",48,21,29,15,2249,647,120,Applied_Mathematics mathematical_models probab...,1995-11 1996-17 1997-24 1998-38 1999-22 2000-1...
336,TmGjsAoAAAAJ,George Sergiadis,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,[],36,22,19,13,1867,921,108,Biomedical_Engineering,2004-7 2005-6 2006-9 2007-43 2008-46 2009-78 2...
337,YWalTEUAAAAJ,Dimitrios Chrissoulidis,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,[],7,3,8,5,216,73,32,,2000-1 2001-3 2002-4 2003-7 2004-11 2005-4 200...
338,FWPjh-AAAAAJ,Charis Demoulias,University of Thessaloniki,M,Professor,https://scholar.google.com/citations?view_op=m...,"[LIuIdj0AAAAJ, y6UkvosAAAAJ, bitSbpYAAAAJ, jY_...",33,25,19,18,1232,872,78,Electrical_engineering,2001-5 2002-6 2003-7 2004-9 2005-3 2006-9 2007...


In [17]:
# Write the author table without the nested `coauthors` ID lists.
(
    author_df
    .drop(columns=['coauthors'])
    .to_csv(path_or_buf='authors.csv', index=False)
)

## Create coauthor csv

In [18]:
# Keep only the first row for every Scholar ID.
coauthor_df = coauthor_df.drop_duplicates(subset='scholar_id')
coauthor_df

Unnamed: 0,scholar_id,name,affiliation,url_picture,i10index,i10index5y,hindex,hindex5y,citedby,citedby5y,num_publications,coauthors
0,stT2bnoAAAAJ,Gunther Eggeler,"Institute for Materials, Ruhr-University Bochu...",https://scholar.googleusercontent.com/citation...,310,225,72,49,21876,12366,742,"[W-52j5AAAAAJ, FrOUKW4AAAAJ, EGDi4BMAAAAJ, Qav..."
1,_bs1TE0AAAAJ,Lidija Petkovska,"Professor of Electrical Machines and Drives, S...",https://scholar.googleusercontent.com/citation...,15,5,11,7,689,243,148,[S-4AR8AAAAAJ]
2,WWKtZZYAAAAJ,James Dedrick,University of York,,8,6,9,8,246,193,56,"[y_qoKAoAAAAJ, CcrxVa8AAAAJ, 9DnjXXkAAAAJ, xmt..."
3,C8Ir-n4AAAAJ,Finn Kuusisto,FANTM,https://scholar.googleusercontent.com/citation...,7,7,7,7,143,127,32,"[okf5bmQAAAAJ, UMGysigAAAAJ, tFdKEgEAAAAJ, 0fA..."
4,NlBmV2QAAAAJ,João Saraiva,HASLab / INESC TEC and Universidade do Minho,https://scholar.googleusercontent.com/citation...,62,27,27,18,2435,1085,205,"[b-zzik0AAAAJ, fYCE5-sAAAAJ, 7VPNqiEAAAAJ, EEe..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2315,2vrAze8AAAAJ,Marco Jacobs,Marco Jacobs Consulting,https://scholar.googleusercontent.com/citation...,4,2,5,3,232,72,27,"[1FQwYvoAAAAJ, ni6owYQAAAAJ, 1Us1N7AAAAAJ]"
2316,KLE-AyEAAAAJ,Jacques Robin,Université Paris 1 Panthéon-Sorbonne,https://scholar.googleusercontent.com/citation...,30,7,18,9,1387,231,69,"[ujDhg2sAAAAJ, Is0pLz0AAAAJ, _ZkpywYAAAAJ, 2Px..."
2317,J8tTJP8AAAAJ,Irène Durand,University of Bordeaux,,13,2,11,4,419,75,68,[]
2318,E1gfeiMAAAAJ,Themistoklis Diamantopoulos,Research Fellow at Aristotle University of The...,,8,8,9,9,227,208,44,"[339uVZQAAAAJ, zVNCDMsAAAAJ, KRG5Mg8AAAAJ, S3C..."


In [19]:
# Write the coauthor table without the nested `coauthors` ID lists.
(
    coauthor_df
    .drop(columns=['coauthors'])
    .to_csv(path_or_buf='coauthors.csv', index=False)
)

## Create edge list

In [20]:
# Narrow both tables down to the columns needed for building edges.
author4edges = author_df.loc[:, ['scholar_id', 'name', 'coauthors']]
coauthor4edges = coauthor_df.loc[:, ['scholar_id', 'name']]

In [21]:
# Preview: Scholar ID, name and coauthor ID list of every author.
author4edges

Unnamed: 0,scholar_id,name,coauthors
0,72e5VYEAAAAJ,Steven M. LaValle,"[1NyT9gQAAAAJ, CPze844AAAAJ, jkRa2LEAAAAJ, -JP..."
1,9P2jyr8AAAAJ,Timo Ojala,"[bjEpXBoAAAAJ, sxK1vQwAAAAJ, JnWGV4kAAAAJ, xFj..."
2,8aM6EeEAAAAJ,Jukka Riekki,"[f-jBRoIAAAAJ, Ij_W2wsAAAAJ, GA4D9f8AAAAJ, 6Dd..."
3,d4rhcDAAAAAJ,Denzil Ferreira,"[JnWGV4kAAAAJ, SkQ6OisAAAAJ, ydA8Q5AAAAAJ, Qzl..."
4,lwakBbMAAAAJ,Georgi V. Georgiev,"[nguLAo0AAAAJ, GA4D9f8AAAAJ, 8aM6EeEAAAAJ, J13..."
...,...,...,...
335,AiqUHA8AAAAJ,Athanasios Kehagias,"[IxCZDBQAAAAJ, 0SQP4bwAAAAJ, 3RiPf3wAAAAJ, bo7..."
336,TmGjsAoAAAAJ,George Sergiadis,[]
337,YWalTEUAAAAJ,Dimitrios Chrissoulidis,[]
338,FWPjh-AAAAAJ,Charis Demoulias,"[LIuIdj0AAAAJ, y6UkvosAAAAJ, bitSbpYAAAAJ, jY_..."


In [22]:
# Preview: Scholar ID and name of every coauthor.
coauthor4edges

Unnamed: 0,scholar_id,name
0,stT2bnoAAAAJ,Gunther Eggeler
1,_bs1TE0AAAAJ,Lidija Petkovska
2,WWKtZZYAAAAJ,James Dedrick
3,C8Ir-n4AAAAJ,Finn Kuusisto
4,NlBmV2QAAAAJ,João Saraiva
...,...,...
2315,2vrAze8AAAAJ,Marco Jacobs
2316,KLE-AyEAAAAJ,Jacques Robin
2317,J8tTJP8AAAAJ,Irène Durand
2318,E1gfeiMAAAAJ,Themistoklis Diamantopoulos


In [23]:
# Build the edge list: one [author name, coauthor name] pair for every coauthor
# ID listed on an author. IDs are resolved through dictionaries built once up
# front instead of filtering a DataFrame for every single ID.
coauthor_name_by_id = dict(zip(coauthor4edges['scholar_id'], coauthor4edges['name']))
author_name_by_id = dict(zip(author4edges['scholar_id'], author4edges['name']))

edges = []
for author_name, coauthor_ids in zip(author4edges['name'], author4edges['coauthors']):
    for ca_id in coauthor_ids:
        if ca_id in coauthor_name_by_id:
            ca_name = coauthor_name_by_id[ca_id]
        elif ca_id in author_name_by_id:
            # Not in the coauthor table: the ID belongs to one of the authors.
            ca_name = author_name_by_id[ca_id]
        else:
            # Same exception type as the failed `.item()` lookup it replaces,
            # but with a message that says which ID could not be resolved.
            raise ValueError(
                'coauthor id %r of author %r not found among coauthors or authors'
                % (ca_id, author_name)
            )
        edges.append([author_name, ca_name])

We also drop duplicates just to make sure

In [24]:
# Tabulate the name pairs and drop exact repeats of the same (node1, node2) row.
edges_df = pd.DataFrame(data=edges, columns=['node1', 'node2'])
edges_df = edges_df.drop_duplicates()

In [25]:
# Preview the de-duplicated edge list.
edges_df

Unnamed: 0,node1,node2
0,Steven M. LaValle,James Kuffner
1,Steven M. LaValle,Anna Yershova
2,Steven M. LaValle,Jingjin Yu
3,Steven M. LaValle,seth hutchinson
4,Steven M. LaValle,Jason O'Kane
...,...,...
3121,Charis Demoulias,Dimitar Bozalakov
3122,Charis Demoulias,Lieven Vandevelde
3123,Charis Demoulias,Jose Luis Martinez Ramos
3124,Charis Demoulias,Milos Cvetkovic


In [26]:
# Write the edge list without the DataFrame index.
edges_df.to_csv(path_or_buf='edges.csv', index=False)