In [1]:
import pandas as pd
import json
from itertools import combinations
from collections import Counter

In [2]:
# Load the publication records from CSV into a DataFrame.
# NOTE(review): csv_path is an absolute, user-specific location, so this cell only
# runs on a machine where that exact file exists — consider a configurable path.
csv_path = '/Users/mythri_gowda/Desktop/Data_Visualization_Assignment3/data_scopus.csv'
data = pd.read_csv(csv_path)
# Last expression of the cell: preview the first rows of the raw frame.
data.head()

Unnamed: 0,Title,Year,EID,Abstract,Publisher,Conference name,Conference date,Authors,Author(s) ID,Authors with affiliations,Source title,Abbreviated Source Title,Cited by
0,Virtual reality applications for the built env...,2020,2-s2.0-85086464158,With its advanced capabilities of immersive an...,Elsevier B.V.,,,"Zhang Y., Liu H., Kang S.-C., Al-Hussein M.",57202536919;53984550800;57215426514;6603541102;,"Zhang, Y., Department of Civil and Environment...",Automation in Construction,Autom Constr,
1,"Self-tracking while doing sport: Comfort, moti...",2020,2-s2.0-85082875828,The spread of wearable technologies is paving ...,Academic Press,,,"Rapp A., Tirabeni L.",23398572100;57191836654;,"Rapp, A., Computer Science Department, Univers...",International Journal of Human Computer Studies,Int J Hum Comput Stud,
2,"Bridge damage: Detection, IFC-based semantic e...",2020,2-s2.0-85078194587,Building Information Modeling (BIM) representa...,Elsevier B.V.,,,"Isailović D., Stojanovic V., Trapp M., Richter...",57205293410;56421700900;24831175200;3619515940...,"Isailović, D., Department for Construction Pro...",Automation in Construction,Autom Constr,
3,VR system for spatio-temporal visualization of...,2019,2-s2.0-85075706132,Social media analysis is helpful to understand...,Springer,,,"Okada K., Yoshida M., Itoh T., Czauderna T., S...",57201502480;57171240600;35373203100;2592569550...,"Okada, K., Ochanomizu University, Tokyo, Japan...",Multimedia Tools and Applications,Multimedia Tools Appl,
4,DiseaSE: A biomedical text analytics system fo...,2019,2-s2.0-85074886243,Due to increasing volume and unstructured natu...,Academic Press Inc.,,,"Abulaish M., Parwez M.A., Jahiruddin",6505934038;57202719622;35590173900;,"Abulaish, M., Department of Computer Science, ...",Journal of Biomedical Informatics,J. Biomed. Informatics,


In [3]:
# Keep only publications that have an author list, author affiliations and a year;
# rows missing any of these three values are discarded.
required_columns = ['Authors', 'Authors with affiliations', 'Year']
data_cleaned = data.dropna(subset=required_columns)

# Quick sanity check: preview the first rows of the filtered frame
data_cleaned.head()

Unnamed: 0,Title,Year,EID,Abstract,Publisher,Conference name,Conference date,Authors,Author(s) ID,Authors with affiliations,Source title,Abbreviated Source Title,Cited by
0,Virtual reality applications for the built env...,2020,2-s2.0-85086464158,With its advanced capabilities of immersive an...,Elsevier B.V.,,,"Zhang Y., Liu H., Kang S.-C., Al-Hussein M.",57202536919;53984550800;57215426514;6603541102;,"Zhang, Y., Department of Civil and Environment...",Automation in Construction,Autom Constr,
1,"Self-tracking while doing sport: Comfort, moti...",2020,2-s2.0-85082875828,The spread of wearable technologies is paving ...,Academic Press,,,"Rapp A., Tirabeni L.",23398572100;57191836654;,"Rapp, A., Computer Science Department, Univers...",International Journal of Human Computer Studies,Int J Hum Comput Stud,
2,"Bridge damage: Detection, IFC-based semantic e...",2020,2-s2.0-85078194587,Building Information Modeling (BIM) representa...,Elsevier B.V.,,,"Isailović D., Stojanovic V., Trapp M., Richter...",57205293410;56421700900;24831175200;3619515940...,"Isailović, D., Department for Construction Pro...",Automation in Construction,Autom Constr,
3,VR system for spatio-temporal visualization of...,2019,2-s2.0-85075706132,Social media analysis is helpful to understand...,Springer,,,"Okada K., Yoshida M., Itoh T., Czauderna T., S...",57201502480;57171240600;35373203100;2592569550...,"Okada, K., Ochanomizu University, Tokyo, Japan...",Multimedia Tools and Applications,Multimedia Tools Appl,
4,DiseaSE: A biomedical text analytics system fo...,2019,2-s2.0-85074886243,Due to increasing volume and unstructured natu...,Academic Press Inc.,,,"Abulaish M., Parwez M.A., Jahiruddin",6505934038;57202719622;35590173900;,"Abulaish, M., Department of Computer Science, ...",Journal of Biomedical Informatics,J. Biomed. Informatics,


In [4]:
# Canonical form used whenever author names are compared or used as keys
def normalize_id(identifier):
    """Return ``identifier`` without surrounding whitespace and in lower case.

    Args:
        identifier: The string to normalize (e.g. an author name).

    Returns:
        The stripped, lower-cased string.
    """
    trimmed = identifier.strip()
    return trimmed.lower()

# Per-author lookup tables, keyed by the normalized author name and filled in
# while the publication rows are scanned.
author_country_map = dict()      # normalized author name -> country
author_affiliation_map = dict()  # normalized author name -> full affiliation string

In [5]:
# Tally how often each country appears across all author/affiliation pairs
country_counts = Counter()

author_column = data_cleaned['Authors']
affiliation_column = data_cleaned['Authors with affiliations']

for authors_cell, affiliations_cell in zip(author_column, affiliation_column):
    # A missing cell contributes no authors / no affiliations
    author_names = authors_cell.split(', ') if pd.notna(authors_cell) else []
    affiliation_entries = affiliations_cell.split('; ') if pd.notna(affiliations_cell) else []

    # Pair the i-th author with the i-th affiliation; zip stops at the shorter list
    for author_name, affiliation_entry in zip(author_names, affiliation_entries):
        # The country is taken to be the text after the last comma of the entry
        if ',' in affiliation_entry:
            entry_country = affiliation_entry.split(',')[-1].strip()
        else:
            entry_country = "Unknown"

        author_key = normalize_id(author_name)

        # A later occurrence of the same author overwrites the earlier values
        author_country_map[author_key] = entry_country
        author_affiliation_map[author_key] = affiliation_entry  # full affiliation text

        # Every author/affiliation pair counts once towards its country
        country_counts[entry_country] += 1

In [6]:
# Names of the ten most frequently counted countries
top_countries = {name for name, _count in country_counts.most_common(10)}

In [7]:
# Build one node per author; any country outside the top 10 is labelled "Other"
nodes = []
for author_name, author_country in author_country_map.items():
    if author_country in top_countries:
        country_label = author_country
    else:
        country_label = "Other"
    nodes.append({
        "name": author_name,
        "country": country_label,
        "affiliation": author_affiliation_map[author_name],
    })

# Names of all nodes, used to validate link endpoints
node_ids = set(entry["name"] for entry in nodes)

In [None]:
# Generating links based on shared publications: one link per pair of distinct
# co-authors of a paper, restricted to authors that exist as nodes.
links = []
for authors in data_cleaned['Authors'].dropna():
    # normalize_id strips the blank after each comma, so splitting on ',' yields
    # the same names as the ', ' split used when the nodes were built.
    normalized_names = (normalize_id(name) for name in authors.split(','))
    # Order-preserving de-duplication within one paper: combinations() pairs
    # elements by position, so a name listed twice would otherwise produce a
    # self-link (source == target) and repeated pairs for that paper.
    # Filtering on node_ids here keeps exactly the pairs whose two ends are nodes.
    author_list = [name for name in dict.fromkeys(normalized_names) if name in node_ids]
    for source, target in combinations(author_list, 2):
        links.append({"source": source, "target": target})

In [9]:
# Combine nodes and links into the single node-link structure that gets exported
output_data = dict(nodes=nodes, links=links)

In [10]:
# Write the graph structure to disk as JSON indented by four spaces
output_json_path = '/Users/mythri_gowda/Desktop/Data_Visualization_Assignment3/author_network_data.json'
with open(output_json_path, mode='w') as json_file:
    json.dump(output_data, json_file, indent=4)

# Report where the file was written
print(f"\nData saved with nodes and links: {output_json_path}")


Data saved with nodes and links: /Users/mythri_gowda/Desktop/Data_Visualization_Assignment3/author_network_data.json
