<a href="https://colab.research.google.com/github/Ifeanyi55/OpenAlex4Gephi/blob/main/OpenAlex4Gephi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Introduction to OpenAlex4Gephi**

OpenAlex4Gephi lets you fetch network data from openalex.org, via its API, in a format that Gephi can import. The network is a bimodal (two-mode) network of **authors to publications**. The node and edge data are collected and stored as separate CSV files, which you can download to your local machine and then import into Gephi for visualization and analysis.

## **Install Package**

In [None]:
# Setup: install the openalexR package.
# Run this cell with the play button, or press CTRL + Enter.
install.packages(pkgs = "openalexR")

## **Keywords**
If you enter more than one keyword in the search parameter below, separate the keywords with commas.

## **Dates**
Date inputs must always be in the **yyyy-mm-dd** format.

## **Run Code**
To run the code after entering your search parameters, press the play button on the left of the code cell.


## **Enter Search Parameters & Run Cell to Generate Network Data (Nodes & Edges CSV Files)**

In [None]:
suppressWarnings(library(openalexR))

# Build the Gephi nodes table of an author-to-publication network.
#
# Fetches works from OpenAlex (title.search = keywords, more than 50
# citations, publication date between pub_start_date and pub_end_date,
# sorted by citation count descending) and lists every distinct author
# name followed by every distinct publication title as a node.
#
# Args:
#   keywords:       character vector of title search terms.
#   pub_start_date: earliest publication date, "yyyy-mm-dd".
#   pub_end_date:   latest publication date, "yyyy-mm-dd".
#
# Returns:
#   A data frame with columns Id (1..n), Nodes and Label (both hold the
#   node name). It has zero rows when the search finds no works or no
#   work has author data.
authorPubNodes <- function(keywords, pub_start_date, pub_end_date) {

  # create search engine function
  search_engine <- function(keywords, pub_start_date, pub_end_date) {
    suppressPackageStartupMessages(library(openalexR))
    # NOTE(review): nothing below calls a tidyverse function; the attach is
    # kept only so the session's search path stays as it was before.
    suppressPackageStartupMessages(library(tidyverse))

    options(openalexR.mailto = "idiayeifeanyi@yahoo.com",
            openalexR.message = "suppressed")

    oa_fetch(
      entity = "works",
      title.search = c(keywords),
      cited_by_count = ">50",
      from_publication_date = pub_start_date,
      to_publication_date = pub_end_date,
      options = list(sort = "cited_by_count:desc"),
      verbose = FALSE
    )
  }

  search_data <- search_engine(keywords, pub_start_date, pub_end_date)

  empty_nodes <- data.frame(
    Id = integer(0),
    Nodes = character(0),
    Label = character(0)
  )

  # nothing matched (NULL or zero rows): return an empty table instead of
  # looping over 1:0, which would index rows 1 and 0 and fail
  if (NROW(search_data) == 0L) {
    return(empty_nodes)
  }

  author_tables <- search_data[["author"]]
  if (is.null(author_tables)) {
    stop("search results have no `author` column", call. = FALSE)
  }

  # one character vector of author names per work; the names are read from
  # the second column of each per-work author data frame. A work whose
  # entry is not a data frame (no author data) contributes no names, which
  # keeps authors and publications aligned.
  author_names <- lapply(author_tables, function(work_authors) {
    if (is.data.frame(work_authors)) work_authors[[2]] else character(0)
  })

  all_authors <- unlist(author_names, use.names = FALSE)

  # repeat each title once per author of that work
  all_publications <- rep(search_data$display_name,
                          times = lengths(author_names))

  # distinct authors first, then distinct publications
  node_names <- c(unique(all_authors), unique(all_publications))
  if (length(node_names) == 0L) {
    return(empty_nodes)
  }

  data.frame(
    Id = seq_along(node_names),
    Nodes = node_names,
    Label = node_names
  )
}


# Build the Gephi edges table of an author-to-publication network.
#
# Fetches works from OpenAlex (title.search = keywords, more than 50
# citations, publication date between pub_start_date and pub_end_date,
# sorted by citation count descending) and emits one directed edge per
# (author, publication) pair. Edge endpoints are the node ids assigned by
# authorPubNodes() for the same search parameters.
#
# Args:
#   keywords:       character vector of title search terms.
#   pub_start_date: earliest publication date, "yyyy-mm-dd".
#   pub_end_date:   latest publication date, "yyyy-mm-dd".
#
# Returns:
#   A data frame with columns Source (author node id), Target (publication
#   node id), Type ("directed") and Weight (1). It has zero rows when the
#   search finds no works or no work has author data.
authorPubEdges <- function(keywords, pub_start_date, pub_end_date) {

  # create search engine function
  search_engine <- function(keywords, pub_start_date, pub_end_date) {
    suppressPackageStartupMessages(library(openalexR))
    # NOTE(review): nothing below calls a tidyverse function; the attach is
    # kept only so the session's search path stays as it was before.
    suppressPackageStartupMessages(library(tidyverse))

    options(openalexR.mailto = "idiayeifeanyi@yahoo.com")

    oa_fetch(
      entity = "works",
      title.search = c(keywords),
      cited_by_count = ">50",
      from_publication_date = pub_start_date,
      to_publication_date = pub_end_date,
      options = list(sort = "cited_by_count:desc"),
      verbose = FALSE
    )
  }

  # run search engine
  search_data <- search_engine(keywords, pub_start_date, pub_end_date)

  empty_edges <- data.frame(
    Source = integer(0),
    Target = integer(0),
    Type = character(0),
    Weight = numeric(0)
  )

  # nothing matched (NULL or zero rows): return an empty table instead of
  # looping over 1:0, which would index rows 1 and 0 and fail
  if (NROW(search_data) == 0L) {
    return(empty_edges)
  }

  author_tables <- search_data[["author"]]
  if (is.null(author_tables)) {
    stop("search results have no `author` column", call. = FALSE)
  }

  # one character vector of author names per work; the names are read from
  # the second column of each per-work author data frame. A work whose
  # entry is not a data frame (no author data) contributes no edges, which
  # keeps sources and targets aligned.
  author_names <- lapply(author_tables, function(work_authors) {
    if (is.data.frame(work_authors)) work_authors[[2]] else character(0)
  })

  edge_sources <- unlist(author_names, use.names = FALSE)

  # repeat each title once per author of that work
  edge_targets <- rep(search_data$display_name,
                      times = lengths(author_names))

  if (length(edge_sources) == 0L) {
    return(empty_edges)
  }

  # node ids come from the nodes table built for these same parameters,
  # not from a `nodes_pub` variable that may or may not exist in the
  # caller's workspace
  author_nodes <- authorPubNodes(keywords, pub_start_date, pub_end_date)

  # translate names to ids; match() returns the first node with that name
  # and NA when the name is absent from the nodes table
  data.frame(
    Source = author_nodes$Id[match(edge_sources, author_nodes$Nodes)],
    Target = author_nodes$Id[match(edge_targets, author_nodes$Nodes)],
    Type = "directed",
    Weight = 1.0
  )
}


Keywords <- "" # @param {type:"string"}
Pub_start_date <- "2024-02-01" # @param {type:"date"}
Pub_end_date <- "2024-02-01" # @param {type:"date"}

# split the comma-separated keywords into a vector; trim the spaces typed
# around commas ("a, b") and drop empty entries ("a,,b") so they are not
# sent to the search as terms
Keywords <- trimws(unlist(strsplit(Keywords, split = ",", fixed = TRUE)))
Keywords <- Keywords[nzchar(Keywords)]

# Run this code cell
nodes_pub <- authorPubNodes(keywords = Keywords,
                            pub_start_date = Pub_start_date,
                            pub_end_date = Pub_end_date)

edges_pub <- authorPubEdges(keywords = Keywords,
                            pub_start_date = Pub_start_date,
                            pub_end_date = Pub_end_date)


# export nodes and edges csv files (no row-name column)
write.csv(nodes_pub, file = "Nodes.csv", row.names = FALSE)
write.csv(edges_pub, file = "Edges.csv", row.names = FALSE)



To download the files, click the **Files** folder icon in the sidebar on the left of the notebook. There, you will see the generated csv files. Hover over each one and click on the 3 dots to download the file.