On internationality in mathematical oncology

In [None]:
!pip install rpy2
%load_ext rpy2.ipython

In [None]:
# Load required libraries
library(countrycode)
library(stringr)
library(dplyr)
library(tidyr)

# Custom aliases: canonical country name -> spellings that should be folded
# into it. normalize_country() compares against these case-insensitively.
# NOTE(review): short aliases such as "US" are matched case-insensitively by
# the extraction pattern below, so the ordinary word "us" also counts as a hit.
custom_country_aliases <- list(
  "United States" = c("United States", "USA", "US", "America", "United States of America", "U.S.A"),
  "United Kingdom" = c("United Kingdom", "UK", "U.K.", "Britain", "Great Britain", "England", "Scotland", "Wales", "Northern Ireland"),
  "Czechia" = c("Czechia", "Czech Republic"),
  "United Arab Emirates" = c("United Arab Emirates", "U Arab Emirates")
)

# English country names shipped with countrycode (NA entries dropped)
standard_countries <- countrycode::codelist$country.name.en
standard_countries <- standard_countries[!is.na(standard_countries)]

# Combine standard names and aliases into one de-duplicated vocabulary
all_country_variants <- unlist(custom_country_aliases, use.names = FALSE)
all_country_names <- unique(c(standard_countries, all_country_variants))

# Longest names first: regex alternation takes the first alternative that
# matches at a position, so a short name listed before a longer name sharing
# its prefix (e.g. "Guinea" before "Guinea-Bissau") would otherwise win.
all_country_names <- all_country_names[
  order(-nchar(all_country_names), all_country_names)
]

# Escape special regex characters so every name is matched literally
escaped_country_names <- stringr::str_replace_all(all_country_names, "([\\^$.|?*+(){}])", "\\\\\\1")

# Delimit with lookarounds instead of \b. For names that start and end with a
# word character this is equivalent to \b, but it also lets names ending in
# punctuation ("U.K.", names ending in ")") match when followed by a space,
# punctuation or the end of the text.
pattern <- paste0(
  "(?<!\\w)(",
  paste(escaped_country_names, collapse = "|"),
  ")(?!\\w)"
)

# Function to normalize matched country names
normalize_country <- function(match) {
  # Map one matched country string to a canonical country name.
  #
  # match: a single matched string (surrounding whitespace is ignored).
  # Returns the name of the first entry in custom_country_aliases whose
  # variants contain the string (compared case-insensitively); otherwise the
  # result of countrycode()'s "country.name" -> "country.name" conversion.
  cleaned <- str_to_title(trimws(match))
  needle <- tolower(cleaned)

  # Alias groups are checked in declaration order; the first hit wins
  is_alias_of <- function(canonical) {
    needle %in% tolower(custom_country_aliases[[canonical]])
  }
  alias_hit <- Find(is_alias_of, names(custom_country_aliases))
  if (!is.null(alias_hit)) {
    return(alias_hit)
  }

  # No custom alias applies: defer to countrycode's own name matching
  countrycode(cleaned, origin = "country.name", destination = "country.name")
}

# Define input files
input_files <- c(
  "OneDrive - Uppsala universitet/Aim 1/Results/International/ALL_math_onco_high_citations_have_countries.txt", 
  "OneDrive - Uppsala universitet/Aim 1/Results/International/All_math_bio_without_onco_high_citations_have_countries.txt"
)

# Return every line of a tagged field: the line(s) starting with "<tag> " plus
# any continuation lines directly below them (lines that begin with
# whitespace). Tagged exports of this kind wrap multi-valued fields such as C1
# (one address per line) onto continuation lines, so matching only "^C1 "
# would see the first address alone.
extract_field_lines <- function(article_lines, tag) {
  tag_rows <- grep(paste0("^", tag, " "), article_lines)
  if (length(tag_rows) == 0) {
    return(character(0))
  }
  # Every non-continuation line opens a new field; continuation lines inherit
  # the id of the field they follow.
  field_id <- cumsum(!grepl("^\\s", article_lines))
  article_lines[field_id %in% field_id[tag_rows]]
}

# Write a list of articles (each a character vector of lines) to one text
# file. An empty list produces an empty file.
write_articles <- function(articles, path) {
  collapsed <- lapply(articles, function(x) paste(x, collapse = "\n"))
  writeLines(as.character(unlist(collapsed)), path)
}

# Process each file
for (file in input_files) {
  # Determine file suffix from the file name
  output_suffix <- if (grepl("math_onco", file)) {
    "math_onco"
  } else if (grepl("math_bio_without_onco", file)) {
    "math_bio_without_onco"
  } else {
    "unknown"
  }

  # Read lines
  lines <- readLines(file)

  # Find article start and end indices
  article_indices <- grep("^PT ", lines)
  end_indices <- grep("^ER", lines)

  if (length(article_indices) != length(end_indices)) {
    stop("Mismatch in number of article starts and ends in file: ", file)
  }
  # Equal counts are not enough: each start must also precede its own end,
  # otherwise the slices below would run backwards through the file.
  if (any(end_indices < article_indices)) {
    stop("Article start/end markers are out of order in file: ", file)
  }

  # Per-article storage, preallocated. n_countries stays NA for articles
  # without a C1 field.
  n_articles <- length(article_indices)
  articles <- vector("list", n_articles)
  years <- rep(NA_integer_, n_articles)
  n_countries <- rep(NA_integer_, n_articles)

  # Process articles
  for (i in seq_len(n_articles)) {
    article_lines <- lines[article_indices[i]:end_indices[i]]
    articles[[i]] <- article_lines

    # Extract year (first PY line only, so one article is always one record)
    year_line <- grep("^PY ", article_lines, value = TRUE)
    if (length(year_line) > 0) {
      years[i] <- as.integer(str_extract(year_line[1], "\\d{4}"))
    }

    # Extract the full C1 field, continuation lines included
    c1_lines <- extract_field_lines(article_lines, "C1")
    if (length(c1_lines) == 0) {
      next
    }

    # Collapse text, match countries and count the distinct canonical names
    c1_text <- paste(c1_lines, collapse = " ")
    matches <- str_extract_all(c1_text, regex(pattern, ignore_case = TRUE))[[1]]
    normalized <- vapply(
      matches,
      function(m) as.character(normalize_country(m)),
      character(1),
      USE.NAMES = FALSE
    )
    n_countries[i] <- length(unique(normalized[!is.na(normalized)]))
  }

  # Partition the articles by outcome
  has_c1 <- !is.na(n_countries)
  counted <- which(has_c1 & n_countries > 0)

  missing_c1_articles <- articles[!has_c1]
  missing_c1_count <- length(missing_c1_articles)
  zero_country_articles <- articles[which(has_c1 & n_countries == 0)]

  # Articles grouped by country count; list names are the counts as strings
  country_count_articles <- split(articles[counted], n_countries[counted])

  # One row per counted article. This is built directly, so a file with no
  # counted articles still yields the expected (empty) columns.
  all_data <- data.frame(
    Year = years[counted],
    UniqueCountries = n_countries[counted],
    stringsAsFactors = FALSE
  )

  summary <- all_data %>%
    group_by(Year, UniqueCountries) %>%
    summarise(NumberOfArticles = n(), .groups = "drop") %>%
    arrange(Year, UniqueCountries)

  # Define output directory (created if it does not exist yet)
  output_dir <- "OneDrive - Uppsala universitet/Aim 1/Results/International/Higher than average citations/"
  dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

  # Write summary CSV
  # NOTE(review): the plotting cell reads
  # "country_summary_by_year_<suffix>_high_citations.csv" from this directory,
  # while this writes "country_summary_by_year_<suffix>.csv" - confirm whether
  # the files are renamed by hand in between.
  write.csv(summary, file.path(output_dir, paste0("country_summary_by_year_", output_suffix, ".csv")), row.names = FALSE)

  # Write articles with missing C1
  write_articles(
    missing_c1_articles,
    file.path(output_dir, paste0(output_suffix, "_missing_c1_articles.txt"))
  )

  # Write articles with 0 countries
  write_articles(
    zero_country_articles,
    file.path(output_dir, paste0(output_suffix, "_articles_0_countries.txt"))
  )

  # Write articles grouped by country count
  for (country_count in names(country_count_articles)) {
    suffix <- if (country_count == "1") "country" else "countries"
    file_path <- file.path(output_dir, paste0(output_suffix, "_articles_", country_count, "_", suffix, ".txt"))
    write_articles(country_count_articles[[country_count]], file_path)
  }

  # Console output
  cat("✅ Done for file:", file, "\n")
  cat("Articles processed:", length(article_indices), "\n")
  cat("Articles missing C1 field:", missing_c1_count, "\n")
  cat("Articles with 0 recognized countries:", length(zero_country_articles), "\n")
  cat("Articles grouped by number of countries:\n")
  for (country_count in sort(as.integer(names(country_count_articles)))) {
    cat("  ", country_count, "country(ies):", length(country_count_articles[[as.character(country_count)]]), "\n")
  }
}


In [None]:
# ---------- Load required packages ----------
library(dplyr)
library(scales)

# ---------- Function to process and expand data ----------
process_data <- function(filepath) {
  # Read a per-year summary CSV and expand it to one row per article.
  #
  # filepath: path to a CSV with columns Year, UniqueCountries and
  #   NumberOfArticles.
  # Returns a data frame with columns Year (a factor over the years present,
  # in ascending order), UniqueCountries and Year_num (the position of Year
  # within its levels). Each input row is repeated NumberOfArticles times.
  # Stops with an error when a required column is missing.
  dt <- read.csv(filepath)

  required_cols <- c("Year", "UniqueCountries", "NumberOfArticles")
  missing_cols <- setdiff(required_cols, names(dt))
  if (length(missing_cols) > 0) {
    stop(
      "Missing required column(s) in ", filepath, ": ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # seq_len() rather than 1:nrow(dt): a header-only CSV has zero rows, and
  # 1:0 would yield c(1, 0) and make rep() fail.
  row_idx <- rep(seq_len(nrow(dt)), times = dt$NumberOfArticles)
  expanded <- dt[row_idx, c("Year", "UniqueCountries"), drop = FALSE]

  # Make Year a factor, but only over the years present
  expanded$Year <- factor(expanded$Year, levels = sort(unique(expanded$Year)))
  expanded$Year_num <- as.numeric(expanded$Year)
  expanded
}

# ---------- Function to extract and count outliers ----------
get_outlier_counts <- function(expanded_data) {
  # Count boxplot outliers per (Year, UniqueCountries) pair.
  #
  # expanded_data: data frame with one row per article and columns Year and
  #   UniqueCountries.
  # Returns one row per distinct outlying (Year, UniqueCountries) pair with:
  #   count      - number of articles at that point
  #   Year_num   - position of Year on the year axis
  #   cex_scaled - count rescaled to the range [0.5, 2]
  # A value is an outlier when boxplot.stats() flags it within its own year.

  # Respect factor levels the caller has already set on Year (for example
  # levels aligned to another dataset). Recomputing them from the years
  # present would shift Year_num whenever a level has no rows.
  year_levels <- if (is.factor(expanded_data$Year)) {
    levels(expanded_data$Year)
  } else {
    sort(unique(expanded_data$Year))
  }

  outliers <- expanded_data %>%
    group_by(Year) %>%
    mutate(is_outlier = UniqueCountries %in% boxplot.stats(UniqueCountries)$out) %>%
    ungroup() %>%
    filter(is_outlier)

  outlier_counts <- outliers %>%
    group_by(Year, UniqueCountries) %>%
    summarise(count = n(), .groups = "drop") %>%
    mutate(
      Year_num   = as.numeric(factor(Year, levels = year_levels)),
      # Nothing to rescale without outliers; skipping the call avoids the
      # warnings an empty range produces.
      cex_scaled = if (length(count) > 0) {
        rescale(count, to = c(0.5, 2))
      } else {
        numeric(0)
      }
    )
  outlier_counts
}

# ---------- Function to draw a single boxplot with overlayed means ----------
plot_box_with_means <- function(data_all, data_high, outliers_all, outliers_high, title, y_max = NULL) {
  # Draw one boxplot panel of UniqueCountries by Year with outliers and means.
  #
  # data_all:      one row per article (columns Year, UniqueCountries); drives
  #                the boxes and the turquoise star means.
  # data_high:     same layout for the high-citation subset, or NULL to skip
  #                the red cross means.
  # outliers_all:  outlier table with columns Year_num, UniqueCountries and
  #                cex_scaled, drawn as black dots.
  # outliers_high: accepted for interface symmetry; currently not drawn.
  # title:         panel title.
  # y_max:         upper end of the y axis; defaults to the largest
  #                UniqueCountries in data_all.
  # Draws on the current graphics device and returns NULL invisibly.

  # If y_max is not provided, compute it from the all-article data
  if (is.null(y_max)) {
    y_max <- max(data_all$UniqueCountries, na.rm = TRUE)
  }
  y_ticks <- seq(0, y_max, by = 1)

  # Box k on the x axis corresponds to the k-th level of Year in data_all
  year_levels <- levels(as.factor(data_all$Year))

  # Draw the boxplot of the all-article data. Outliers are suppressed here
  # and drawn separately below, sized by how many articles share each point.
  boxplot(UniqueCountries ~ Year,
          data    = data_all,
          main    = title,
          xlab    = "Year",
          ylab    = "Number of Countries",
          col     = "lightgray",
          border  = "black",
          las     = 2,
          ylim    = c(0, y_max + 0.5),
          yaxt    = "n",
          outline = FALSE)

  # Light horizontal grid lines and integer y ticks
  abline(h = y_ticks, col = "lightgray", lty = "dotted")
  axis(2, at = y_ticks, las = 1)

  # Outliers for ALL articles (black dots)
  if (NROW(outliers_all) > 0) {
    points(outliers_all$Year_num, outliers_all$UniqueCountries,
           pch = 20, col = "black", cex = outliers_all$cex_scaled)
  }

  # Turquoise stars: per-year mean of ALL articles
  means_all <- tapply(data_all$UniqueCountries, data_all$Year, mean, na.rm = TRUE)
  has_mean_all <- as.vector(!is.na(means_all))
  if (any(has_mean_all)) {
    points(which(has_mean_all), as.vector(means_all)[has_mean_all],
           pch = 8, col = "turquoise1", cex = 1)
  }

  # Red crosses: per-year mean of HIGH-CITATION articles
  if (!is.null(data_high)) {
    means_high <- tapply(data_high$UniqueCountries, data_high$Year, mean, na.rm = TRUE)
    # Place each mean on the box of the same year by name rather than by its
    # index within data_high, so the crosses stay aligned even when data_high
    # covers fewer years or was not re-levelled by the caller. Years with no
    # box in data_all are skipped.
    x_high <- match(names(means_high), year_levels)
    drawable <- as.vector(!is.na(means_high)) & !is.na(x_high)
    if (any(drawable)) {
      points(x_high[drawable], as.vector(means_high)[drawable],
             pch = 4, col = "red", cex = 1.5)
    }
  }

  # (Optional) Highlight high-citation outliers:
  # points(outliers_high$Year_num, outliers_high$UniqueCountries,
  #        pch = 20, col = "red", cex = outliers_high$cex_scaled)
  invisible(NULL)
}

# ---------- Locations of the summary CSVs and of the output figure ----------
results_dir <- "~/OneDrive - Uppsala universitet/Aim 1/Results/International"
high_dir    <- file.path(results_dir, "Higher than average citations")

# Re-level the Year factor of a high-citation dataset so that it shares the
# levels (and therefore the x positions) of the matching all-article dataset.
# Years absent from `all_data` become NA.
align_year_levels <- function(high_data, all_data) {
  high_data$Year <- factor(high_data$Year, levels = levels(all_data$Year))
  high_data$Year_num <- as.numeric(high_data$Year)
  high_data
}

# ---------- Load & process the all-article datasets ----------
math_onco_all <- process_data(file.path(results_dir, "country_summary_by_year_math_onco.csv"))
math_bio_all  <- process_data(file.path(results_dir, "country_summary_by_year_math_bio_without_onco.csv"))

outliers_onco_all <- get_outlier_counts(math_onco_all)
outliers_bio_all  <- get_outlier_counts(math_bio_all)

# ---------- Load & process the HIGH-CITATION datasets (same filenames + "_high_citations") ----------
math_onco_high <- align_year_levels(
  process_data(file.path(high_dir, "country_summary_by_year_math_onco_high_citations.csv")),
  math_onco_all
)
math_bio_high <- align_year_levels(
  process_data(file.path(high_dir, "country_summary_by_year_math_bio_without_onco_high_citations.csv")),
  math_bio_all
)

outliers_onco_high <- get_outlier_counts(math_onco_high)
outliers_bio_high  <- get_outlier_counts(math_bio_high)

# ---------- Determine a common y-axis max so both panels share one scale ----------
y_max_onco <- max(math_onco_all$UniqueCountries, math_onco_high$UniqueCountries, na.rm = TRUE)
y_max_bio  <- max(math_bio_all$UniqueCountries,  math_bio_high$UniqueCountries,  na.rm = TRUE)
y_max      <- max(y_max_onco, y_max_bio)

# ---------- Save the combined plot to a PNG ----------
# NOTE(review): "COUNTIES" in the file name looks like a typo for "COUNTRIES";
# kept as is because other material may refer to this exact path.
png(file.path(results_dir, "BOXPLOTS_NUMBER_COUNTIES_withMeans.png"),
    width = 2400, height = 800, res = 150)

# Close the PNG device even if one of the panels fails; otherwise the device
# stays open and later plots would keep going into this file.
invisible(tryCatch(
  {
    # ---------- Set up side-by-side plotting region ----------
    par(mfrow = c(1, 2), mar = c(6, 4, 4, 2))

    # ---------- Draw the "Mathematical Oncology" boxplot with means ----------
    plot_box_with_means(
      data_all      = math_onco_all,
      data_high     = math_onco_high,
      outliers_all  = outliers_onco_all,
      outliers_high = outliers_onco_high,
      title         = "Mathematical Oncology",
      y_max         = y_max
    )

    # ---------- Draw the "Mathematical Biology excluding Onco" boxplot with means ----------
    plot_box_with_means(
      data_all      = math_bio_all,
      data_high     = math_bio_high,
      outliers_all  = outliers_bio_all,
      outliers_high = outliers_bio_high,
      title         = "Mathematical Biology excluding Mathematical Oncology",
      y_max         = y_max
    )
  },
  finally = dev.off()
))
