On collaboration in mathematical oncology

In [None]:
!pip install rpy2
%load_ext rpy2.ipython

In [None]:
import csv
from collections import defaultdict
import os

# Input export -> output CSV summary, expanded into (input, output) pairs
# in the order they should be processed.
files_to_process = list({
    "ALL_math_onco.txt": "math_onco_number_authors_with_mean.csv",
    "ALL_math_bio_without_onco.txt": "math_bio_without_onco_number_authors_with_mean.csv",
}.items())

def process_file(input_file, output_file):
    """Tabulate authors-per-article by publication year and split records by author count.

    The input is a field-tagged bibliographic export in which each record
    starts with a ``PT `` line and ends with an ``ER`` line, with ``AU`` holding
    the authors and ``PY`` the publication year (this looks like a Web of
    Science plain-text export -- confirm against the data source).

    Args:
        input_file: Path of the tagged text file to read (UTF-8).
        output_file: Path of the CSV to write. It has one row per year with
            columns ``Year``, ``author_1`` .. ``author_<max>`` (number of
            articles with that many authors) and ``mean_authors``.

    Side effects:
        Besides the CSV, creates ``<input basename>_author_groups/`` in the
        current working directory and writes one text file per author count
        containing the full text of the matching records.

    Notes:
        * Authors are counted with the heuristic "two whitespace-separated
          words per author". Indented continuation lines directly after the
          ``AU`` line are now included (one heuristic count per line), so
          exports that list one author per line are no longer reported as
          single-author articles.
          NOTE(review): names with more than two words (e.g. "van der Berg, J")
          are still miscounted by this heuristic.
        * Records without a ``PY`` line are skipped. Records whose author count
          is 0 are stored in the group files but do not appear in the CSV
          columns or the mean.
    """
    print(f"🔍 Processing {input_file}...")
    year_author_counts = defaultdict(lambda: defaultdict(int))  # year -> author count -> articles
    max_author_count = 0
    author_blocks = defaultdict(list)  # author count -> full record texts (PT ... ER)

    with open(input_file, 'r', encoding='utf-8') as file:
        current_entry = None
        entry_lines = []  # every line of the record currently being read
        in_author_field = False  # True while reading continuation lines of AU

        for line in file:
            if line.startswith("PT "):  # start of a new record
                current_entry = {"PY": None, "AU_count": 0}
                entry_lines = [line]
                in_author_field = False
                continue
            if current_entry is None:  # text outside any record (header, blank lines)
                continue

            entry_lines.append(line)

            # Indented, non-blank lines right after "AU " continue the author list.
            if in_author_field and line[:1].isspace() and line.strip():
                current_entry["AU_count"] += len(line.split()) // 2
                continue
            in_author_field = False

            if line.startswith("PY "):
                current_entry["PY"] = line[3:].strip()
            elif line.startswith("AU "):
                # split() already collapses irregular spacing between names
                current_entry["AU_count"] = len(line[3:].split()) // 2
                in_author_field = True
            elif line.startswith("ER"):  # end of record
                if current_entry["PY"] is not None:
                    year = current_entry["PY"]
                    author_count = current_entry["AU_count"]
                    year_author_counts[year][author_count] += 1
                    max_author_count = max(max_author_count, author_count)
                    author_blocks[author_count].append("".join(entry_lines))
                current_entry = None
                entry_lines = []

    # Create CSV summary; columns are sized to the largest author count seen
    fieldnames = ["Year"] + [f"author_{i}" for i in range(1, max_author_count + 1)] + ["mean_authors"]

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for year, author_counts in sorted(year_author_counts.items()):
            row = {"Year": year}
            total_articles = 0
            total_authors = 0

            for i in range(1, max_author_count + 1):
                count = author_counts.get(i, 0)
                row[f"author_{i}"] = count
                total_articles += count
                total_authors += i * count

            row["mean_authors"] = round(total_authors / total_articles, 2) if total_articles > 0 else 0
            writer.writerow(row)

    print(f"✅ Results written to {output_file}")

    # Write author count-specific .txt files
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_dir = f"{base_name}_author_groups"
    os.makedirs(output_dir, exist_ok=True)

    for count, entries in author_blocks.items():
        label = f"{count}_author.txt" if count == 1 else f"{count}_authors.txt"
        out_path = os.path.join(output_dir, label)
        with open(out_path, 'w', encoding='utf-8') as out_file:
            for block in entries:
                out_file.write(block + "\n")
        print(f"📄 Saved {len(entries)} entries to {out_path}")

# Build the per-year author summaries (and author-group files) for every configured export
for input_file, output_file in files_to_process:
    process_file(input_file=input_file, output_file=output_file)

print("\n🎉 All files processed with full author count distribution, mean counts, and separate author group files!")


In [None]:
import re
from collections import defaultdict

# Function to process one input file into high and low citation files
def process_articles_by_year(
    input_file,  # input file
    high_citations_file,  # output file of higher than average cited articles
    low_citations_file  # output file of lower than average cited articles
):
    """Split the articles of one export into high- and low-citation files.

    For every publication year the mean citation count (``TC`` field) of that
    year's articles is computed; each article is then written to
    ``high_citations_file`` when its own count is greater than or equal to its
    year's mean, and to ``low_citations_file`` otherwise.

    Args:
        input_file: Tagged text export to read (UTF-8). Only records starting
            with ``PT J`` and terminated by an ``ER`` line are considered.
        high_citations_file: Destination for articles with TC >= year mean.
        low_citations_file: Destination for articles with TC < year mean.

    Notes:
        * Articles without a ``PY`` line are ignored; a missing ``TC`` line is
          treated as 0 citations.
        * ``PY`` and ``TC`` are matched only at the start of a line, so text
          inside titles or abstracts (e.g. "CHEMOTHERAPY 1999", "HTC 500")
          can no longer be mistaken for the year or citation field.
        * A final record whose ``ER`` line lacks a trailing newline is kept.
    """
    # Read the entire file content
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split the content into individual articles (PT J ... ER); the last
    # record may end at end-of-file instead of with a newline.
    articles = re.findall(r'(PT J.*?\nER(?:\n|\Z))', content, re.DOTALL)

    # Field tags sit at the beginning of a line, hence the ^ anchor + MULTILINE
    year_pattern = re.compile(r'^PY (\d{4})', re.MULTILINE)
    tc_pattern = re.compile(r'^TC (\d+)', re.MULTILINE)

    # Single parsing pass: remember (article text, year, citations) and group TC by year
    parsed_articles = []
    year_tc_values = defaultdict(list)

    for article in articles:
        year_match = year_pattern.search(article)
        if not year_match:
            continue
        tc_match = tc_pattern.search(article)
        year = year_match.group(1)
        tc = int(tc_match.group(1)) if tc_match else 0
        if not article.endswith('\n'):  # normalise a record cut off at end-of-file
            article += '\n'
        parsed_articles.append((article, year, tc))
        year_tc_values[year].append(tc)

    # Compute per-year mean TC
    year_mean_tc = {
        year: sum(tcs) / len(tcs)
        for year, tcs in year_tc_values.items()
    }

    # Print per-year mean TC values
    for year in sorted(year_mean_tc):
        print(f"📊 Year {year} - Mean TC: {year_mean_tc[year]:.2f}")

    # Write each article to the file matching its standing against its year's mean
    high_count = 0
    low_count = 0

    with open(high_citations_file, 'w', encoding='utf-8') as high_f, \
         open(low_citations_file, 'w', encoding='utf-8') as low_f:

        for article, year, tc in parsed_articles:
            if tc >= year_mean_tc[year]:
                high_f.write(article + '\n')
                high_count += 1
            else:
                low_f.write(article + '\n')
                low_count += 1

    # Print summary for this file
    print(f"\n✅ Finished processing {input_file}")
    print(f"🟥 Articles with TC ≥ year mean: {high_count}")
    print(f"🟩 Articles with TC < year mean: {low_count}")
    print("--------------------------------------------------\n")

# ---------- Split BOTH corpora into high / low citation files ----------

# Mathematical oncology corpus
process_articles_by_year(
    "ALL_math_onco.txt",
    "ALL_math_onco_high_citations.txt",
    "ALL_math_onco_low_citations.txt",
)

# Mathematical biology corpus with oncology excluded
process_articles_by_year(
    "ALL_math_bio_without_onco.txt",
    "ALL_math_bio_without_onco_high_citations.txt",
    "ALL_math_bio_without_onco_low_citations.txt",
)


In [None]:
import csv
from collections import defaultdict

# High-citation export -> output CSV summary, expanded into (input, output)
# pairs in the order they should be processed.
files_to_process = list({
    "ALL_math_onco_high_citations.txt": "ALL_math_onco_high_citations_number_authors_with_mean.csv",
    "ALL_math_bio_without_onco_high_citations.txt": "ALL_math_bio_without_onco_high_citations_number_authors_with_mean.csv",
}.items())

def process_file(input_file, output_file):
    """Tabulate authors-per-article by publication year for one tagged export.

    The input is a field-tagged bibliographic export in which each record
    starts with a ``PT `` line and ends with an ``ER`` line, with ``AU`` holding
    the authors and ``PY`` the publication year.

    Args:
        input_file: Path of the tagged text file to read (UTF-8).
        output_file: Path of the CSV to write. It has one row per year with
            columns ``Year``, ``author_1`` .. ``author_<max>`` (number of
            articles with that many authors) and ``mean_authors``.

    Notes:
        * Authors are counted with the heuristic "two whitespace-separated
          words per author". Indented continuation lines directly after the
          ``AU`` line are included (one heuristic count per line), so exports
          that list one author per line are not reported as single-author.
          NOTE(review): names with more than two words are still miscounted.
        * Records without a ``PY`` line are skipped; records with an author
          count of 0 do not appear in the CSV columns or the mean.
        * NOTE(review): this definition replaces any earlier ``process_file``
          in the kernel namespace once this cell has run.
    """
    print(f"🔍 Processing {input_file}...")
    year_author_counts = defaultdict(lambda: defaultdict(int))  # year -> author count -> articles
    max_author_count = 0  # highest number of authors found in any record

    with open(input_file, 'r', encoding='utf-8') as file:
        current_entry = None
        in_author_field = False  # True while reading continuation lines of AU

        for line in file:
            if line.startswith("PT "):  # start of a new record
                current_entry = {"PY": None, "AU_count": 0}
                in_author_field = False
                continue
            if current_entry is None:  # text outside any record
                continue

            # Indented, non-blank lines right after "AU " continue the author list.
            if in_author_field and line[:1].isspace() and line.strip():
                current_entry["AU_count"] += len(line.split()) // 2
                continue
            in_author_field = False

            if line.startswith("PY "):
                current_entry["PY"] = line[3:].strip()
            elif line.startswith("AU "):
                # split() already collapses irregular spacing between names
                current_entry["AU_count"] = len(line[3:].split()) // 2
                in_author_field = True
            elif line.startswith("ER"):  # end of record
                if current_entry["PY"] is not None:
                    year = current_entry["PY"]
                    author_count = current_entry["AU_count"]
                    year_author_counts[year][author_count] += 1
                    max_author_count = max(max_author_count, author_count)
                # Always close the record so stray lines cannot attach to it
                current_entry = None

    # Create dynamic fieldnames based on actual max author count
    fieldnames = ["Year"] + [f"author_{i}" for i in range(1, max_author_count + 1)] + ["mean_authors"]

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for year, author_counts in sorted(year_author_counts.items()):
            row = {"Year": year}
            total_articles = 0
            total_authors = 0

            for i in range(1, max_author_count + 1):
                count = author_counts.get(i, 0)
                row[f"author_{i}"] = count
                total_articles += count
                total_authors += i * count

            row["mean_authors"] = round(total_authors / total_articles, 2) if total_articles > 0 else 0
            writer.writerow(row)

    print(f"✅ Results written to {output_file}")

# Build the per-year author summaries for every configured high-citation export
for input_file, output_file in files_to_process:
    process_file(input_file=input_file, output_file=output_file)

print("\n🎉 All files processed with full author count distribution and mean author count!")


In [None]:
%%R

# NOTE: `%%R` (cell magic) must be the first line so the whole cell is run as R;
# a bare `%R` line magic would leave the rest of the cell to be parsed as Python.

# ---------- Load libraries ----------
library(tidyr)
library(dplyr)
library(scales)

# Folder the CSV summaries are read from (kept in one place so it is easy to change)
results_dir <- "/Users/kirpu383/OneDrive - Uppsala universitet/Aim 1/Results/Collaborative"

# Expands a wide per-year table (columns author_1, author_2, ...) into one row
# per article: author_<k> columns become an integer `author_count` column and
# each (Year, author_count) row is repeated `count` times.
expand_author_counts <- function(wide_data) {
  wide_data %>%
    pivot_longer(cols = starts_with("author_"),
                 names_to = "author_count",
                 names_prefix = "author_",
                 values_to = "count") %>%
    mutate(author_count = as.integer(author_count)) %>%
    uncount(count)
}

# ---------- Load and clean the first dataset (Math Onco) ----------
data_onco <- read.csv(file.path(results_dir, "math_onco_number_authors_with_mean.csv"))
colnames(data_onco) <- gsub("\\.", "_", colnames(data_onco))  # Clean column names

# ---------- Convert wide to long format for Math Onco ----------
long_data_onco <- expand_author_counts(data_onco)

# ---------- Load and clean the second dataset (Math Bio Without Onco) ----------
data_bio <- read.csv(file.path(results_dir, "math_bio_without_onco_number_authors_with_mean.csv"))
colnames(data_bio) <- gsub("\\.", "_", colnames(data_bio))  # Clean column names

# ---------- Convert wide to long format for Math Bio Without Onco ----------
long_data_bio <- expand_author_counts(data_bio)

# ---------- Load and clean the high citation datasets ----------
mean_data_onco <- read.csv(file.path(results_dir, "ALL_math_onco_high_citations_number_authors_with_mean.csv"))
colnames(mean_data_onco) <- gsub("[ .>]", "_", colnames(mean_data_onco))  # Clean column names

mean_data_bio <- read.csv(file.path(results_dir, "ALL_math_bio_without_onco_high_citations_number_authors_with_mean.csv"))
# Fixed: this line previously cleaned mean_data_onco a second time and left mean_data_bio untouched
colnames(mean_data_bio) <- gsub("[ .>]", "_", colnames(mean_data_bio))  # Clean column names

# ---------- Ensure 'Year' is a factor with full range ----------
# Using the same levels everywhere keeps box positions aligned across datasets;
# years outside 1961-2024 become NA.
all_years <- factor(1961:2024)
long_data_onco$Year <- factor(long_data_onco$Year, levels = levels(all_years))
long_data_bio$Year <- factor(long_data_bio$Year, levels = levels(all_years))
mean_data_onco$Year <- factor(mean_data_onco$Year, levels = levels(all_years))
mean_data_bio$Year <- factor(mean_data_bio$Year, levels = levels(all_years))

# ---------- Function to pad missing years ----------
# Left-joins `data` onto a one-column frame holding every level of
# `year_levels`, so that each year is present in the result; years absent
# from `data` get NA in all other columns.
pad_missing_years <- function(data, year_levels) {
  lvls <- levels(year_levels)
  year_frame <- data.frame(Year = factor(lvls, levels = lvls))
  merge(year_frame, data, by = "Year", all.x = TRUE)
}

# ---------- Make sure every year level occurs in both long tables ----------
long_data_onco <- pad_missing_years(data = long_data_onco, year_levels = all_years)
long_data_bio <- pad_missing_years(data = long_data_bio, year_levels = all_years)

# ---------- Function to extract and count outliers ----------
# For each year, flags the author counts that boxplot.stats() reports as
# outliers, then tallies how many articles share each (Year, author_count)
# outlier value. The result carries two plotting helpers:
#   Year_num   - position of the year within levels(expanded_data$Year)
#   cex_scaled - `count` rescaled to the range 0.5-2, used as dot size
get_outlier_counts <- function(expanded_data) {
  year_levels <- levels(expanded_data$Year)

  flagged <- expanded_data %>%
    filter(!is.na(author_count)) %>%          # padded years carry NA counts
    group_by(Year) %>%
    mutate(is_outlier = author_count %in% boxplot.stats(author_count)$out) %>%
    ungroup()

  only_outliers <- filter(flagged, is_outlier)

  tallied <- only_outliers %>%
    group_by(Year, author_count) %>%
    summarise(count = n(), .groups = "drop")

  mutate(
    tallied,
    Year_num = as.numeric(factor(Year, levels = year_levels)),
    cex_scaled = rescale(count, to = c(0.5, 2))
  )
}

# ---------- Extract outlier dot info for both datasets ----------
outlier_dots_onco <- get_outlier_counts(long_data_onco)
outlier_dots_bio <- get_outlier_counts(long_data_bio)

# ---------- Shared y-axis limits ----------
# Both panels use the same limits so the two fields can be compared directly.
combined_y_range <- range(c(long_data_onco$author_count, long_data_bio$author_count), na.rm = TRUE)
y_limits <- c(floor(combined_y_range[1]), ceiling(combined_y_range[2]))
y_ticks <- seq(y_limits[1], y_limits[2], by = 1)

# ---------- Define x-axis ticks: every other year ----------
year_labels <- levels(all_years)
x_ticks <- seq(1, length(year_labels), by = 2)
x_labels <- year_labels[x_ticks]

# ---------- Panel helper ----------
# Draws one panel: dotted grid, per-year boxplots (outliers suppressed and
# re-drawn as size-scaled dots), turquoise stars for the yearly mean of all
# articles and red crosses for the yearly mean of the high-citation subset.
#   long_data    - one row per article with columns Year (factor) and author_count
#   mean_data    - per-year table with columns Year (factor) and mean_authors
#   outlier_dots - output of get_outlier_counts() for long_data
#   title        - panel title passed to boxplot()
#   box_col      - fill colour of the boxes
# Returns (invisibly) the per-year means of long_data.
draw_author_panel <- function(long_data, mean_data, outlier_dots, title, box_col) {
  # Empty plot first so the grid lines end up underneath the boxes
  plot(1, type = "n", xlim = range(x_ticks), ylim = y_limits,
       xaxt = "n", yaxt = "n", xlab = "", ylab = "", main = "")
  abline(h = y_ticks, col = "gray", lty = "dotted")

  boxplot(author_count ~ Year, data = long_data,
          add = TRUE,                     # draw onto the prepared plot
          notch = FALSE,
          xlab = "Year", ylab = "Number of Authors",
          main = title,
          xaxt = "n", yaxt = "n",
          col = box_col, border = "black", outline = FALSE)

  # Custom axes (after the boxplot)
  axis(1, at = x_ticks, labels = x_labels, las = 2)
  axis(2, at = y_ticks, las = 1)

  # Mean of the full dataset per year; points() skips NA/NaN values, so years
  # without articles are simply not drawn.
  yearly_means <- tapply(long_data$author_count, long_data$Year, mean, na.rm = TRUE)
  points(seq_along(yearly_means), yearly_means, pch = 8, col = "turquoise1", cex = 1)

  # High-citation means: the factor code of Year is its x position. A year
  # outside the factor levels is NA and is skipped instead of raising an error.
  points(as.integer(mean_data$Year), mean_data$mean_authors,
         pch = 4, col = "red", cex = 1.5)

  # Outlier dots, sized by how many articles share the value
  points(outlier_dots$Year_num, outlier_dots$author_count,
         pch = 16, col = "black", cex = outlier_dots$cex_scaled)

  invisible(yearly_means)
}

# ---------- Start plotting ----------
pdf("/Users/kirpu383/OneDrive - Uppsala universitet/Aim 1/Results/Collaborative/BOXPLOTS_NUMBER_AUTHORS.pdf",
    width = 16, height = 5.33)

# `finally` closes the PDF device even if drawing fails, so no half-written
# device is left open in the R session.
tryCatch({
  par(mfrow = c(1, 2), mar = c(6, 4, 4, 2))

  # ------------------ Plot for Math Onco ------------------
  means_onco <- draw_author_panel(long_data_onco, mean_data_onco, outlier_dots_onco,
                                  title = "Mathematical Oncology",
                                  box_col = "lightgray")

  # ------------------ Plot for Math Bio Without Onco ------------------
  means_bio <- draw_author_panel(long_data_bio, mean_data_bio, outlier_dots_bio,
                                 title = "Mathematical Biology excluding Mathematical Oncology",
                                 box_col = "gray")
}, finally = invisible(dev.off()))
