On interdisciplinarity in mathematical oncology

In [None]:
!pip instalxl rpy2
%load_ext rpy2.ipython

Cited by

In [None]:
from collections import defaultdict, Counter
import os
import csv

# Location of the combined tagged export that the loop below reads
# (lines starting with "SO ", "PY " and "ER" are the ones that get used).
file_path = os.path.expanduser(
    '~/OneDrive - Uppsala universitet/Aim 1/Results/Interdisciplinarity/Cited_by/MBE_Output/combined_MBE_no_2025.txt'
)

# Nested tally: yearly_counts[year][journal] -> number of records
yearly_counts = defaultdict(Counter)

# Fields of the record currently being read; each stays None until its tag is seen
current_journal = current_year = None

# Function to normalize journal names
def normalize_journal(name):
    """Return ``name`` with surrounding whitespace removed, in title case.

    Title-casing makes differently capitalised spellings of the same journal
    collapse onto one key when records are tallied.
    """
    trimmed = name.strip()
    return trimmed.title()

# Read file line-by-line
# Assumes the Web of Science plain-text layout (two-letter tag at column 0,
# wrapped values continued on indented lines) -- TODO confirm against the export.
# Lines are therefore classified BEFORE stripping: an indented line is never a
# new tag, and an indented line that follows "SO" extends the journal name
# instead of being dropped (long source titles would otherwise be truncated).
with open(file_path, 'r', encoding='utf-8') as f:
    open_tag = None       # tag whose value may still continue on the next line
    journal_parts = []    # pieces of the (possibly wrapped) SO value

    for raw_line in f:
        line = raw_line.strip()
        if not line:
            # Blank separator line: nothing can continue across it
            open_tag = None
            continue

        if raw_line[0].isspace():
            # Continuation of the previous field; only a wrapped SO value matters here
            if open_tag == 'SO':
                journal_parts.append(line)
                current_journal = normalize_journal(' '.join(journal_parts))
            continue

        if line.startswith('SO '):
            journal_parts = [line[3:]]
            current_journal = normalize_journal(journal_parts[0])
            open_tag = 'SO'
        elif line.startswith('PY '):
            current_year = line[3:].strip()
            open_tag = 'PY'
        elif line == 'ER':
            # End of record: count it only when both fields were present
            if current_journal and current_year:
                yearly_counts[current_year][current_journal] += 1
            current_journal = None
            current_year = None
            open_tag = None
        else:
            # Any other tag closes the SO/PY value
            open_tag = line[:2]

# Output base name
# NOTE: the suffix is appended to the input path (minus its extension) without
# any separator. The loading cell that follows reads the file under exactly
# that concatenated name, so the two must be changed together.
base_name, _ = os.path.splitext(file_path)
csv_output = f"{base_name}MBE_journal_counts_by_year.csv"

# Column order (years) and row order (journals), both sorted
all_years = sorted(yearly_counts)
all_journals = sorted({name for per_year in yearly_counts.values() for name in per_year})

# ---------- Save CSV with Total Column ----------
header_row = ['Journal Name', *all_years, 'Total']
with open(csv_output, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_row)
    for journal in all_journals:
        # One count per year column; a year without this journal contributes 0
        yearly_values = [yearly_counts[year].get(journal, 0) for year in all_years]
        writer.writerow([journal, *yearly_values, sum(yearly_values)])

print(f"\n✅ Yearly journal counts (with totals, normalized) saved to:\n- CSV: {csv_output}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# ---------- Load Input Data ----------
# Per-journal yearly citation counts (comma-separated)
cited_by_df = pd.read_csv("MBE_Output/combined_MBE_no_2025MBE_journal_counts_by_year.csv")

# Tab-separated lookup tables: journal title -> Scopus ID, and Scopus ID -> subject area
title_to_id_df = pd.read_table("title_to_scopus_id.tsv")
id_to_subject_df = pd.read_table("scopus_id_to_code_to_subject_area.tsv")

# ---------- Normalize Journal Titles ----------
def normalize_title(title):
    """Reduce a journal title to a canonical key used for matching.

    The value is coerced to ``str`` first, so non-string cells (for example a
    missing value read as NaN) yield a key instead of raising AttributeError.

    Steps: lower-case, spell out "&" as "and", turn colons and hyphens into
    spaces, drop every remaining character that is neither a word character
    nor whitespace, collapse whitespace runs and trim the ends.

    Returns:
        The normalized title as a string.
    """
    title = str(title).lower()
    title = title.replace("&", "and")
    title = re.sub(r"[:\-]", " ", title)  # Replace colons and hyphens with space
    title = re.sub(r"[^\w\s]", "", title)  # Remove remaining punctuation
    title = re.sub(r"\s+", " ", title)  # Collapse multiple spaces
    return title.strip()

cited_by_df['Normalized_Title'] = cited_by_df['Journal Name'].apply(normalize_title)
title_to_id_df['Normalized_Title'] = title_to_id_df['title_name'].apply(normalize_title)

# ---------- Merge journal → Scopus ID ----------
# Left join keeps every cited-by journal; unmatched ones get a missing scopus_id.
# NOTE(review): if several lookup rows share one normalized title, the journal's
# row is repeated once per Scopus ID -- verify the lookup table is unique per title.
merged = pd.merge(
    cited_by_df,
    title_to_id_df[['scopus_id', 'Normalized_Title']],
    on='Normalized_Title',
    how='left'
)

# ---------- Save unmapped journals ----------
failed_titles_file = "MBE_cited_by_failed_titles_yearly.csv"
failed_titles = merged[merged['scopus_id'].isna()]
failed_titles[['Journal Name']].drop_duplicates().to_csv(failed_titles_file, index=False)

# ---------- Drop missing mappings ----------
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger pandas' chained-assignment warning.
merged = merged.dropna(subset=['scopus_id']).copy()
merged['scopus_id'] = merged['scopus_id'].astype(int)
id_to_subject_df['scopus_id'] = id_to_subject_df['scopus_id'].astype(int)

# ---------- Merge Scopus ID → Subject Info ----------
final = pd.merge(merged, id_to_subject_df, on='scopus_id', how='left')

# ---------- Aggregate per journal, keep year data ----------
# Get year columns (they're all numbers or "Total")
year_columns = [col for col in cited_by_df.columns if col.isdigit() or col == "Total"]


def _format_code(code):
    """Render one subject code as text, writing integral floats without '.0'.

    The left merge above turns the code column into floats as soon as one
    journal has no subject row; without this, codes would be saved as
    "2613.0", which the classification step cannot parse as an integer.
    """
    if isinstance(code, float) and code.is_integer():
        return str(int(code))
    return str(code)


def _join_codes(codes):
    """Join the distinct, non-missing codes of one journal with '; '."""
    return '; '.join(sorted({_format_code(code) for code in codes.dropna()}))


def _join_descriptions(descriptions):
    """Join the distinct, non-missing subject descriptions of one journal with '; '."""
    return '; '.join(sorted(set(descriptions.dropna())))


# Group and aggregate: the year values are identical on every subject row of a
# journal, so 'first' recovers them; subject columns are collapsed to one string.
final_aggregated = final.groupby(['Journal Name', 'scopus_id'], as_index=False).agg(
    {
        **{col: 'first' for col in year_columns},  # Keep year values
        'asjc_code': _join_codes,
        'asjc_description': _join_descriptions
    }
)

# ---------- Final column order ----------
final_aggregated = final_aggregated[['Journal Name'] + year_columns + ['scopus_id', 'asjc_code', 'asjc_description']]

# ---------- Save final table ----------
subjects_file = "MBE_cited_by_journals_with_subjects_with_years.csv"
final_aggregated.to_csv(subjects_file, index=False)

# Report the files that were actually written by this cell
print(f"✅ Done! Output saved as: {subjects_file}")
print(f"⚠️  Check '{failed_titles_file}' for any unmatched journals.")


In [None]:
import pandas as pd

# ---------- Category Sets ----------
# Lookup tables of subject codes used by classify_category below.
# NOTE(review): the sets are not disjoint -- 2613 is listed in Mathematics,
# Natural_Science and Other, and 1605, 1606, 1804, 1900, 1901, 1905, 1906, 1909,
# 2212 and 2304 are listed in both Natural_Science and Other. The classifier
# only ever uses Natural_Science and Other through their union, and 2613 counts
# as Mathematics for rules 2-3. Arts_Humanities_Social_science is not
# referenced by the classifier.
Arts_Humanities_Social_science = {1204, 2216, 1200, 1201, 1205, 1208, 1206, 1202, 1207, 1203, 3310, 1213,
    1209, 1210, 1211, 1212, 1402, 2003, 3314, 3302, 3313, 3322, 1403, 1400, 1401, 1410, 1404, 1405, 1406,
    1407, 1408, 3315, 3316, 3303, 2002, 2000, 2001, 3304, 2308, 2309, 2310, 2311, 3317, 3305, 1409, 3308,
    3309, 3320, 3321, 3204, 3200, 3201, 3207, 3306, 3307, 3311, 3300, 3301, 3318, 3319, 3312, 1803}

Other = {2215, 1702, 2204, 1703, 1704, 1705, 1700, 1701, 1706, 1707, 2207, 1708, 1709, 1710, 1802, 1711,
    1712, 2614, 2212, 2214, 1502, 1503, 1500, 1501, 1504, 1505, 1506, 1507, 1508, 2205, 2208, 2100, 2101,
    2102, 2103, 2104, 2105, 2200, 2201, 2213, 2202, 2203, 2206, 2209, 2210, 2211, 1800, 1801, 2302, 2305,
    2300, 2301, 2306, 2312, 2500, 2501, 1900, 1901, 1905, 2304, 1906, 1909, 1605, 1606, 2613, 1804}

Life_Sciences_Medicine = {
    1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 2702, 1314, 2737, 1302, 2402,
    1303, 1300, 1301, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 2403, 2400, 2401, 2404, 1312, 2405,
    1315, 2406, 3502, 3503, 3500, 3501, 3504, 3505, 3506, 2307, 2703, 2802, 2704, 2803, 2705, 2804, 2805,
    2707, 2706, 2708, 2806, 2709, 2710, 2711, 2807, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720,
    2721, 2722, 2723, 2725, 2724, 2700, 2701, 2726, 1313, 2727, 2808, 2728, 2800, 2801, 2729, 2730, 2731,
    2732, 2733, 2734, 2735, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2809, 2746, 2747, 2748, 2902,
    2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2900, 2901, 2916, 2917,
    2918, 2919, 2920, 2921, 2922, 2923, 3602, 3603, 3604, 3605, 3600, 3601, 3606, 3607, 3608, 3609, 3610,
    3613, 3614, 3615, 3616, 3002, 3003, 3004, 2736, 3000, 3001, 3611, 3005, 3202, 3203, 3205, 3206, 3612,
    3402, 3403, 3404, 3400, 3401
}

Natural_Science = {
    1602, 1600, 1601, 1603, 1604, 1605, 1606, 1607, 1902, 1903, 1900, 1901, 1904, 1905, 1906, 1907, 1908,
    1909, 1910, 1911, 1912, 1913, 2303, 2304, 2212, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 3102, 3103,
    3107, 3104, 3105, 2610, 3106, 3100, 3101, 3108, 3109, 3110, 2613, 1804
}

Mathematics = {2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2600, 2601, 2611, 2612, 2613}

# Scopus IDs that are always reported as "Focused Journals"
focused_journal_ids = {29663, 29610, 13845, 5200152802, 24562}


# ---------- Classification Function ----------
def _parse_asjc_codes(code_str):
    """Return the set of integer codes in a '; '-separated cell, or None if unusable.

    Accepts strings as well as plain numbers (a column holding a single code
    per row is read back from CSV as a number), and tolerates integral float
    spellings such as "2613.0". Any token that is not a whole number makes the
    entire cell unusable.
    """
    if code_str is None or (not isinstance(code_str, str) and pd.isna(code_str)):
        return None

    codes = set()
    for token in str(code_str).split(';'):
        token = token.strip()
        if not token:
            continue
        try:
            number = float(token)
        except ValueError:
            return None
        if not number.is_integer():  # also rejects nan / inf
            return None
        codes.add(int(number))
    return codes or None


def classify_category(code_str, scopus_id):
    """Assign a journal to one of the analysis categories.

    Args:
        code_str: Subject codes of the journal, separated by ';' (a string, a
            single number, or a missing value).
        scopus_id: Scopus ID of the journal, compared against
            ``focused_journal_ids``.

    Returns:
        A ``(category_number, category_name)`` tuple. ``(-1, "No Code")`` is
        returned when no usable code is present; this check comes first, so it
        also applies to focused journals without codes.
    """
    codes = _parse_asjc_codes(code_str)
    if codes is None:
        return (-1, "No Code")

    in_math = codes & Mathematics
    in_life = codes & Life_Sciences_Medicine
    in_natural = codes & Natural_Science
    in_other = codes & Other

    # 1: In our focus journals
    if scopus_id in focused_journal_ids:
        return (1, "Focused Journals")

    # 2: At least one in math and at least one in life, and nothing else
    if in_math and in_life and all(code in in_math or code in in_life for code in codes):
        return (2, "Mathematics and Life Sciences")

    # 3: All codes are in mathematics
    elif in_math == codes:
        return (3, "Mathematics")

    # 4: All codes are in life sciences
    elif in_life == codes:
        return (4, "Life Sciences")

    # 5: The only code is 1000
    elif codes == {1000}:
        return (5, "Multidisciplinary")

    # 6: At least one code in life sciences, and all others in natural/math/other
    elif in_life and all(code in (in_natural | in_math | in_other) for code in codes if code not in in_life):
        return (6, "Life Sciences and STEM")

    # 7: Anything else
    else:
        return (7, "Other")

# ---------- File Paths ----------
input_file = "MBE_cited_by_journals_with_subjects_with_years.csv"
output_file = "MBE_cited_by_with_categories_and_years.csv"

# ---------- Load Data ----------
df = pd.read_csv(input_file)


# ---------- Apply Category Classification ----------
def _categorize_row(row):
    """Classify one journal row; the (number, name) pair becomes two columns."""
    return pd.Series(classify_category(row["asjc_code"], row["scopus_id"]))


df[["Our Category", "Category Name"]] = df.apply(_categorize_row, axis=1)

# ---------- Save Output ----------
df.to_csv(output_file, index=False)
print(f"✅ Saved to: {output_file}")


In [None]:
%%R
# %%R (cell magic) sends the WHOLE cell to R; a bare %R line magic only covers
# its own line and would leave the code below to be parsed as Python.

# Load required library
library(ggplot2)

# Data values
# NOTE(review): `Categories_cited_by` is not created by any cell shown in this
# notebook; it has to exist in the R session already -- confirm where it comes from.
percentages <- Categories_cited_by$Categories

# Category names
category_names <- c(
  "Focus Journals",                 
  "Mathematics and Life Science",  
  "Mathematics",                   
  "Life Sciences",                 
  "Multidisciplinary",             
  "Life Science and STEM",         
  "Other"
)

# Custom colors
custom_colors <- c(
  "orange",    # 1: Focused Journals
  "blue",      # 2: Math and Life Sciences
  "red",       # 3: Mathematics
  "green",     # 4: Life Sciences
  "purple",    # 5: Multidisciplinary
  "yellow",    # 6: LS and STEM
  "grey"       # 7: Other
)

# One value per category is required; data.frame() would otherwise silently
# recycle a shorter vector and mislabel the bars.
stopifnot(length(percentages) == length(category_names))

# Create a data frame
df <- data.frame(
  Category = factor(category_names, levels = category_names),  # preserve order
  Percentage = percentages
)

# Plot a stacked bar chart (single stacked column)
ggplot(df, aes(x = "All Categories", y = Percentage, fill = Category)) +
  geom_bar(stat = "identity", width = 0.5) +
  scale_fill_manual(values = custom_colors) +
  labs(title = "Category Distribution", x = "", y = "Percentage") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),  # remove x-axis text since only one bar
        axis.ticks.x = element_blank())


In [None]:
%%R
# %%R (cell magic) sends the WHOLE cell to R; a bare %R line magic only covers
# its own line and would leave the code below to be parsed as Python.

# Define the file names and titles
files <- c(
  "BMB_cited_by_categories.csv",
  "JMB_cited_by_categories.csv",
  "JTB_cited_by_categories.csv",
  "MB_cited_by_categories.csv",
  "MBE_cited_by_categories.csv"
)

titles <- c(
  "Bull. Math. Biol.",
  "J. Math. Biol.",
  "J. Theor. Biol.",
  "Math. Biosci.",
  "Math. Biosci. & Eng."
)

# Category codes as written by the Python classifier; -1 is its "No Code" result.
# Names and colors below are looked up by position in this vector.
category_codes <- c(1, 2, 3, 4, 5, 6, 7, -1)

# Define the custom colors (matching your category mapping)
custom_colors <- c(
  "orange",    # 1: Focused Journals
  "blue",      # 2: Math and Life Sciences
  "red",       # 3: Mathematics
  "green",     # 4: Life Sciences
  "purple",    # 5: Multidisciplinary
  "yellow",    # 6: LS and STEM
  "grey",      # 7: Other
  "black"      # -1: No Code
)

# Category name mapping
category_names <- c(
  "Focus Journals",                 
  "Mathematics and Life Science",  
  "Mathematics",                   
  "Life Sciences",                 
  "Multidisciplinary",             
  "Life Science and STEM",         
  "Other",
  "No Code"
)

# Set layout to plot 1 row with one column per file
par(mfrow = c(1, length(files)), mar = c(2, 2, 2, 2))  # Adjust margins for closer titles

# Loop through each file and plot
for (i in seq_along(files)) {
  df <- read.csv(paste0("~/OneDrive - Uppsala universitet/Aim 1/Results/Interdisciplinarity/Cited_by/", files[i]))
  df$Count <- as.numeric(df$Count)
  summary_df <- aggregate(Count ~ Our.Category, data = df, sum)

  # Look codes up with match() rather than using them as subscripts: a -1 used
  # as a subscript means "drop element 1" in R and cannot be mixed with
  # positive subscripts.
  idx <- match(summary_df$Our.Category, category_codes)
  summary_df$Category_Name <- category_names[idx]
  total_count <- sum(summary_df$Count)
  summary_df$Percentage <- (summary_df$Count / total_count) * 100
  
  pie(summary_df$Count,
      labels = paste0(round(summary_df$Percentage, 1), "%"),
      main = titles[i],
      col = custom_colors[idx],
      clockwise = TRUE)
}

In [None]:
%%R
# %%R (cell magic) sends the WHOLE cell to R; a bare %R line magic only covers
# its own line and would leave the code below to be parsed as Python.

library(tidyverse)

# Load data (path relative to the home directory, like the other Cited_by cell,
# instead of a user-specific absolute path)
df <- read_csv(path.expand("~/OneDrive - Uppsala universitet/Aim 1/Results/Interdisciplinarity/Cited_by/ALL_cited_by_with_categories_and_years.csv"))

# Pivot year columns into long format
df_long <- df %>%
  pivot_longer(cols = matches("^\\d{4}$"),  # Matches 4-digit year columns
               names_to = "Year",
               values_to = "Count") %>%
  mutate(Year = as.integer(Year))

# Summarize and normalize: totals per year and category, then each category's
# share of its year; years whose total is zero are dropped to avoid 0/0.
summary_df <- df_long %>%
  group_by(Year, `Category Name`) %>%
  summarise(Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
  group_by(Year) %>%
  filter(sum(Count, na.rm = TRUE) > 0) %>%
  mutate(Percentage = Count / sum(Count) * 100) %>%
  ungroup()

# Define custom colors and order.
# "No Code" is what the Python classifier writes for journals without usable
# codes; without a level for it those rows would turn into NA in the factor.
category_order <- c(
  "Focused Journals",
  "Mathematics and Life Sciences",
  "Mathematics",
  "Life Sciences",
  "Multidisciplinary",
  "Life Sciences and STEM",
  "Other",
  "No Code"
)

custom_colors <- c(
  "Focused Journals" = "orange",
  "Mathematics and Life Sciences" = "blue",
  "Mathematics" = "red",
  "Life Sciences" = "green",
  "Multidisciplinary" = "purple",
  "Life Sciences and STEM" = "yellow",
  "Other" = "grey",
  "No Code" = "black"
)

# Set factor level to control stacking order
summary_df <- summary_df %>%
  mutate(`Category Name` = factor(`Category Name`, levels = category_order))

# Plot
p <- ggplot(summary_df, aes(x = Year, y = Percentage, fill = `Category Name`)) +
  geom_area(alpha = 0.8, color = "black", size = 0.1) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Percentage of Journal Categories Over Time",
    x = "Year",
    y = "Percentage of Total",
    fill = "Category"
  ) +
  theme_minimal(base_size = 14)

# The plot is not the last expression of the cell, so it is printed explicitly
# rather than relying on auto-printing.
print(p)

summary_wide <- summary_df %>%
  select(Year, `Category Name`, Percentage) %>%
  pivot_wider(names_from = `Category Name`, values_from = Percentage)

# View or export: View() needs an interactive data viewer; fall back to
# printing the table when the session is not interactive.
if (interactive()) {
  View(summary_wide)
} else {
  print(summary_wide)
}
#write_csv(summary_wide, "cited_by_percentages_wide.csv")


References

In [None]:
from collections import Counter
import csv
from pathlib import Path

def _iter_source_titles(lines):
    """Yield every SO (source title) value found in ``lines``.

    A title that wraps onto following indented lines is joined back into one
    string. Assumes the tagged plain-text layout in which a field starts with
    its tag at column 0 and continuation lines are indented -- TODO confirm
    against the exported files.
    """
    parts = None  # pieces of the SO value currently being assembled
    for raw_line in lines:
        text = raw_line.strip()
        indented = raw_line[:1].isspace()

        if parts is not None:
            if indented and text:
                parts.append(text)
                continue
            # Any non-continuation line closes the pending title
            yield ' '.join(parts)
            parts = None

        if not indented and text.startswith('SO '):
            name = text[3:].strip()
            if name:
                parts = [name]

    if parts is not None:
        yield ' '.join(parts)


def process_year_files_combined_csv(years_dir=None):
    """Count journal occurrences per year file and write them to one CSV.

    Every ``combined_*.txt`` file in ``years_dir`` is scanned for ``SO`` lines;
    the year is taken from the part of the file name after the last
    underscore. The result is written to ``MBE_years_journal_counts.csv``
    inside ``years_dir`` with the columns Year, Journal Name, Count.

    Args:
        years_dir: Folder holding the per-year files. Defaults to
            ``Years_MBE/combined`` under the current working directory.

    Returns:
        None. If the folder does not exist, a message is printed and nothing
        is written.
    """
    if years_dir is None:
        years_dir = Path.cwd() / "Years_MBE/combined"
    else:
        years_dir = Path(years_dir)

    if not years_dir.is_dir():
        print(f"Error: folder not found: {years_dir}")
        return

    # Master CSV file
    master_csv = years_dir / "MBE_years_journal_counts.csv"

    with open(master_csv, 'w', newline='', encoding='utf-8') as master_file:
        writer = csv.writer(master_file)
        writer.writerow(['Year', 'Journal Name', 'Count'])

        # glob() yields files in arbitrary order; sort so the output is reproducible
        for year_file in sorted(years_dir.glob('combined_*.txt')):
            year = year_file.stem.split('_')[-1]  # Extract year from filename

            with open(year_file, 'r', encoding='utf-8') as f:
                journal_counter = Counter(_iter_source_titles(f))

            # Write to master CSV, most frequent journal first
            for journal, count in journal_counter.most_common():
                writer.writerow([year, journal, count])

            print(f"Processed year: {year}")

    print(f"\n✅ All years combined in: {master_csv}")

if __name__ == "__main__":
    process_year_files_combined_csv()


In [None]:
import pandas as pd
import re

# -----------------------
# Step 1: Load data (wide format with years)
# -----------------------
# NOTE(review): this file is not written by any cell visible in this notebook;
# presumably it is a pivot of the per-year counts produced earlier -- confirm.
cited_by_df = pd.read_csv("all_journals_by_year_pivoted.csv")

# Remove the "Total Count" column when the file has one; otherwise leave the table as is
cited_by_df = cited_by_df.drop(columns=["Total Count"], errors="ignore")

# -----------------------
# Step 2: Normalize titles
# -----------------------
def normalize_title(title):
    """Build the canonical matching key for a journal title.

    The input is converted to ``str`` and lower-cased, "&" is spelled out as
    "and", then three substitutions run in order: colons/hyphens become
    spaces, every other non-word, non-space character is removed, and runs of
    whitespace collapse to a single space. Leading and trailing whitespace is
    trimmed from the result.
    """
    text = str(title).lower().replace("&", "and")
    substitutions = (
        (r"[:\-]", " "),
        (r"[^\w\s]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

cited_by_df['Normalized_Title'] = cited_by_df['Journal Name'].apply(normalize_title)

# -----------------------
# Step 3: Map Scopus IDs
# -----------------------
title_to_id_df = pd.read_csv("title_to_scopus_id.tsv", sep="\t")
title_to_id_df['Normalized_Title'] = title_to_id_df['title_name'].apply(normalize_title)

# Merge on normalized title (left join: unmatched journals keep a missing scopus_id)
merged = pd.merge(
    cited_by_df,
    title_to_id_df[['Normalized_Title', 'scopus_id']],
    on='Normalized_Title',
    how='left'
)

# Save unmatched titles
merged[merged['scopus_id'].isna()][['Journal Name']].drop_duplicates().to_csv("failed_titles_references_all.csv", index=False)

# Drop unmatched; .copy() makes the filtered frame independent so the column
# assignment below does not trigger pandas' chained-assignment warning.
merged = merged.dropna(subset=['scopus_id']).copy()
merged['scopus_id'] = merged['scopus_id'].astype(int)

# -----------------------
# Step 4: Map ASJC subject areas
# -----------------------
id_to_subject_df = pd.read_csv("scopus_id_to_code_to_subject_area.tsv", sep="\t")
id_to_subject_df['scopus_id'] = id_to_subject_df['scopus_id'].astype(int)


def _join_codes(codes):
    """Join the distinct, non-missing codes of one journal with '; '.

    Integral floats (the dtype pandas uses when the column contains missing
    values) are written without the trailing '.0' so that they can later be
    parsed as integers.
    """
    unique_codes = sorted(set(codes.dropna()))
    return "; ".join(
        str(int(code)) if isinstance(code, float) and code.is_integer() else str(code)
        for code in unique_codes
    )


def _join_descriptions(descriptions):
    """Join the distinct, non-missing descriptions of one journal with '; '.

    Missing values are removed first: sorting a mix of strings and NaN raises
    TypeError.
    """
    return "; ".join(sorted(set(descriptions.dropna())))


# Aggregate subject areas per journal
subject_grouped = id_to_subject_df.groupby('scopus_id').agg({
    'asjc_code': _join_codes,
    'asjc_description': _join_descriptions
}).reset_index()

# -----------------------
# Step 5: Merge subjects into full original table
# -----------------------
final = pd.merge(
    merged,
    subject_grouped,
    on='scopus_id',
    how='left'
)

# -----------------------
# Step 6: Reorder and save
# -----------------------
# Move subject columns next to journal name; every remaining column except the
# helper key 'Normalized_Title' is kept after them.
first_cols = ['Journal Name', 'scopus_id', 'asjc_code', 'asjc_description']
year_cols = [col for col in final.columns if col not in first_cols + ['Normalized_Title']]
final_output = final[first_cols + year_cols]

# Save
final_output.to_csv("ALL_journals_with_subjects_and_counts.csv", index=False)


In [None]:
import pandas as pd

# ---------- Category Sets ----------
# Lookup tables of subject codes used by classify_category below.
# NOTE(review): the sets are not disjoint -- 2613 is listed in Mathematics,
# Natural_Science and Other, and 1605, 1606, 1804, 1900, 1901, 1905, 1906, 1909,
# 2212 and 2304 are listed in both Natural_Science and Other. The classifier
# only ever uses Natural_Science and Other through their union, and 2613 counts
# as Mathematics for the "Mathematics" rules. Arts_Humanities_Social_science is
# not referenced by the classifier.
Arts_Humanities_Social_science = {1204, 2216, 1200, 1201, 1205, 1208, 1206, 1202, 1207, 1203, 3310, 1213,
    1209, 1210, 1211, 1212, 1402, 2003, 3314, 3302, 3313, 3322, 1403, 1400, 1401, 1410, 1404, 1405, 1406,
    1407, 1408, 3315, 3316, 3303, 2002, 2000, 2001, 3304, 2308, 2309, 2310, 2311, 3317, 3305, 1409, 3308,
    3309, 3320, 3321, 3204, 3200, 3201, 3207, 3306, 3307, 3311, 3300, 3301, 3318, 3319, 3312, 1803}

Other = {2215, 1702, 2204, 1703, 1704, 1705, 1700, 1701, 1706, 1707, 2207, 1708, 1709, 1710, 1802, 1711,
    1712, 2614, 2212, 2214, 1502, 1503, 1500, 1501, 1504, 1505, 1506, 1507, 1508, 2205, 2208, 2100, 2101,
    2102, 2103, 2104, 2105, 2200, 2201, 2213, 2202, 2203, 2206, 2209, 2210, 2211, 1800, 1801, 2302, 2305,
    2300, 2301, 2306, 2312, 2500, 2501, 1900, 1901, 1905, 2304, 1906, 1909, 1605, 1606, 2613, 1804}

Life_Sciences_Medicine = {
    1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 2702, 1314, 2737, 1302, 2402,
    1303, 1300, 1301, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 2403, 2400, 2401, 2404, 1312, 2405,
    1315, 2406, 3502, 3503, 3500, 3501, 3504, 3505, 3506, 2307, 2703, 2802, 2704, 2803, 2705, 2804, 2805,
    2707, 2706, 2708, 2806, 2709, 2710, 2711, 2807, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720,
    2721, 2722, 2723, 2725, 2724, 2700, 2701, 2726, 1313, 2727, 2808, 2728, 2800, 2801, 2729, 2730, 2731,
    2732, 2733, 2734, 2735, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2809, 2746, 2747, 2748, 2902,
    2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 2914, 2915, 2900, 2901, 2916, 2917,
    2918, 2919, 2920, 2921, 2922, 2923, 3602, 3603, 3604, 3605, 3600, 3601, 3606, 3607, 3608, 3609, 3610,
    3613, 3614, 3615, 3616, 3002, 3003, 3004, 2736, 3000, 3001, 3611, 3005, 3202, 3203, 3205, 3206, 3612,
    3402, 3403, 3404, 3400, 3401
}

Natural_Science = {
    1602, 1600, 1601, 1603, 1604, 1605, 1606, 1607, 1902, 1903, 1900, 1901, 1904, 1905, 1906, 1907, 1908,
    1909, 1910, 1911, 1912, 1913, 2303, 2304, 2212, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 3102, 3103,
    3107, 3104, 3105, 2610, 3106, 3100, 3101, 3108, 3109, 3110, 2613, 1804
}

Mathematics = {2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2600, 2601, 2611, 2612, 2613}

# Scopus IDs that are always reported as "Focused Journals"
focused_journal_ids = {29663, 29610, 13845, 5200152802, 24562}


# ---------- Category Classification Function ----------
def _parse_asjc_codes(code_str):
    """Return the set of integer codes in a '; '-separated cell, or None if unusable.

    Tolerates integral float spellings such as "2613.0" (what a numeric cell
    becomes once its column contains missing values). Any token that is not a
    whole number makes the entire cell unusable.
    """
    codes = set()
    for token in str(code_str).split(';'):
        token = token.strip()
        if not token:
            continue
        try:
            number = float(token)
        except ValueError:
            return None
        if not number.is_integer():  # also rejects nan / inf
            return None
        codes.add(int(number))
    return codes or None


def classify_category(code_str, scopus_id):
    """Assign a journal to one of the analysis categories.

    Args:
        code_str: Subject codes of the journal, separated by ';' (a string, a
            single number, or a missing value).
        scopus_id: Scopus ID of the journal, compared against
            ``focused_journal_ids``.

    Returns:
        A ``(category_number, category_name)`` tuple. ``(-1, "No Code")`` is
        returned when no usable code is present; this check comes first, so it
        also applies to focused journals without codes.
    """
    if pd.isna(code_str):
        return (-1, "No Code")

    codes = _parse_asjc_codes(code_str)
    if codes is None:
        return (-1, "No Code")

    in_math = codes & Mathematics
    in_life = codes & Life_Sciences_Medicine
    in_natural = codes & Natural_Science
    in_other = codes & Other

    # Category logic: rules are tried in order, the first match wins
    if scopus_id in focused_journal_ids:
        return (1, "Focused Journals")
    elif in_math and in_life and all(code in in_math or code in in_life for code in codes):
        return (2, "Mathematics and Life Sciences")
    elif in_math == codes:
        return (3, "Mathematics")
    elif in_life == codes:
        return (4, "Life Sciences")
    elif codes == {1000}:
        return (5, "Multidisciplinary")
    elif in_life and all(code in (in_math | in_natural | in_other) for code in codes if code not in in_life):
        return (6, "Life Sciences and STEM")
    else:
        return (7, "Other")

# ---------- Load File ----------
# The previous cell saves this table as "ALL_journals_with_subjects_and_counts.csv";
# the name must match exactly, including case, or the read fails on
# case-sensitive file systems.
input_file = "ALL_journals_with_subjects_and_counts.csv"
df = pd.read_csv(input_file)

# ---------- Apply Classification ----------
# classify_category returns a (number, name) pair per row; wrapping it in a
# Series lets pandas spread the pair over the two new columns.
df[['Our Category', 'Category Name']] = df.apply(
    lambda row: pd.Series(classify_category(row['asjc_code'], row['scopus_id'])),
    axis=1
)

# ---------- Reorder and Save ----------
# Put category columns right after 'Journal Name' (assumes it's first)
col_order = ['Journal Name', 'Our Category', 'Category Name'] + [col for col in df.columns if col not in ['Journal Name', 'Our Category', 'Category Name']]
df = df[col_order]

output_file = "ALL_references_with_categories_and_years.csv"
df.to_csv(output_file, index=False)

# ---------- Summary ----------
print(f"✅ Saved to: {output_file}")
print(f"Total journals processed: {len(df)}")
print("Category distribution:")
print(df['Category Name'].value_counts())


In [None]:
%%R
# %%R (cell magic) sends the WHOLE cell to R; a bare %R line magic only covers
# its own line and would leave the code below to be parsed as Python.

# Load required library
library(ggplot2)

# Data values
# NOTE(review): `Caregories_references` is not created by any cell shown in this
# notebook, and the name looks like a misspelling of "Categories_references".
# It is left untouched because the object must match whatever exists in the R
# session -- confirm the intended name and where the object comes from.
percentages <- Caregories_references$Category

# Category names
category_names <- c(
  "Focus Journals",                 
  "Mathematics and Life Science",  
  "Mathematics",                   
  "Life Sciences",                 
  "Multidisciplinary",             
  "Life Science and STEM",         
  "Other"
)

# Custom colors
custom_colors <- c(
  "orange",    # 1: Focused Journals
  "blue",      # 2: Math and Life Sciences
  "red",       # 3: Mathematics
  "green",     # 4: Life Sciences
  "purple",    # 5: Multidisciplinary
  "yellow",    # 6: LS and STEM
  "grey"       # 7: Other
)

# One value per category is required; data.frame() would otherwise silently
# recycle a shorter vector and mislabel the bars.
stopifnot(length(percentages) == length(category_names))

# Create a data frame
df <- data.frame(
  Category = factor(category_names, levels = category_names),  # preserve order
  Percentage = percentages
)

# Plot a stacked bar chart (single stacked column)
ggplot(df, aes(x = "All Categories", y = Percentage, fill = Category)) +
  geom_bar(stat = "identity", width = 0.5) +
  scale_fill_manual(values = custom_colors) +
  labs(title = "Category Distribution", x = "", y = "Percentage") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),  # remove x-axis text since only one bar
        axis.ticks.x = element_blank())


In [None]:
%%R
# %%R (cell magic) sends the WHOLE cell to R; a bare %R line magic only covers
# its own line and would leave the code below to be parsed as Python.

# Define the file names and titles
files <- c(
  "BMB_referenced_categories.csv",
  "JMB_referenced_categories.csv",
  "JTB_referenced_categories.csv",
  "MB_referenced_categories.csv",
  "MBE_referenced_categories.csv"
)

titles <- c(
  "Bull. Math. Biol.",
  "J. Math. Biol.",
  "J. Theor. Biol.",
  "Math. Biosci.",
  "Math. Biosci. & Eng."
)

# Category codes as written by the Python classifier; -1 is its "No Code" result.
# Names and colors below are looked up by position in this vector.
category_codes <- c(1, 2, 3, 4, 5, 6, 7, -1)

# Define the custom colors (matching your category mapping)
custom_colors <- c(
  "orange",    # 1: Focused Journals
  "blue",      # 2: Math and Life Sciences
  "red",       # 3: Mathematics
  "green",     # 4: Life Sciences
  "purple",    # 5: Multidisciplinary
  "yellow",    # 6: LS and STEM
  "grey",      # 7: Other
  "black"      # -1: No Code
)

# Category name mapping
category_names <- c(
  "Focus Journals",                 
  "Mathematics and Life Science",  
  "Mathematics",                   
  "Life Sciences",                 
  "Multidisciplinary",             
  "Life Science and STEM",         
  "Other",
  "No Code"
)

# Set layout to plot 1 row with one column per file
par(mfrow = c(1, length(files)), mar = c(2, 2, 2, 2))  # Adjust margins for closer titles

# Loop through each file and plot
for (i in seq_along(files)) {
  # Folder name spelled "Uppsala universitet" (lower-case u) like every other
  # path in this notebook, so it also resolves on case-sensitive file systems.
  df <- read.csv(paste0("~/OneDrive - Uppsala universitet/Aim 1/Results/Interdisciplinarity/References/", files[i]))
  df$Count <- as.numeric(df$Count)
  summary_df <- aggregate(Count ~ Our.Category, data = df, sum)

  # Look codes up with match() rather than using them as subscripts: a -1 used
  # as a subscript means "drop element 1" in R and cannot be mixed with
  # positive subscripts.
  idx <- match(summary_df$Our.Category, category_codes)
  summary_df$Category_Name <- category_names[idx]
  total_count <- sum(summary_df$Count)
  summary_df$Percentage <- (summary_df$Count / total_count) * 100
  
  pie(summary_df$Count,
      labels = paste0(round(summary_df$Percentage, 1), "%"),
      main = titles[i],
      col = custom_colors[idx],
      clockwise = TRUE)
}

In [None]:
%%R
# This cell is R code: the %%R cell magic hands the whole cell to R. Without
# it the notebook's Python kernel would try to parse the code below.

library(tidyverse)
library(ggplot2)

# Load data (path relative to the home directory instead of a user-specific
# absolute path)
df <- read_csv(path.expand("~/OneDrive - Uppsala universitet/Aim 1/Results/Interdisciplinarity/References/ALL_references_with_categories_and_years.csv"))

# Pivot year columns into long format
df_long <- df %>%
  pivot_longer(
    cols = matches("^\\d{4}$"),  # Matches columns like 1962, 1963, ...
    names_to = "Year",
    values_to = "Count"
  ) %>%
  mutate(Year = as.integer(Year))

# Define the correct category order (same as Python script)
category_order <- c(
  "Focused Journals",
  "Mathematics and Life Sciences",
  "Mathematics",
  "Life Sciences", 
  "Multidisciplinary",
  "Life Sciences and STEM",
  "Other",
  "No Code"  # Written by the Python classification for journals without usable codes
)

# Summarize total count per year per category
summary_df <- df_long %>%
  # Ensure categories are in correct order
  mutate(`Category Name` = factor(`Category Name`, levels = category_order)) %>%
  group_by(Year, `Category Name`) %>%
  summarise(Count = sum(Count, na.rm = TRUE), .groups = "drop")

# Remove years with zero total count (avoids 0/0) and calculate percentages
summary_df <- summary_df %>%
  group_by(Year) %>%
  filter(sum(Count, na.rm = TRUE) > 0) %>%
  mutate(Percentage = Count / sum(Count) * 100) %>%
  ungroup()

# Define custom colors for categories
custom_colors <- c(
  "Focused Journals" = "orange",
  "Mathematics and Life Sciences" = "blue",
  "Mathematics" = "red",
  "Life Sciences" = "green",
  "Multidisciplinary" = "purple",
  "Life Sciences and STEM" = "yellow",
  "Other" = "grey",
  "No Code" = "black"
)

# Plot: Stacked area plot with custom colors
p <- ggplot(summary_df, aes(x = Year, y = Percentage, fill = `Category Name`)) +
  geom_area(alpha = 0.8, color = "black", size = 0.1) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  scale_fill_manual(values = custom_colors, drop = FALSE) +  # Maintain all categories
  labs(
    title = "Percentage of Journal Categories Over Time",
    x = "Year",
    y = "Percentage of Total",
    fill = "Category"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)
  )

# The plot is not the last expression of the cell, so it is printed explicitly
# rather than relying on auto-printing.
print(p)

# Save the plot (passed explicitly instead of relying on the "last plot")
ggsave("journal_categories_over_time.png", plot = p, width = 10, height = 6, dpi = 300)

summary_wide <- summary_df %>%
  select(Year, `Category Name`, Percentage) %>%
  pivot_wider(names_from = `Category Name`, values_from = Percentage)

# View or export: View() needs an interactive data viewer; fall back to
# printing the table when the session is not interactive.
if (interactive()) {
  View(summary_wide)
} else {
  print(summary_wide)
}