In [10]:
import os
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
import random

# Root directory; each immediate sub-folder is treated as one journal
base_dir = '.'

# Weight given to each selection criterion when splitting up the sample
# (all five criteria are weighted equally)
criteria_weights = dict(
    bibliographic=0.2,
    relevance=0.2,
    publication_date=0.2,
    times_cited=0.2,
    times_viewed=0.2,
)

# Load a single .bib file and hand back its parsed entries
def read_bib_file(file_path):
    """Parse the UTF-8 BibTeX file at *file_path* and return its entries."""
    with open(file_path, encoding='utf-8') as handle:
        parsed = bibtexparser.load(handle)
    return parsed.entries

# Process each journal folder: sample papers per criterion, de-duplicate them by
# ID, and write the selection plus one abstract file per paper into
# <journal>/abstracts.

def _pair_files_with_criteria(bib_file_paths, criteria):
    """Return a dict mapping each criterion to the .bib file to sample for it.

    A file is assigned to a criterion when the criterion name occurs in the
    file's base name. If that does not give every criterion exactly one
    distinct file, criteria are instead paired by position with the
    alphabetically sorted files, so the result never depends on the arbitrary
    order in which os.listdir() returns names.
    """
    ordered_paths = sorted(bib_file_paths)
    positional = dict(zip(criteria, ordered_paths))

    by_name = {}
    for criterion in criteria:
        hits = [path for path in ordered_paths if criterion in os.path.basename(path)]
        if len(hits) != 1:
            # File names do not identify this criterion unambiguously
            return positional
        by_name[criterion] = hits[0]

    # Two criteria resolving to the same file would sample it twice
    if len(set(by_name.values())) != len(by_name):
        return positional
    return by_name


# The sampling plan depends only on the weights, so build it once up front
# instead of once per journal folder.
total_samples = 20
samples_count = {criterion: int(total_samples * weight) for criterion, weight in criteria_weights.items()}

# int() truncates, so give any shortfall to the most heavily weighted criterion
# to make the counts add up to exactly total_samples
adjustment = total_samples - sum(samples_count.values())
most_weighted_criterion = max(criteria_weights, key=criteria_weights.get)
samples_count[most_weighted_criterion] += adjustment

# sorted() makes the processing order (and the order of warnings) reproducible
for journal_folder in sorted(os.listdir(base_dir)):
    journal_path = os.path.join(base_dir, journal_folder)
    if not os.path.isdir(journal_path):
        continue

    # Collect the .bib files that sit directly inside the journal folder
    bib_file_paths = [os.path.join(journal_path, f) for f in os.listdir(journal_path) if f.endswith('.bib')]

    # Ensure we have one file per criterion
    if len(bib_file_paths) != len(criteria_weights):
        print(f"Warning: Skipping {journal_folder} as it does not contain all required .bib files.")
        continue

    criterion_files = _pair_files_with_criteria(bib_file_paths, criteria_weights)

    # Sample from each criterion, never asking for more entries than a file holds
    entries_by_criterion = {}
    selected_entries = []
    for criterion, count in samples_count.items():
        criterion_entries = read_bib_file(criterion_files[criterion])
        entries_by_criterion[criterion] = criterion_entries
        selected_entries.extend(random.sample(criterion_entries, min(count, len(criterion_entries))))

    # Remove duplicate entries by ID; keep a real list because it is extended
    # below and assigned to BibDatabase.entries
    unique_entries = list({entry['ID']: entry for entry in selected_entries}.values())

    # If duplicates (or short files) left us below total_samples, top up from
    # the most heavily weighted criterion's entries that are not yet selected
    if len(unique_entries) < total_samples:
        remaining_count = total_samples - len(unique_entries)
        chosen_ids = {entry['ID'] for entry in unique_entries}
        # Keyed by ID so the top-up itself cannot introduce duplicates
        remaining_by_id = {
            entry['ID']: entry
            for entry in entries_by_criterion[most_weighted_criterion]
            if entry['ID'] not in chosen_ids
        }
        remaining_entries = list(remaining_by_id.values())
        unique_entries += random.sample(remaining_entries, min(remaining_count, len(remaining_entries)))

    # Save the selected entries to a new .bib file
    bib_db = BibDatabase()
    bib_db.entries = unique_entries
    writer = BibTexWriter()
    abstracts_dir = os.path.join(journal_path, 'abstracts')
    os.makedirs(abstracts_dir, exist_ok=True)

    selected_bib_path = os.path.join(abstracts_dir, 'selected_papers.bib')
    with open(selected_bib_path, 'w', encoding='utf-8') as bibfile:
        bibfile.write(writer.write(bib_db))

    # Save each abstract to a separate .txt file named after the entry ID
    # NOTE(review): IDs are used verbatim as file names; confirm they never
    # contain characters that are invalid in paths on the target OS.
    for entry in unique_entries:
        abstract_file_path = os.path.join(abstracts_dir, f"{entry['ID']}.txt")
        with open(abstract_file_path, 'w', encoding='utf-8') as abstract_file:
            abstract_file.write(entry.get('abstract', 'No abstract available.'))

print("Done")


Done
