In [1]:
import pandas as pd
import numpy as np
import re

In [None]:
# Location of the CSV of papers. Kept under its own name so that `data`
# only ever refers to the DataFrame, never to the path string.
DATA_PATH = "all_papers.csv"

data = pd.read_csv(DATA_PATH)
print(f"Found {len(data)} records")

# Rows missing either the title or the abstract are dropped in one pass.
data = data.dropna(subset=['Title', 'Abstract'])
print(f"... but only {len(data)} records have title and abstract")

Found 998 records
... but only 571 records have title and abstract


In [4]:
# Show only the three text columns that filter_papers reads.
data.loc[:, ["Title", "Abstract", "Author Keywords"]]

Unnamed: 0,Title,Abstract,Author Keywords
0,The virtual driving instructor: Multi-agent sy...,"This work introduces the design, development, ...",
1,Knowledge graph-based multimodal neural networ...,The application of artificial intelligence in ...,defect detection; knowledge graph; multimodal ...
2,An exploration of knowledge-organizing technol...,Chronic low back pain (LBP) is influenced by a...,artificial intelligence; biopsychosocial model...
3,A Knowledge Retrieval Framework for Household ...,"In the field of domestic cognitive robotics, i...",Cognitive robotics; Knowledge retrieval framew...
4,Turning Transport Data to Comply with EU Stand...,Complying with the EU Regulation on multimodal...,Multimodal transport knowledge graph; Semantic...
...,...,...,...
566,Hybrid Collective Intelligence for Decision Su...,"Human knowledge is growing exponentially, prov...",Decision support systems
567,Organizations' terpersonal tivity owledge pres...,Knowledge is essential for organizations' grow...,Knowledge management
568,Towards Explainable Automatic Knowledge Graph ...,Knowledge graphs are important in human-center...,Knowledge graph
569,Assisting the Assistant: obot for Voice Custom...,"Despite recent advances in automation, custome...",Ontology


In [5]:

def filter_papers(df, inclusion_terms, exclusion_terms):
    """
    Filters a DataFrame of papers based on inclusion and exclusion criteria.

    Matching is a case-insensitive *substring* test, so a stem such as
    "ontolog" matches both "ontology" and "ontologies", and a short term
    such as "loss" also matches "glossary".

    Args:
        df: pandas DataFrame with columns 'Title', 'Abstract', and 'Author Keywords'.
          Cells that are not strings (e.g. NaN) are treated as empty text.
        inclusion_terms: List of terms.  If any are present (case-insensitive)
          in Title or Abstract, the paper is included (before exclusion).
          A single string is treated as a one-element list.
        exclusion_terms: List of terms. If any are present (case-insensitive) in the Title,
          or in the Author Keywords, the paper is excluded, even if inclusion terms match.
          A single string is treated as a one-element list.

    Returns:
        A new pandas DataFrame containing only the filtered papers. The
        original index labels and columns are preserved; the result is empty
        (but keeps its columns) when nothing matches.

    Raises:
        KeyError: if one of the required columns is missing from df.
    """

    def normalise_terms(terms):
        # A bare string would otherwise be iterated character by character.
        if isinstance(terms, str):
            terms = [terms]
        # Lower-case once here instead of once per row.
        return [term.lower() for term in terms]

    def build_mask(frame, terms, columns):
        # Build the mask by hand rather than with DataFrame.apply(axis=1):
        # on a zero-row frame apply does not return a boolean Series, which
        # makes the subsequent boolean indexing fragile.
        flags = []
        for cells in zip(*(frame[column] for column in columns)):
            # Non-string cells (NaN, None) contribute no text.
            text = " ".join(
                cell.lower() for cell in cells if isinstance(cell, str)
            ).strip()
            flags.append(any(term in text for term in terms))
        return pd.Series(flags, index=frame.index, dtype=bool)

    wanted = normalise_terms(inclusion_terms)
    unwanted = normalise_terms(exclusion_terms)

    # Step 1: keep papers whose title or abstract mentions an inclusion term.
    included = df[build_mask(df, wanted, ["Title", "Abstract"])]

    # Step 2: among those, flag papers whose title or author keywords
    # mention an exclusion term (the abstract is deliberately ignored).
    excluded = build_mask(included, unwanted, ["Title", "Author Keywords"])

    # Step 3: keep only the rows that were not flagged.
    return included[~excluded]


In [None]:

# Screening vocabulary.
# A paper is kept when its title or abstract contains at least one of these.
INCLUSION_TERMS = [
    "knowledge graph",
    "ontolog",
    "pattern",
    "model",
]
# A kept paper is dropped again when its title or author keywords contain
# any of these.
EXCLUSION_TERMS = [
    "machine learning",
    "gnn",
    "loss",
    "neural",
    "embedding",
    "transport",
    "deep learning",
    "adversarial",
    "assistant",
    "policy",
    "question answer",
]

# Run the filter on a copy so the loaded frame is left untouched.
filtered_df = filter_papers(
    data.copy(),
    INCLUSION_TERMS,
    EXCLUSION_TERMS,
)
print("\nFiltered DataFrame (Option 1):")
print(filtered_df.loc[:, ['Title', 'Abstract', 'Author Keywords']])

# Uncomment to write the screened papers to disk:
# filtered_df.to_csv('filtered_papers.csv', index=False)


Filtered DataFrame (Option 1):
                                                 Title  \
0    The virtual driving instructor: Multi-agent sy...   
2    An exploration of knowledge-organizing technol...   
3    A Knowledge Retrieval Framework for Household ...   
5    Citation Recommendation for Research Papers vi...   
6    Learning Visual Models Using a Knowledge Graph...   
..                                                 ...   
563  Augmenting Visual Information in Knowledge Gra...   
565  Multi-Mode Clustering for Graph-Based Lifelog ...   
567  Organizations' terpersonal tivity owledge pres...   
568  Towards Explainable Automatic Knowledge Graph ...   
570  Neurosymbolic Narrative Generation for Cultura...   

                                              Abstract  \
0    This work introduces the design, development, ...   
2    Chronic low back pain (LBP) is influenced by a...   
3    In the field of domestic cognitive robotics, i...   
5    Citation recommendation for resear

In [None]:
# List the full title of every paper that survived filtering, one per line.
for title in filtered_df['Title']:
    print(title)

The virtual driving instructor: Multi-agent system collaborating via knowledge graph for scalable driver education
An exploration of knowledge-organizing technologies to advance transdisciplinary back pain research
A Knowledge Retrieval Framework for Household Objects and Actions with External Knowledge
Citation Recommendation for Research Papers via Knowledge Graphs
Learning Visual Models Using a Knowledge Graph as a Trainer
Empowering Well-Being Through Conversational Coaching for Active and Healthy Ageing
IMKG: The Internet Meme Knowledge Graph
Object search by a concept-conditioned object detector
Drug-CoV: a drug-origin knowledge graph discovering drug repurposing targeting COVID-19
BiMuF: a bi-directional recommender system with multi-semantic filter for online recruitment
AI-driven streamlined modeling: experiences and lessons learned from multiple domains
A survey on knowledge-enhanced multimodal learning
Profiling temporal learning interests with time-aware transformers and kn