**This notebook requires two input files to be uploaded: "0entities_list_update.xlsx" and "0relationships_list_update.xlsx". All other files are intermediate outputs produced by the notebook itself and do not need to be uploaded. This is currently the most comprehensive version.**

# Requirements

In [None]:
!pip install rdflib
!python -m spacy download en_core_web_lg

Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0
2024-01-23 18:42:37.124254: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-23 18:42:38.096009: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-23 18:42:38.097207: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the followi

In [None]:
import pandas as pd
import spacy
import re
from spacy.language import Language
from spacy.tokens import Span
from rdflib import URIRef

# Named Entity Recognition and Relationship Identification

In [None]:
# Natural-language questions to be processed. Each string is run through
# extract_entities_and_verbs() further below; text inside double quotes is
# picked up as an entity by the "quote_extractor" pipeline component.
texts = [
    'Who is working in the Computational Materials Science field?',
    'What are the research projects associated to EMMO?',
    'Who are the contributors of the data "datasets"?',
    'Who is working with Researcher "Ebrahim Norouzi" in the same group?',
    'Who is the email address of "ParaView"?',
    'What are the affiliations of Volker Hofmann?',
    'What is "Molecular Dynamics" Software? List the programming language, documentation page, repository, and license information.',
    'What are pre- and post-processing tools for MD simulations?',
    'What are some workflow environments for computational materials science?',
    'How should I cite pyiron?',
    'Where can I find a list of interatomic potentials?',
    'What are python libraries used for calculating local atomic structural environment?',
    'What are the electronic lab notebooks available?',
    'What are the software for Molecular Dynamics (MD)?',
    'What are the ontologies in nanomaterials domain?',
    'What is DAMASK?',
    'What are the data portals for materials science ontologies?',
    'What are the instruments for APT?',
    'In which institution can I find tomography equipment?',
    'What are the educational resources for Ontology?',
    'What is the API of Materials Project?',
    'Which simulation software have a python API?',
    'What is the documentation of the "MatDB Online"?',
    'What are the types of software licenses?',
    'What are the software used to produce the data in the Materials Cloud repository?',
    'What are datasets produced by the BAM organization?',
    'What are some available datasets of mechanical properties of steels?',
    'What are datasets related to "Transmission electron microscopy"?',
    'What is the license of the dataset "Elastic Constant Demo Data"?',
    'What is the repository for "BAM reference data"?',
    'What are the different data formats in the "BAM reference data"?',
    'What is the software version of "pacemaker"?',
    'What is the field of research "BAM reference data"?',
    'What is the description of the "BAM reference data"?',
    'What are the datasets produced in 2022?',
    'Who are the creators of the "BAM reference data"?',
    'What are the datasets published by "BAM"?'
]

In [None]:
# Non-greedy regex whose single group captures the text between two double quotes
pattern = r'"(.*?)"'


# Registered under the name "quote_extractor" so it can be added via nlp.add_pipe
@spacy.Language.component("quote_extractor")
def custom_component(doc):
    """Mark every double-quoted stretch of ``doc.text`` as a QUOTED_TEXT entity.

    A quoted stretch is skipped when ``doc.char_span`` yields ``None`` for it
    or when it overlaps an entity already present in ``doc.ents``. The
    surviving spans are appended to the existing entities and the same
    ``doc`` is returned.
    """
    quoted_spans = []
    for found in re.finditer(pattern, doc.text):
        # Shrink the match by one character on each side to drop the quotes.
        candidate = doc.char_span(found.start() + 1, found.end() - 1, label="QUOTED_TEXT")
        if candidate is None:
            continue

        # Token-index interval test against every entity already on the doc.
        clashes = False
        for existing in doc.ents:
            if candidate.start < existing.end and candidate.end > existing.start:
                clashes = True
                break
        if not clashes:
            quoted_spans.append(candidate)

    doc.ents = list(doc.ents) + quoted_spans
    return doc

# Load the small English spaCy pipeline. The functions below rely on it for
# POS tags, lemmas and named entities of each question.
nlp = spacy.load("en_core_web_sm")  # small model is enough

# Insert the quote extractor ahead of the "ner" component, so quoted spans
# are already set on the doc when the statistical NER runs.
nlp.add_pipe("quote_extractor", before="ner")  # before is logical

# Fallback entity finder: runs of adjacent nouns, optionally led by an adjective
def find_adjacent_noun_phrases(doc):
    """Extract phrases made of adjacent nouns, with a directly preceding adjective.

    Consecutive tokens whose ``pos_`` is NOUN or PROPN are joined with single
    spaces into one phrase. If the token immediately before a noun is an ADJ,
    that adjective's text is prepended to the noun. Any other token ends the
    phrase being built.

    Args:
        doc: an indexable sequence of tokens exposing ``pos_`` and ``text``
            (a spaCy ``Doc`` in this notebook).

    Returns:
        list[str]: the phrases in document order; empty if there are no nouns.
    """
    noun_tags = ('NOUN', 'PROPN')
    noun_phrases = []
    current_phrase = []

    for i, token in enumerate(doc):
        if token.pos_ in noun_tags:
            # A non-noun token always flushes current_phrase (see the elif
            # below), so a non-empty current_phrase here means the previous
            # token was a noun and no separate flush check is needed.
            if i > 0 and doc[i - 1].pos_ == 'ADJ':
                current_phrase.append(doc[i - 1].text + ' ' + token.text)
            else:
                current_phrase.append(token.text)
        elif current_phrase:
            noun_phrases.append(' '.join(current_phrase))
            current_phrase = []

    if current_phrase:  # the document ended while a phrase was still open
        noun_phrases.append(' '.join(current_phrase))

    return noun_phrases


# Entry point per question: entities first, then predicate words
def extract_entities_and_verbs(text):
    """Return ``(named_entities, predicate_verbs)`` for one question string.

    Entities come from the pipeline's ``doc.ents``. When there are none, the
    phrase returned by ``find_prep_to_question_phrase`` is used; when that is
    empty too, the phrases from ``find_adjacent_noun_phrases`` are used.
    Predicate words are whatever ``process_text`` returns for the same text.
    """
    parsed = nlp(text)
    entities = [span.text for span in parsed.ents]

    if not entities:
        # First fallback: text following the first preposition.
        prep_phrase = find_prep_to_question_phrase(parsed)
        if prep_phrase:
            entities = [prep_phrase]
        else:
            # Second fallback: runs of adjacent nouns.
            noun_runs = find_adjacent_noun_phrases(parsed)
            if noun_runs:
                entities = list(noun_runs)

    verbs = process_text(text)
    return entities, verbs

# Function to process each text
def process_text(text):
    """Derive the predicate (relationship) words of a question.

    The rules are applied in this order:

    1. For the first form of "be" that is followed by at least one token and
       then a preposition (ADP), take the tokens in between.
    2. For every token "list" (case-insensitive), also take the tokens between
       it and the next "." token.
    3. Strip the word "the" from the collected entries and discard entries
       that end up empty.
    4. If nothing was collected and the text contains "is"/"are", use "type".
    5. If still nothing, use the lemmas of all tokens whose tag contains "VB",
       except "have", "be" and "find".
    6. If still nothing, use "description".

    Collected tokens are normalised: verbs and plural nouns (tag NNS) are
    replaced by their lemma, all other tokens keep their text.

    Args:
        text: the question string.

    Returns:
        list[str]: the predicate words, never empty.
    """
    doc = nlp(text)

    def normalize(word):
        # Lemma for verbs and plural nouns, surface text for everything else.
        if word.pos_ == "VERB" or word.tag_ == 'NNS':
            return word.lemma_
        return word.text

    predicate_verbs = []

    # Rule 1: tokens between a form of 'be' and the next preposition.
    for token in doc:
        if token.lemma_ != "be":
            continue
        phrase = []
        preposition_found = False
        for next_token in doc[token.i + 1:]:
            if next_token.pos_ == "ADP":  # Stop at a preposition
                preposition_found = True
                break
            phrase.append(next_token)
        # Only accept a non-empty phrase that really ended at a preposition;
        # otherwise try the next 'be' token, if any.
        if phrase and preposition_found:
            predicate_verbs.extend(normalize(word) for word in phrase)
            break

    # Rule 2: tokens between "list" and the first following full stop.
    for token in doc:
        if token.text.lower() != "list":
            continue
        for next_token in doc[token.i + 1:]:
            if next_token.text == ".":
                predicate_verbs.extend(normalize(word) for word in doc[token.i + 1:next_token.i])
                break

    # Rule 3: remove 'the'. Entries that consisted only of 'the' become empty
    # and are dropped, so they neither leave stray spaces when the list is
    # joined nor keep the fallbacks below from firing.
    stripped = (
        ' '.join(word for word in entry.split() if word.lower() != "the")
        for entry in predicate_verbs
    )
    predicate_verbs = [entry for entry in stripped if entry]

    # Rule 4: a bare "is"/"are" question asks for the type.
    if not predicate_verbs:
        for token in doc:
            if token.lemma_ == "be" and token.text.lower() in ("is", "are"):
                predicate_verbs.append("type")
                break

    # Rule 5: otherwise fall back to the remaining verbs of the question.
    if not predicate_verbs:
        for token in doc:
            if "VB" in token.tag_ and token.lemma_ not in ["have", "be", "find"]:
                predicate_verbs.append(token.lemma_)

    # Rule 6: last resort.
    if not predicate_verbs:
        predicate_verbs.append("description")

    return predicate_verbs

# Fallback entity finder: the text that follows the first preposition
def find_prep_to_question_phrase(doc):
    """Return the text between the first preposition and the question mark.

    Only the first token whose ``pos_`` is ADP is considered. The texts of
    the tokens after it are collected until a "?" token or the end of the
    document and joined with single spaces. If a VERB is met before the "?",
    or if there is no preposition at all, an empty string is returned.
    """
    first_prep = next((tok for tok in doc if tok.pos_ == "ADP"), None)
    if first_prep is None:
        return ''

    collected = []
    for follower in doc[first_prep.i + 1:]:
        if follower.text == "?":
            break
        if follower.pos_ == "VERB":
            # A verb in this stretch disqualifies the whole phrase.
            return ''
        collected.append(follower.text)

    return ' '.join(collected)



# One single-row DataFrame is collected per question and concatenated below
dfs = []

# Run entity / predicate extraction on every question in `texts`
for text in texts:
    named_entities, predicate_verbs = extract_entities_and_verbs(text)
    # Defensive flatten: tolerate nested lists among the entities. Entities
    # are stored comma-separated, predicate words space-separated.
    flattened_entities = [item for sublist in named_entities for item in (sublist if isinstance(sublist, list) else [sublist])]
    df_row = pd.DataFrame({
        "Text": [text],
        "Named Entities": [','.join(flattened_entities)],
        "Predicate Verbs": [' '.join(predicate_verbs)]
    })
    dfs.append(df_row)

# Build the result table with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Save the DataFrame to an Excel file (intermediate output of this step)
df.to_excel("1QustionEntityandRelationship.xlsx", index=False)

# Notebook display of the result
df

Unnamed: 0,Text,Named Entities,Predicate Verbs
0,Who is working in the Computational Materials ...,the Computational Materials Science,work
1,What are the research projects associated to E...,EMMO,research project associate
2,"Who are the contributors of the data ""datasets""?",datasets,contributor
3,"Who is working with Researcher ""Ebrahim Norouz...",Ebrahim Norouzi,work
4,"Who is the email address of ""ParaView""?",ParaView,email address
5,What are the affiliations of Volker Hofmann?,Volker Hofmann,affiliation
6,"What is ""Molecular Dynamics"" Software? List th...",Molecular Dynamics,"programming language , documentation page , r..."
7,What are pre- and post-processing tools for MD...,MD,pre- and post - processing tool
8,What are some workflow environments for comput...,computational materials science,some workflow environment
9,How should I cite pyiron?,pyiron,cite


# Find the 5 or 10 Most Similar Entities

In [None]:
def find_most_similar_entities(df, entity_column, model_name="en_core_web_lg",
                               threshold=0.6, top_n=5):
    """Attach the catalogue entities most similar to each question entity.

    The entity catalogue is read from '0entities_list_update.xlsx'; its
    columns 'entity_readable' and 'entity_uri' are used. For matching, a
    catalogue label is reduced to the text inside its first pair of
    parentheses when it has one (e.g. an abbreviation) and lower-cased.
    Similarity is spaCy's ``Doc.similarity`` between the lower-cased question
    entity and that reduced label.

    Args:
        df: DataFrame holding the question entities. It is modified in place
            (two columns are added) and also returned.
        entity_column: name of the column in ``df`` with the entity strings;
            a cell may hold several entities separated by commas.
        model_name: spaCy pipeline to load for the similarity computation.
        threshold: only matches with a similarity strictly above this value
            are kept.
        top_n: maximum number of matches kept per question entity.

    Returns:
        ``df`` with the columns 'Similar Entities' and 'Entity URIs'. Each
        cell is a list, or None when nothing matched or the cell was not a
        string.
    """
    # Load the spaCy English model
    nlp = spacy.load(model_name)
    # Read data from an Excel file
    df_entity = pd.read_excel('0entities_list_update.xlsx')

    def extract_abbreviation(text):
        # Text inside the first "(...)" if present, otherwise the value itself.
        if isinstance(text, str):
            match = re.search(r'\(([^)]+)\)', text)
            return match.group(1) if match else text
        return text

    df_entity['entity_abb'] = df_entity['entity_readable'].apply(extract_abbreviation)
    df_entity['entity_lowercase'] = df_entity['entity_abb'].str.lower()

    # Parse every catalogue label once per call. The parsed docs are reused
    # for all question entities instead of being re-parsed for each of them.
    catalogue_docs = [
        (index, nlp(label))
        for index, label in df_entity['entity_lowercase'].items()
        if not pd.isna(label)
    ]

    def process_entity(question_entity):
        # Best matches for a single entity string, or (None, None).
        question_doc = nlp(question_entity.lower())
        scored = [
            (index, question_doc.similarity(entry_doc))
            for index, entry_doc in catalogue_docs
        ]
        above = [pair for pair in scored if pair[1] > threshold]
        best = sorted(above, key=lambda pair: pair[1], reverse=True)[:top_n]
        if not best:
            return None, None

        similar_entities = [df_entity.at[index, 'entity_readable'] for index, _ in best]
        entity_uris = [df_entity.at[index, 'entity_uri'] for index, _ in best]
        return similar_entities, entity_uris

    def process_entity_list(entity_list):
        # Concatenated matches for a comma-separated list of entities.
        similar_entities_list = []
        entity_uris_list = []

        for entity in entity_list.split(','):
            entity = entity.strip()
            if entity:
                similar_entities, entity_uris = process_entity(entity)
                if similar_entities and entity_uris:
                    similar_entities_list.extend(similar_entities)
                    entity_uris_list.extend(entity_uris)

        if similar_entities_list:
            return similar_entities_list, entity_uris_list
        return None, None

    def match_cell(value):
        # Dispatch on the cell content: non-string, list of entities, single entity.
        if not isinstance(value, str):
            return None, None
        if ',' in value:
            return process_entity_list(value)
        return process_entity(value)

    # Apply to each entity in the provided column of df
    results = df[entity_column].apply(match_cell)
    df['Similar Entities'] = results.apply(lambda x: x[0] if x else None)
    df['Entity URIs'] = results.apply(lambda x: x[1] if x else None)
    return df

# Optional: reload the previous step's output instead of reusing the in-memory df
# df = pd.read_excel("1QustionEntityandRelationship.xlsx")
# df must contain the column 'Named Entities' produced by the previous step
df = find_most_similar_entities(df, 'Named Entities')

# Save the DataFrame to an Excel file (intermediate output of this step)
df.to_excel("2SimilarEntities5.xlsx", index=False)

# Notebook display of the result
df

  similarity = question_word.similarity(kd_word)


Unnamed: 0,Text,Named Entities,Predicate Verbs,Similar Entities,Entity URIs
0,Who is working in the Computational Materials ...,the Computational Materials Science,work,"[Computational materials science, Computationa...","[http://demo.fiz-karlsruhe.de/matwerk/E49517, ..."
1,What are the research projects associated to E...,EMMO,research project associate,[Elemental Multiperspective Material Ontology ...,[http://demo.fiz-karlsruhe.de/matwerk/E1126751]
2,"Who are the contributors of the data ""datasets""?",datasets,contributor,"[datasets, dataset, data analysis, Materials D...",[http://demo.fiz-karlsruhe.de/matwerk/E1172216...
3,"Who is working with Researcher ""Ebrahim Norouz...",Ebrahim Norouzi,work,"[Ebrahim Norouzi, Ebrahim Norouzi]","[http://demo.fiz-karlsruhe.de/matwerk/E15879, ..."
4,"Who is the email address of ""ParaView""?",ParaView,email address,"[paraview, ParaView, ParaView]",[http://demo.fiz-karlsruhe.de/matwerk/E1231097...
5,What are the affiliations of Volker Hofmann?,Volker Hofmann,affiliation,[Volker Hofmann],[http://www.wikidata.org/entity/Q84561074]
6,"What is ""Molecular Dynamics"" Software? List th...",Molecular Dynamics,"programming language , documentation page , r...","[molecular dynamics, Carr Parrinello Molecular...","[http://demo.fiz-karlsruhe.de/matwerk/E616496,..."
7,What are pre- and post-processing tools for MD...,MD,pre- and post - processing tool,"[Molecular Dynamics (MD), Dr Sarath Menon, Tho...","[http://demo.fiz-karlsruhe.de/matwerk/E61379, ..."
8,What are some workflow environments for comput...,computational materials science,some workflow environment,"[Computational materials science, Computationa...","[http://demo.fiz-karlsruhe.de/matwerk/E49517, ..."
9,How should I cite pyiron?,pyiron,cite,[Pyiron],[http://demo.fiz-karlsruhe.de/matwerk/E457491]


If no similar entities are found with the large model, use the small model instead of the large model.


In [None]:
def find_most_similar_entities(df, entity_column, model_name="en_core_web_sm",
                               threshold=0.6, top_n=5):
    """Attach the catalogue entities most similar to each question entity.

    Small-model variant, meant for the rows where the large model found
    nothing. The entity catalogue is read from '0entities_list_update.xlsx';
    its columns 'entity_readable' and 'entity_uri' are used. For matching, a
    catalogue label is reduced to the text inside its first pair of
    parentheses when it has one and lower-cased. Similarity is spaCy's
    ``Doc.similarity`` between the lower-cased question entity and that
    reduced label. The ranked ``(index, score)`` pairs of every question
    entity are printed.

    Args:
        df: DataFrame holding the question entities. Two columns are assigned
            on it and it is returned; pass an independent frame (not a slice
            of another one) to avoid pandas' SettingWithCopyWarning.
        entity_column: name of the column in ``df`` with the entity strings;
            a cell may hold several entities separated by commas.
        model_name: spaCy pipeline to load for the similarity computation.
        threshold: only matches with a similarity strictly above this value
            are kept.
        top_n: maximum number of matches kept per question entity.

    Returns:
        ``df`` with the columns 'Similar Entities' and 'Entity URIs'. Each
        cell is a list, or None when nothing matched or the cell was not a
        string.
    """
    # Load the spaCy English model
    nlp = spacy.load(model_name)
    # Read data from an Excel file
    df_entity = pd.read_excel('0entities_list_update.xlsx')

    def extract_abbreviation(text):
        # Text inside the first "(...)" if present, otherwise the value itself.
        if isinstance(text, str):
            match = re.search(r'\(([^)]+)\)', text)
            return match.group(1) if match else text
        return text

    df_entity['entity_abb'] = df_entity['entity_readable'].apply(extract_abbreviation)
    df_entity['entity_lowercase'] = df_entity['entity_abb'].str.lower()

    # Parse every catalogue label once per call. The parsed docs are reused
    # for all question entities instead of being re-parsed for each of them.
    catalogue_docs = [
        (index, nlp(label))
        for index, label in df_entity['entity_lowercase'].items()
        if not pd.isna(label)
    ]

    def process_entity(question_entity):
        # Best matches for a single entity string, or (None, None).
        question_doc = nlp(question_entity.lower())
        scored = [
            (index, question_doc.similarity(entry_doc))
            for index, entry_doc in catalogue_docs
        ]
        above = [pair for pair in scored if pair[1] > threshold]
        top_5_similarities = sorted(above, key=lambda pair: pair[1], reverse=True)[:top_n]
        # Progress/debug output: the ranked (catalogue index, score) pairs.
        print(top_5_similarities)

        if not top_5_similarities:
            return None, None

        similar_entities = [df_entity.at[index, 'entity_readable'] for index, _ in top_5_similarities]
        entity_uris = [df_entity.at[index, 'entity_uri'] for index, _ in top_5_similarities]
        return similar_entities, entity_uris

    def process_entity_list(entity_list):
        # Concatenated matches for a comma-separated list of entities.
        similar_entities_list = []
        entity_uris_list = []

        for entity in entity_list.split(','):
            entity = entity.strip()
            if entity:
                similar_entities, entity_uris = process_entity(entity)
                if similar_entities and entity_uris:
                    similar_entities_list.extend(similar_entities)
                    entity_uris_list.extend(entity_uris)

        if similar_entities_list:
            return similar_entities_list, entity_uris_list
        return None, None

    def match_cell(value):
        # Dispatch on the cell content: non-string, list of entities, single entity.
        if not isinstance(value, str):
            return None, None
        if ',' in value:
            return process_entity_list(value)
        return process_entity(value)

    # Apply to each entity in the provided column of df
    results = df[entity_column].apply(match_cell)
    df['Similar Entities'] = results.apply(lambda x: x[0] if x else None)
    df['Entity URIs'] = results.apply(lambda x: x[1] if x else None)
    return df

# Retry only the rows for which the previous pass found no similar entity.
# The filtered rows are copied explicitly because the function assigns new
# columns on the frame it receives; assigning on a bare slice of df is what
# triggers pandas' SettingWithCopyWarning.
df_null = find_most_similar_entities(df[df['Similar Entities'].isnull()].copy(), 'Named Entities')

# Fill the cells of df that are still null with the values from the retry
df[df.isnull()] = df_null[df.isnull()]

# Save the DataFrame to an Excel file (intermediate output of this step)
df.to_excel("2SimilarEntities5_up.xlsx", index=False)

# Notebook display of the result
df

  similarity = question_word.similarity(kd_word)


[(1152, 0.826565392591856), (724, 0.8210707129094054), (725, 0.8210707129094054), (1062, 0.8004280163819343), (551, 0.7958029372285703)]
[(517, 0.7153859273686815), (18, 0.7139619803978803), (847, 0.6803068597187358), (1831, 0.6719893294018098), (1522, 0.655508816938155)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Similar Entities'] = results.apply(lambda x: x[0] if x else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Entity URIs'] = results.apply(lambda x: x[1] if x else None)


Unnamed: 0,Text,Named Entities,Predicate Verbs,Similar Entities,Entity URIs
0,Who is working in the Computational Materials ...,the Computational Materials Science,work,"[Computational materials science, Computationa...","[http://demo.fiz-karlsruhe.de/matwerk/E49517, ..."
1,What are the research projects associated to E...,EMMO,research project associate,[Elemental Multiperspective Material Ontology ...,[http://demo.fiz-karlsruhe.de/matwerk/E1126751]
2,"Who are the contributors of the data ""datasets""?",datasets,contributor,"[datasets, dataset, data analysis, Materials D...",[http://demo.fiz-karlsruhe.de/matwerk/E1172216...
3,"Who is working with Researcher ""Ebrahim Norouz...",Ebrahim Norouzi,work,"[Ebrahim Norouzi, Ebrahim Norouzi]","[http://demo.fiz-karlsruhe.de/matwerk/E15879, ..."
4,"Who is the email address of ""ParaView""?",ParaView,email address,"[paraview, ParaView, ParaView]",[http://demo.fiz-karlsruhe.de/matwerk/E1231097...
5,What are the affiliations of Volker Hofmann?,Volker Hofmann,affiliation,[Volker Hofmann],[http://www.wikidata.org/entity/Q84561074]
6,"What is ""Molecular Dynamics"" Software? List th...",Molecular Dynamics,"programming language , documentation page , r...","[molecular dynamics, Carr Parrinello Molecular...","[http://demo.fiz-karlsruhe.de/matwerk/E616496,..."
7,What are pre- and post-processing tools for MD...,MD,pre- and post - processing tool,"[Molecular Dynamics (MD), Dr Sarath Menon, Tho...","[http://demo.fiz-karlsruhe.de/matwerk/E61379, ..."
8,What are some workflow environments for comput...,computational materials science,some workflow environment,"[Computational materials science, Computationa...","[http://demo.fiz-karlsruhe.de/matwerk/E49517, ..."
9,How should I cite pyiron?,pyiron,cite,[Pyiron],[http://demo.fiz-karlsruhe.de/matwerk/E457491]


# Find the 10 Most Similar Relationships and Add "description"

In [None]:
import pandas as pd
import spacy
from rdflib import URIRef
import re

def find_most_similar_relationships(df, relationship_column, model_name="en_core_web_lg",
                                    top_n=10, threshold=None):
    """Attach the catalogue relationships most similar to each predicate string.

    The relationship catalogue is read from '0relationships_list_update.xlsx';
    its columns 'relationship_readable', 'relationship_uri' and
    'Predicate with NS' are used. Both the catalogue labels and the question
    predicates are lower-cased and cleaned with the same ``preprocess_text``
    before spaCy's ``Doc.similarity`` is computed. The prefixed names
    'mwo:description' and 'dcterms:description' are always appended to the
    prefixed-name list when not already present.

    Args:
        df: DataFrame holding the predicate strings. It is modified in place
            (three columns are added) and also returned.
        relationship_column: name of the column in ``df`` with the predicates.
        model_name: spaCy pipeline to load for the similarity computation.
        top_n: maximum number of catalogue matches kept per predicate.
        threshold: optional minimum similarity (inclusive). With the default
            None the ``top_n`` best matches are kept regardless of score.

    Returns:
        ``df`` with the columns 'Similar Relationships', 'Relationship URIs'
        and 'relationship_uris_withNS'. Cells are lists, or None when the
        predicate cell was not a string.
    """
    # Load the spaCy English model
    nlp = spacy.load(model_name)

    # Read data from an Excel file
    df_relationship = pd.read_excel('0relationships_list_update.xlsx')

    # Lower-cased copy of the readable label
    df_relationship['relationship_lowercase'] = df_relationship['relationship_readable'].str.lower()

    def preprocess_text(text):
        # Missing labels (NaN) have nothing to clean; pass them through so
        # they can be skipped below instead of failing on .split().
        if not isinstance(text, str):
            return text

        # Remove filler words
        words_to_remove = {'has', 'is', 'of', 'in'}
        tokens = [word for word in text.split() if word not in words_to_remove]

        # Crude singularisation: drop one trailing 's' from the whole phrase.
        # NOTE(review): this also trims non-plurals such as "address"; it is
        # applied to both sides of the comparison, so it is left unchanged.
        processed_text = ' '.join(tokens)
        if processed_text.endswith('s'):
            processed_text = processed_text[:-1]

        return processed_text

    df_relationship['cleaned_relationship'] = df_relationship['relationship_lowercase'].apply(preprocess_text)

    # Parse every catalogue label once per call. The parsed docs are reused
    # for all question predicates instead of being re-parsed for each of them.
    catalogue_docs = [
        (index, nlp(label))
        for index, label in df_relationship['cleaned_relationship'].items()
        if not pd.isna(label)
    ]

    def process_relationship(question_relationship):
        # Best matches for one predicate string, or (None, None, None).
        if not isinstance(question_relationship, str):
            return None, None, None

        question_doc = nlp(preprocess_text(question_relationship.lower()))
        scored = [
            (index, question_doc.similarity(entry_doc))
            for index, entry_doc in catalogue_docs
        ]
        if threshold is not None:
            scored = [pair for pair in scored if pair[1] >= threshold]

        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

        similar_relationships = [df_relationship.at[index, 'relationship_readable'] for index, _ in best]
        relationship_uris = [df_relationship.at[index, 'relationship_uri'] for index, _ in best]
        relationship_uris_withNS = [df_relationship.at[index, 'Predicate with NS'] for index, _ in best]

        # The description predicates are always offered as candidates.
        for fallback in ('mwo:description', 'dcterms:description'):
            if fallback not in relationship_uris_withNS:
                relationship_uris_withNS.append(fallback)

        return similar_relationships, relationship_uris, relationship_uris_withNS

    # Apply to each predicate in the provided column of df
    results = df[relationship_column].apply(process_relationship)
    df['Similar Relationships'] = results.apply(lambda x: x[0] if x else None)
    df['Relationship URIs'] = results.apply(lambda x: x[1] if x else None)
    df['relationship_uris_withNS'] = results.apply(lambda x: x[2] if x else None)

    return df

# Reload the entity-matching result written by the previous step.
# NOTE(review): after the round trip through Excel the list-valued columns
# appear as plain strings in the output below - confirm this is intended.
df = pd.read_excel("2SimilarEntities5_up.xlsx")
# Match the predicate words in the column 'Predicate Verbs' against the relationship catalogue
df = find_most_similar_relationships(df, 'Predicate Verbs')

# Save the DataFrame to an Excel file (intermediate output of this step)
df.to_excel("4beforeSparq5X10.xlsx", index=False)
# Notebook display of the result
df

  similarity = question_word.similarity(word2)


Unnamed: 0,Text,Named Entities,Predicate Verbs,Similar Entities,Entity URIs,Similar Relationships,Relationship URIs,relationship_uris_withNS
0,Who is working in the Computational Materials ...,the Computational Materials Science,work,"['Computational materials science', 'Computati...",['http://demo.fiz-karlsruhe.de/matwerk/E49517'...,"[has work package, has expertise in, has fundi...",[http://purls.helmholtz-metadaten.de/mwo/hasWo...,"[mwo:hasWorkPackage, mwo:hasExpertiseIn, nfdic..."
1,What are the research projects associated to E...,EMMO,research project associate,['Elemental Multiperspective Material Ontology...,['http://demo.fiz-karlsruhe.de/matwerk/E1126751'],"[has related Project, related participant proj...",[http://nfdi.fiz-karlsruhe.de/ontology/related...,"[nfdicore:relatedProject, mwo:relatedParticipa..."
2,"Who are the contributors of the data ""datasets""?",datasets,contributor,"['datasets', 'dataset', 'data analysis', 'Mate...",['http://demo.fiz-karlsruhe.de/matwerk/E117221...,"[has contributor, related participant project ...",[http://purls.helmholtz-metadaten.de/mwo/hasCo...,"[mwo:hasContributor, mwo:relatedParticipantPro..."
3,"Who is working with Researcher ""Ebrahim Norouz...",Ebrahim Norouzi,work,"['Ebrahim Norouzi', 'Ebrahim Norouzi']",['http://demo.fiz-karlsruhe.de/matwerk/E15879'...,"[has work package, has expertise in, has fundi...",[http://purls.helmholtz-metadaten.de/mwo/hasWo...,"[mwo:hasWorkPackage, mwo:hasExpertiseIn, nfdic..."
4,"Who is the email address of ""ParaView""?",ParaView,email address,"['paraview', 'ParaView', 'ParaView']",['http://demo.fiz-karlsruhe.de/matwerk/E123109...,"[has email address , has postal address, has w...",[http://purls.helmholtz-metadaten.de/mwo/email...,"[mwo:emailAddress, mwo:hasPostalAddress, mwo:h..."
5,What are the affiliations of Volker Hofmann?,Volker Hofmann,affiliation,['Volker Hofmann'],['http://www.wikidata.org/entity/Q84561074'],"[has affiliation, has curation status, has par...",[http://purls.helmholtz-metadaten.de/mwo/hasAf...,"[mwo:hasAffiliation, ns2:IAO_0000114, nfdicore..."
6,"What is ""Molecular Dynamics"" Software? List th...",Molecular Dynamics,"programming language , documentation page , r...","['molecular dynamics', 'Carr Parrinello Molecu...",['http://demo.fiz-karlsruhe.de/matwerk/E616496...,"[has documentation, has bibliographic citation...",[http://purls.helmholtz-metadaten.de/mwo/hasDo...,"[mwo:hasDocumentation, dcterms:bibliographicCi..."
7,What are pre- and post-processing tools for MD...,MD,pre- and post - processing tool,"['Molecular Dynamics (MD)', 'Dr Sarath Menon',...",['http://demo.fiz-karlsruhe.de/matwerk/E61379'...,"[required tool, has related resource, related ...",[http://purls.helmholtz-metadaten.de/mwo/requi...,"[mwo:requiredTool, mwo:hasRelatedResource, mwo..."
8,What are some workflow environments for comput...,computational materials science,some workflow environment,"['Computational materials science', 'Computati...",['http://demo.fiz-karlsruhe.de/matwerk/E49517'...,"[has some values from, has work package, has r...","[http://www.w3.org/2002/07/owl#someValuesFrom,...","[owl:someValuesFrom, mwo:hasWorkPackage, mwo:h..."
9,How should I cite pyiron?,pyiron,cite,['Pyiron'],['http://demo.fiz-karlsruhe.de/matwerk/E457491'],"[has annotated source , has bibliographic cita...",[http://www.w3.org/2002/07/owl#annotatedSource...,"[owl:annotatedSource, dcterms:bibliographicCit..."


Output files: "3SimilarRelationships5X10.xlsx" does not include the "description" predicates; "4beforeSparq5X10.xlsx" includes them.