**Source**: https://www.youtube.com/watch?v=xF2UJTmRU_Y

In [None]:
%%capture
!pip install --upgrade git+https://github.com/UKPLab/sentence-transformers
!pip install keybert ctransformers[cuda]
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install spacy
!pip install yake
!pip install gensim
!pip install pyate
!pip install rake-nltk
!pip install summa
!python -m spacy download en_core_web_sm
!pip install keybert
!pip install huggingface_hu==0.10.1
!pip install bibtexparser
!pip install Levenshtein
!pip install fuzzywuzzy

In [None]:
import csv
import os
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from keybert.llm import TextGeneration
from keybert import KeyLLM, KeyBERT
from sentence_transformers import SentenceTransformer

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook itself: read it from the HF_TOKEN
# environment variable, or ask for it with a hidden prompt when it is not set.
login(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from google.colab import drive

# Attach Google Drive; the BibTeX input and the result CSVs used later live under this mount point.
drive_mount_point = '/mnt/drive'
drive.mount(drive_mount_point)

In [None]:
import bibtexparser
from yake import KeywordExtractor
from rake_nltk import Rake
from sklearn.metrics import precision_recall_fscore_support
from yake import KeywordExtractor
from rake_nltk import Rake
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from pyate import combo_basic, basic, cvalues
from summa import keywords as summa_keywords
import spacy
import pandas as pd
from keybert import KeyBERT
from nltk.stem import PorterStemmer
from Levenshtein import distance
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords

In [None]:
import nltk

# Fetch every NLTK corpus the notebook reads: 'stopwords' for cleaning the
# abstracts and 'wordnet' for find_synonyms, whose WordNet lookup raises
# LookupError on a fresh runtime when the corpus has not been downloaded.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

In [None]:
# Load options for the GGUF Mistral-7B-Instruct weights.
# `gpu_layers` is the number of layers offloaded to the GPU; use 0 when no GPU acceleration is available.
gguf_load_options = dict(
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
    hf=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    **gguf_load_options,
)

# The tokenizer is loaded from the mistralai/Mistral-7B-Instruct-v0.1 repository.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Text-generation pipeline that the keyword extractors wrap.
generation_options = dict(max_new_tokens=50, repetition_penalty=1.1)
generator = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

In [None]:
from functools import lru_cache

_EMBEDDING_MODEL_NAME = 'BAAI/bge-small-en-v1.5'


@lru_cache(maxsize=1)
def _keyword_resources():
    """Build the objects shared by every extraction exactly once.

    Returns:
        tuple: ``(stop_words, embedder, key_llm, key_bert)`` where
        ``stop_words`` is the English stop-word set, ``embedder`` the
        SentenceTransformer, ``key_llm`` a KeyLLM and ``key_bert`` a KeyBERT,
        both wrapping the module-level ``generator`` pipeline.

    The result is cached, so re-run this cell after changing ``generator``.
    """
    # One-shot example that shows the model the expected answer format.
    example_prompt = """
    <s>[INST]
    I have the following document:
    - Localized magnetic hyperthermia using magnetic nanoparticles (MNPs) under the application of small magnetic fields is a promising tool for treating small or deep-seated tumors.

    Please give me the keywords that are present in this document and separate them with commas.
    Make sure you to only return the keywords and say nothing else. For example, don't say:
    "Here are the keywords present in the document"
    [/INST] localized magnetic hyperthermia,magnetic nanoparticles (MNPs),magnetic fields</s>"""

    # Actual request; KeyBERT substitutes the abstract for [DOCUMENT].
    keyword_prompt = """
    [INST]

    I have the following document:
    - [DOCUMENT]

    Please give me the keywords that are present in this document and separate them with commas.
    Make sure you to only return the keywords and say nothing else. For example, don't say:
    "Here are the keywords present in the document"
    [/INST]
    """

    llm = TextGeneration(generator, prompt=example_prompt + keyword_prompt)
    embedder = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    stop_words = frozenset(stopwords.words('english'))
    # KeyBERT receives the already-loaded embedder instead of the model name,
    # so the embedding model is not loaded a second time.
    return stop_words, embedder, KeyLLM(llm), KeyBERT(llm=llm, model=embedder)


@lru_cache(maxsize=1024)
def _extract_keywords_cached(abstract):
    """Run the three Mistral-based extractions for one abstract (memoised)."""
    stop_words, embedder, key_llm, key_bert = _keyword_resources()

    # Drop English stop words before prompting the model.
    filtered_abstract = ' '.join(
        word for word in abstract.split() if word.lower() not in stop_words
    )

    # Mistral7B: the LLM alone.
    llm_keywords = key_llm.extract_keywords([filtered_abstract])[0]

    # Mistral7B_embeddings: the LLM plus pre-computed document embeddings.
    embeddings = embedder.encode([filtered_abstract], convert_to_tensor=True)
    embedding_keywords = key_llm.extract_keywords(
        [filtered_abstract], embeddings=embeddings, threshold=.5
    )[0]

    # Mistral7B_KeyBERT: KeyBERT with the LLM attached.
    keybert_keywords = key_bert.extract_keywords([filtered_abstract], threshold=.5)[0]

    return {
        "Mistral7B": llm_keywords,
        "Mistral7B_embeddings": embedding_keywords,
        "Mistral7B_KeyBERT": keybert_keywords,
    }


def extract_keywords_from_abstract(abstract):
    """Extract keywords from an abstract with three Mistral-7B based methods.

    Args:
        abstract (str): Text of the abstract. English stop words are removed
            before the text is handed to the extractors.

    Returns:
        dict: Keywords keyed by method name: ``"Mistral7B"``,
        ``"Mistral7B_embeddings"`` and ``"Mistral7B_KeyBERT"``.

    Notes:
        The models are built once and the result for each distinct abstract is
        memoised, so asking for the same abstract again (as the per-method
        entries of ``extraction_functions`` do) does not re-run the LLM.
        NOTE(review): memoisation assumes the generator decodes
        deterministically — confirm; with sampling enabled the first result
        is the one that gets reused.
    """
    cached = _extract_keywords_cached(abstract)
    # Hand out copies so callers cannot mutate the cached lists.
    return {method: list(found) for method, found in cached.items()}


In [None]:
# Smoke test: run the extractors on one sentence and show each method's keywords
abstract = "Functionalization facilitates targeted delivery of these nanoparticles to various cell types, bioimaging, gene delivery, drug delivery and other therapeutic and diagnostic applications."
keywords = extract_keywords_from_abstract(abstract)
for method, extracted_keywords in keywords.items():
    print(f"{method}: ", extracted_keywords)

In [None]:
# Stem each whitespace-separated token of a keyword
def tokenize_and_stem(text):
    """Return `text` with every whitespace-delimited token Porter-stemmed.

    The stemmed tokens are re-joined with single spaces. Input that is not a
    string is not stemmed; its `str()` form is returned instead.
    """
    if not isinstance(text, str):
        return str(text)
    porter = PorterStemmer()
    return ' '.join(map(porter.stem, text.split()))

# Function to calculate Levenshtein distance similarity
def levenshtein_similarity(text1, text2):
    """Return the edit-distance similarity of two strings.

    The Levenshtein distance is divided by the length of the longer string
    and subtracted from 1, so identical strings score 1.0.

    Args:
        text1 (str): First string.
        text2 (str): Second string.

    Returns:
        float: ``1 - distance / max(len(text1), len(text2))``; 1.0 when both
        strings are empty.
    """
    longest = max(len(text1), len(text2))
    if longest == 0:
        # Two empty strings are identical; without this guard the
        # normalisation below divides by zero.
        return 1.0
    return 1 - (distance(text1, text2) / longest)

# Collect the WordNet synonyms of a word
def find_synonyms(word):
    """Return the set of lemma names found across all WordNet synsets of `word`."""
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# Function to calculate cosine similarity using TF-IDF
def cosine_similarity_score(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Both texts are tokenised on whitespace and the vectoriser is fitted on
    just these two texts.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: Cosine similarity of the two TF-IDF rows; 0.0 when either text
        contains no tokens.
    """
    if not text1.split() or not text2.split():
        # A text without tokens shares nothing with the other one, and when
        # both are token-less TfidfVectorizer raises on the empty vocabulary.
        return 0.0
    # token_pattern=None silences the warning about the unused default
    # pattern that scikit-learn emits when a custom tokenizer is supplied.
    vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix)[0][1]

# Token-set fuzzy match between two strings
def fuzzy_matching_score(text1, text2):
    """Return the `fuzz.token_set_ratio` score of `text1` against `text2`."""
    score = fuzz.token_set_ratio(text1, text2)
    return score

# Function to evaluate keywords
def evaluate_keywords(ground_truth_keywords, extracted_keywords,
                      similarity_threshold=0.8, fuzzy_threshold=80):
    """Score extracted keywords against ground-truth keywords.

    Every keyword is stemmed with ``tokenize_and_stem``; keywords whose stem
    is blank are ignored. An extracted keyword matches a ground-truth keyword
    when the Levenshtein similarity or the TF-IDF cosine similarity of their
    stems exceeds ``similarity_threshold``, or their fuzzy score exceeds
    ``fuzzy_threshold``.

    Matching is one-to-one: each ground-truth keyword can be claimed by at
    most one extracted keyword, taken in order. Repeated or near-duplicate
    extractions therefore count as false positives instead of inflating the
    true-positive count past the number of ground-truth keywords, which would
    make the false-negative count negative and recall larger than 1.

    Args:
        ground_truth_keywords (list): Reference keywords.
        extracted_keywords (list): Keywords produced by an extractor.
        similarity_threshold (float): Cut-off for the Levenshtein and cosine
            similarities. Defaults to 0.8.
        fuzzy_threshold (float): Cut-off for the fuzzy score. Defaults to 80.

    Returns:
        tuple: ``(precision, recall, f1_score)``; each is 0.0 when its
        denominator is zero.
    """
    def _non_blank_stems(keywords):
        stems = (tokenize_and_stem(keyword) for keyword in keywords)
        return [stem for stem in stems if stem.strip()]

    def _is_match(extracted_stem, ground_truth_stem):
        # All three measures compare stem with stem; `or` stops at the first
        # measure that clears its threshold.
        return (
            levenshtein_similarity(extracted_stem, ground_truth_stem) > similarity_threshold
            or cosine_similarity_score(extracted_stem, ground_truth_stem) > similarity_threshold
            or fuzzy_matching_score(extracted_stem, ground_truth_stem) > fuzzy_threshold
        )

    ground_truth_stems = _non_blank_stems(ground_truth_keywords)
    extracted_stems = _non_blank_stems(extracted_keywords)

    # Ground-truth stems that no extracted keyword has claimed yet.
    unclaimed = list(ground_truth_stems)
    tp = 0
    for extracted_stem in extracted_stems:
        for position, ground_truth_stem in enumerate(unclaimed):
            if _is_match(extracted_stem, ground_truth_stem):
                del unclaimed[position]
                tp += 1
                break

    fp = len(extracted_stems) - tp
    fn = len(unclaimed)

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return precision, recall, f1_score

In [None]:
def _write_csv_rows(path, header, rows):
    """Write `header` followed by `rows` to `path` as a UTF-8 CSV file."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


def evaluate_keywords_from_bib(bib_file, extraction_functions, output_folder):
    """Evaluate keyword extractors on the entries of a BibTeX file.

    Every entry that has both an ``abstract`` and a ``keywords`` field is
    used: the lower-cased abstract goes to each extraction function and the
    result is scored with ``evaluate_keywords`` against the entry's
    comma-separated keywords (surrounding whitespace stripped, blank items
    dropped). The per-method averages are printed and three CSV files are
    written to ``output_folder``: ``extracted_keywords.csv``,
    ``evaluation_results.csv`` and ``evaluation_results_avg.csv``.

    Args:
        bib_file (str): Path of the UTF-8 encoded BibTeX file.
        extraction_functions (dict): Maps a method name to a callable that
            takes an abstract and returns that method's keywords.
        output_folder (str): Folder for the CSV files; created when missing.

    Returns:
        None
    """
    # Load the BibTeX file
    with open(bib_file, 'r', encoding='utf-8') as bibfile:
        bib_database = bibtexparser.load(bibfile)

    # Running [precision, recall, F1] sums for every method.
    score_sums = {method: [0.0, 0.0, 0.0] for method in extraction_functions}
    total_abstracts = 0

    # Rows for the per-abstract CSV files.
    all_extracted_keywords = []
    all_evaluation_results = []

    for entry in bib_database.entries:
        # Only entries with both an abstract and keywords can be scored.
        if 'abstract' not in entry or 'keywords' not in entry:
            continue

        abstract = entry['abstract'].lower()
        # A stray or trailing comma would otherwise produce an empty
        # "keyword" that counts as missed ground truth.
        ground_truth_keywords = [
            keyword.strip()
            for keyword in entry['keywords'].split(',')
            if keyword.strip()
        ]
        total_abstracts += 1

        for method, extraction_function in extraction_functions.items():
            extracted_keywords = extraction_function(abstract)
            scores = evaluate_keywords(ground_truth_keywords, extracted_keywords)

            for index, score in enumerate(scores):
                score_sums[method][index] += score

            all_extracted_keywords.append((method, ground_truth_keywords, extracted_keywords))
            all_evaluation_results.append((method, *scores))

    if total_abstracts == 0:
        # Nothing to average; report zeros instead of dividing by zero.
        print(f"No entries with both 'abstract' and 'keywords' found in {bib_file}; averages are reported as 0.0.")

    # Print average scores (the column header is printed once, above the rows)
    all_evaluation_results_avg = []
    print("Average Scores over all Abstracts:")
    print("Method      , Average Precision:                    , Average Recall:                    , Average F1-score:                    ")
    for method, sums in score_sums.items():
        averages = [total / total_abstracts if total_abstracts else 0.0 for total in sums]
        print(f"{method},{averages[0]},{averages[1]},{averages[2]}")
        all_evaluation_results_avg.append((method, *averages))

    # Write ground truth keywords, extracted keywords, and evaluation results to CSV files
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    score_header = ['Method', 'Precision', 'Recall', 'F1-score']
    _write_csv_rows(
        os.path.join(output_folder, 'extracted_keywords.csv'),
        ['Method', 'Ground Truth Keywords', 'Extracted Keywords'],
        all_extracted_keywords,
    )
    _write_csv_rows(
        os.path.join(output_folder, 'evaluation_results.csv'),
        score_header,
        all_evaluation_results,
    )
    _write_csv_rows(
        os.path.join(output_folder, 'evaluation_results_avg.csv'),
        score_header,
        all_evaluation_results_avg,
    )

# Map each method name to a one-argument extractor that returns only that method's keywords
def _method_extractor(method):
    """Return a function that extracts an abstract's keywords for `method`."""
    def extract(abstract):
        return extract_keywords_from_abstract(abstract)[method]
    return extract


extraction_functions = {
    method: _method_extractor(method)
    for method in ("Mistral7B", "Mistral7B_embeddings", "Mistral7B_KeyBERT")
}

In [None]:
# Run the evaluation on the BibTeX export stored on the mounted Drive

# Folder that receives the result CSV files
output_folder = "/mnt/drive/MyDrive/colab_data/"

# BibTeX file whose entries provide the abstracts and ground-truth keywords
bib_file = "/mnt/drive/MyDrive/colab_data/nanomaterials-v01-i01_20240418.bib"

evaluate_keywords_from_bib(
    bib_file=bib_file,
    extraction_functions=extraction_functions,
    output_folder=output_folder,
)