<a href="https://colab.research.google.com/github/GeorgeM2000/CANE/blob/master/code/Automatic_Keyword_Extraction_for_Citation_Graphs_with_KeyBERT_and_KeyLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Libraries & Tools***

In [None]:
!pip install keybert

In [None]:
!pip install bitsandbytes

In [2]:
import nltk
import spacy
import numpy as np
import gc
import os
import openai
import transformers

from keybert import KeyBERT
from keybert import KeyLLM
from keybert.llm import OpenAI
from keybert.llm import TextGeneration
from keybert.llm import LiteLLM
from torch import cuda, bfloat16
#import bitsandbytes

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Add an extra directory to the list of locations NLTK searches for its data.
# NOTE(review): "..." looks like a placeholder/redacted path — set the real directory before running.
nltk.data.path.append("...")

# ***Abstracts Retrieval***

In [None]:
# Input files under the cora/ directory (paths are relative to the working directory).
# data_file is read by extract_abstracts: one abstract per non-blank line.
data_file = 'cora/data.txt'
# categories_file is parsed further down into one integer label per line.
categories_file = 'cora/group.txt'

In [4]:
def extract_abstracts(file_path, encoding='utf-8'):
    """Load abstracts from a text file that stores one abstract per line.

    Blank and whitespace-only lines are skipped, and surrounding whitespace is
    stripped from every remaining line.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A tuple ``(abstracts, num_abstracts)``: the stripped, non-empty lines
        in file order, and how many of them there are.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid for ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Stream the file and strip each line exactly once instead of loading
        # all raw lines and stripping them twice.
        stripped_lines = (line.strip() for line in file)
        abstracts = [line for line in stripped_lines if line]

    return abstracts, len(abstracts)

# Load every abstract from the configured data file
file_path = data_file
abstracts, num_abstracts = extract_abstracts(file_path)

# Report how many non-blank lines (abstracts) were read
print(f'Number of extracted abstracts: {num_abstracts}')

Number of extracted abstracts: 2277


In [None]:
# Preview only the first few abstracts; echoing the whole list floods the notebook output
abstracts[:5]

In [5]:
# Count whitespace-separated tokens across all abstracts
total_words = sum(len(text.split()) for text in abstracts)

print(f'Total words in abstracts: {total_words}')

Total words in abstracts: 205936


Abstracts are classified into 7 classes:
- Case_Based
- Genetic_Algorithms
- Neural_Networks
- Probabilistic_Methods
- Reinforcement_Learning
- Rule_Learning
- Theory

In [None]:
# Parse one label per line of the categories file. A line that, once stripped,
# consists solely of digit characters becomes that integer; any other line
# (blank lines included) is recorded as -1.
with open(categories_file, 'r') as file:
    stripped_lines = (raw_line.strip() for raw_line in file)
    labels = [int(entry) if entry.isdigit() else -1 for entry in stripped_lines]

print(labels)

In [7]:
# Class names keyed by their integer label (0-6), in label order
label_to_class_mapping = dict(enumerate((
    'Case Based',
    'Genetic Algorithms',
    'Neural Networks',
    'Probabilistic Methods',
    'Reinforcement Learning',
    'Rule Learning',
    'Theory',
)))
# Label -1 maps to every class name joined by single spaces
label_to_class_mapping[-1] = ' '.join(label_to_class_mapping.values())

In [8]:
# Look up the class-name string for every label, preserving order
classes = [label_to_class_mapping[label_id] for label_id in labels]

In [None]:
# Preview only the first few class names instead of echoing the whole list
classes[:5]

# ***KeyBERT & KeyLLM***

## Keyword & Keyphrase Methods

In [9]:
def save_keywords_to_files(lists_of_keywords, file_names):
    """Write each collection of keyword lists to its own text file.

    For every ``(keywords_list, file_name)`` pair, one line is written per
    entry of ``keywords_list``: the entry's keywords joined by single spaces.
    Existing files are overwritten.

    Args:
        lists_of_keywords: Iterable of collections; each collection holds one
            iterable of keyword strings per output line.
        file_names: Iterable of output paths, one per collection.

    Raises:
        ValueError: If the number of collections differs from the number of
            file names (``zip`` would otherwise silently drop the surplus).
        OSError: If an output file cannot be opened for writing.
    """
    lists_of_keywords = list(lists_of_keywords)
    file_names = list(file_names)
    # Validate before touching the file system so a mismatch never truncates a file.
    if len(lists_of_keywords) != len(file_names):
        raise ValueError(
            f'Got {len(lists_of_keywords)} keyword lists but {len(file_names)} file names'
        )

    for keywords_list, file_name in zip(lists_of_keywords, file_names):
        # Explicit UTF-8 keeps non-ASCII keywords writable regardless of locale.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.writelines(' '.join(keywords) + '\n' for keywords in keywords_list)

In [10]:
def modify_keyword_list(abstract_keywords):
    """Keep only the first element of every keyword entry.

    Args:
        abstract_keywords: Iterable with one collection per abstract; each
            collection holds indexable entries (e.g. ``(keyword, score)``
            pairs).

    Returns:
        A list of lists, parallel to the input, containing ``entry[0]`` for
        each entry.
    """
    return [[entry[0] for entry in per_abstract] for per_abstract in abstract_keywords]

In [11]:
# Number of keywords / keyphrases requested per abstract (passed as top_n to the KeyBERT calls below)
T = 5

## KeyBERT

In [None]:
# KeyBERT instance backed by the 'all-MiniLM-L6-v2' embedding model
kbert = KeyBERT(model='all-MiniLM-L6-v2')
# Alternative embedding model (currently disabled):
#kbert = KeyBERT(model='all-mpnet-base-v2')

In [13]:
# Pre-compute the abstract embeddings and the candidate-phrase embeddings
# (n-gram range 1-3, English stop words) so later extract_keywords calls can reuse them
abstract_embeddings, word_embeddings = kbert.extract_embeddings(
    abstracts,
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
)

In [None]:
# Show the shapes of the two embedding arrays
print(f'\n{abstract_embeddings.shape} \n{word_embeddings.shape}')

<class 'numpy.ndarray'> 
<class 'numpy.ndarray'>

(2277, 768) 
(14430, 768)


In [None]:
# Persist both embedding arrays as .npy files.
# NOTE(review): these file names say "Unigram" and "all-mpnet-base-v2", but the
# visible cells build the embeddings with keyphrase_ngram_range=(1,3) and the
# 'all-MiniLM-L6-v2' model — confirm the arrays match the names before running.
np.save('KeyBERT_Abstract_Embeddings_Unigram_all-mpnet-base-v2.npy', abstract_embeddings)
np.save('KeyBERT_Abstract_Word_Embeddings_Unigram_all-mpnet-base-v2.npy', word_embeddings)

In [None]:
# Persist both embedding arrays as .npy files named after the trigram / all-MiniLM-L6-v2 configuration
np.save('KeyBERT_Abstract_Embeddings_Trigram_all-MiniLM-L6-v2.npy', abstract_embeddings)
np.save('KeyBERT_Abstract_Word_Embeddings_Trigram_all-MiniLM-L6-v2.npy', word_embeddings)

### KeyBERT for keyword extraction

In [None]:
# Unigram keywords, guided by the class of each abstract.
# - word_embeddings is deliberately NOT passed: the cached array was computed for
#   1-3-gram candidates, so it does not line up with the unigram candidates built
#   here (KeyBERT requires the same keyphrase_ngram_range in extract_embeddings
#   and extract_keywords and raises otherwise). The document embeddings do not
#   depend on the n-gram range and are reused.
# - seed_keywords is given as one list per abstract; a flat list of strings is
#   treated by KeyBERT as a single seed set shared by all documents.
keybert_kws = kbert.extract_keywords(
    abstracts,
    keyphrase_ngram_range=(1, 1),
    stop_words='english',
    use_maxsum=True,
    nr_candidates=20,
    top_n=T,
    doc_embeddings=abstract_embeddings,
    seed_keywords=[[class_name] for class_name in classes],
)

In [None]:
# Number of result entries returned by extract_keywords (compare with the number of abstracts)
len(keybert_kws)

2277

In [None]:
# Keep only the first element of each result entry
# (presumably the keyword text of a (keyword, score) pair — verify against KeyBERT's output format)
modified_keybert_kws = modify_keyword_list(keybert_kws)

In [None]:
# Preview the keywords of the first few abstracts instead of echoing the whole list
modified_keybert_kws[:5]

### KeyBERT for keyphrase extraction

In [14]:
# Keyphrases (1- to 3-grams), guided by the class of each abstract.
# The cached embeddings were computed with the same n-gram range and stop words,
# so both arrays can be reused here.
# seed_keywords is given as one list per abstract; a flat list of strings is
# treated by KeyBERT as a single seed set shared by all documents.
keybert_kphs = kbert.extract_keywords(
    abstracts,
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    use_maxsum=True,
    nr_candidates=20,
    top_n=T,
    doc_embeddings=abstract_embeddings,
    word_embeddings=word_embeddings,
    seed_keywords=[[class_name] for class_name in classes],
)

In [15]:
# Number of result entries returned by extract_keywords (compare with the number of abstracts)
len(keybert_kphs)

2277

In [16]:
# Keep only the first element of each result entry
# (presumably the keyphrase text of a (keyphrase, score) pair — verify against KeyBERT's output format)
modified_keybert_kphs = modify_keyword_list(keybert_kphs)

In [None]:
# Preview the keyphrases of the first few abstracts instead of echoing the whole list
modified_keybert_kphs[:5]

## KeyLLM

In [None]:
# Prompt template handed to the KeyBERT LLM wrappers below.
# [DOCUMENT] is a placeholder (presumably replaced with each abstract by KeyBERT — confirm in its docs).
# NOTE(review): the keyword count "five" is hard-coded in the text rather than derived from T.
# NOTE(review): "Make sure you to only return" reads like a typo; fixing it changes the prompt sent to the model.
prompt = """
I have the following document:
[DOCUMENT]

Based on the information above, extract five keywords that best describe the topic of the text.
The keywords should be separated by commas. Make sure you to only return the keywords and say nothing else.
"""

***WARNING***: The Python code below uses an LLM from OpenAI, which requires an API key. Usage of OpenAI LLMs is limited, so this option is not appropriate for the current task.

In [None]:
# Read the OpenAI API key from the environment (os.getenv returns None when the variable is unset)
api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)

# Wrap the OpenAI client for KeyBERT, using the `prompt` template
llm = OpenAI(client=client, prompt=prompt)

# Keyword extractor driven by the OpenAI-backed LLM.
# NOTE(review): `llm` and `kLLM` are rebound by the TextGeneration cell further down,
# so on a top-to-bottom run this extractor is replaced before extract_keywords is called.
kLLM = KeyLLM(llm)

In [None]:
# Hugging Face model id of the causal language model to load
model_id = 'bit-dny/MindLLM-1b3-chat-zh-v2.0'

# 4-bit quantization (NF4, double quantization, bfloat16 compute dtype)
# to load the model with less GPU memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Tokenizer and quantized model for `model_id`
# NOTE(review): trust_remote_code=True is passed for the model but not for the tokenizer — confirm this is intended.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

# Text-generation pipeline used as the LLM backend
# NOTE(review): temperature is set without do_sample=True; presumably it only takes
# effect when sampling is enabled — confirm against the transformers generation docs.
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

In [None]:
# Wrap the local text-generation pipeline for KeyBERT and build the keyword extractor.
# This rebinds `llm` and `kLLM`, replacing the OpenAI-backed objects created in the earlier cell.
llm = TextGeneration(generator, prompt=prompt)
kLLM = KeyLLM(llm)

### KeyLLM for keyword extraction

In [None]:
# Extract keywords for all abstracts with whichever `kLLM` is currently bound.
# NOTE(review): despite "openai" in the variable name, on a top-to-bottom run `kLLM`
# is the TextGeneration-backed extractor from the preceding cell.
kLLM_openai_kws = kLLM.extract_keywords(abstracts)

## Create Keyword Text Files

In [17]:
# Pair each keyword collection with its output file
# (currently only the class-guided KeyBERT keyphrases)
lists_of_keywords = [modified_keybert_kphs]
file_names = ['cora/KeyBERT_Trigram_Guided_all-MiniLM-L6-v2.txt']

# Write one text file per collection, one line of space-joined keywords per abstract
save_keywords_to_files(lists_of_keywords, file_names)
