<a href="https://colab.research.google.com/github/LC-Platform/multimodal-Concept-accessor/blob/Domain-keyword-identifier/keyword_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DOMAIN KEYWORD EXTRACTION USING DIFFERENT TECHNOQUES**

# TF-IDF WITH THRESHOLD

In [None]:
!pip install PyMuPDF scikit-learn nltk




In [9]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

# Example usage
pdf_path = "/content/Class-XI-Biology.pdf"  # replace with your PDF file path
raw_text = extract_text_from_pdf(pdf_path)
print(raw_text[:500])  # print first 500 characters to verify


PLANT KINGDOM
29
In the previous chapter, we looked at the broad classification of living
organisms under the system proposed by Whittaker (1969) wherein he
suggested the Five Kingdom classification viz. Monera, Protista, Fungi,
Animalia and Plantae.  In this chapter, we will deal in detail with further
classification within Kingdom Plantae popularly known as the ‘plant
kingdom’.
We must stress here that our understanding of the plant kingdom
has changed over time. Fungi, and members of the Mone


**PHASE 1:** TF-IDF with removal of stopwords in the preprocesing step with an add on of threshold

In [35]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text2(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # remove punctuation and non-words
    tokens = text.split()
    cleaned_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(cleaned_tokens)

cleaned_text = preprocess_text2(raw_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf2(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    return dict(zip(feature_names, tfidf_scores))

tfidf_dict = calculate_tfidf2(cleaned_text)


In [37]:
def filter_keywords2(tfidf_dict, threshold=0.08):
    keywords = {word: score for word, score in tfidf_dict.items() if score >= threshold}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)

filtered_keywords = filter_keywords2(tfidf_dict, threshold=0.08)


In [38]:
# Assuming `filtered_keywords` is a list of (word, score) tuples
output_path = "tfidf_keywords_phase2.txt"

with open(output_path, "w") as file:
    file.write("domain keywords based on TF-IDF:\n")
    for word, score in filtered_keywords:
        file.write(f"{word}: {score:.4f}\n")

print(f"TF-IDF keywords saved to {output_path}")


TF-IDF keywords saved to tfidf_keywords_phase2.txt


 **PHASE 2:**TF-IDF with threshold set to 0.1 removed the stopwords with POS in the data preprocessing step

In [39]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import re

nltk.download('punkt')
nltk.download('punkt_tab')

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')


stop_words = set(stopwords.words('english'))

def preprocess_text_with_pos(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)

    # POS tagging
    tagged_tokens = pos_tag(tokens)

    # Keep only nouns and proper nouns, and remove stopwords
    filtered_tokens = [
        word for word, pos in tagged_tokens
        if word not in stop_words and pos in ['NN', 'NNS', 'NNP', 'NNPS']
    ]

    return ' '.join(filtered_tokens)
cleaned_text = preprocess_text_with_pos(raw_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    return dict(zip(feature_names, tfidf_scores))

tfidf_dict = calculate_tfidf(cleaned_text)


In [41]:
def filter_keywords(tfidf_dict, threshold=0.1):
    keywords = {word: score for word, score in tfidf_dict.items() if score >= threshold}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)

filtered_keywords = filter_keywords(tfidf_dict, threshold=0.1)


In [42]:
# Assuming `filtered_keywords` is a list of (word, score) tuples
output_path = "tfidf_keywords.txt"

with open(output_path, "w") as file:
    file.write("Top domain keywords based on TF-IDF:\n")
    for word, score in filtered_keywords:
        file.write(f"{word}: {score:.4f}\n")

print(f"TF-IDF keywords saved to {output_path}")


TF-IDF keywords saved to tfidf_keywords.txt


**PHASE 3:**TF-IDF with threshold set to 0.08 removed the stopwords with POS in the data preprocessing step

In [43]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import re

nltk.download('punkt')
nltk.download('punkt_tab')

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')


stop_words = set(stopwords.words('english'))

def preprocess_text_with_pos1(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)

    # POS tagging
    tagged_tokens = pos_tag(tokens)

    # Keep only nouns and proper nouns, and remove stopwords
    filtered_tokens = [
        word for word, pos in tagged_tokens
        if word not in stop_words and pos in ['NN', 'NNS', 'NNP', 'NNPS']
    ]

    return ' '.join(filtered_tokens)
cleaned_text = preprocess_text_with_pos1(raw_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf3(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    return dict(zip(feature_names, tfidf_scores))

tfidf_dict = calculate_tfidf3(cleaned_text)


In [45]:
def filter_keywords3(tfidf_dict, threshold=0.08):
    keywords = {word: score for word, score in tfidf_dict.items() if score >= threshold}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)

filtered_keywords = filter_keywords3(tfidf_dict, threshold=0.08)


In [46]:
# Assuming `filtered_keywords` is a list of (word, score) tuples
output_path = "tfidf_keywords_3.txt"

with open(output_path, "w") as file:
    file.write("domain keywords based on TF-IDF:\n")
    for word, score in filtered_keywords:
        file.write(f"{word}: {score:.4f}\n")

print(f"TF-IDF keywords saved to {output_path}")


TF-IDF keywords saved to tfidf_keywords_3.txt


 **PHASE 4:**TF-IDF with threshold set to 0.08 removed the stopwords with POS in the data preprocessing step and inserted the lemmatization to avoid the plurals as different word

In [47]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize the Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text_with_pos_and_lemmatization(text):
    # Convert to lowercase
    text = text.lower()

    # Remove non-alphabetic characters
    text = re.sub(r'\W+', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # POS tagging
    tagged_tokens = pos_tag(tokens)

    # Lemmatization: Keep only nouns and proper nouns and remove stopwords
    filtered_tokens = [
        lemmatizer.lemmatize(word) for word, pos in tagged_tokens
        if word not in stop_words and pos in ['NN', 'NNS', 'NNP', 'NNPS']
    ]

    return ' '.join(filtered_tokens)
cleaned_text4 = preprocess_text_with_pos_and_lemmatization(raw_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf4(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    return dict(zip(feature_names, tfidf_scores))

tfidf_dict = calculate_tfidf4(cleaned_text4)


In [49]:
def filter_keywords4(tfidf_dict, threshold=0.08):
    keywords = {word: score for word, score in tfidf_dict.items() if score >= threshold}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)

filtered_keywords = filter_keywords4(tfidf_dict, threshold=0.08)


In [50]:
# Assuming `filtered_keywords` is a list of (word, score) tuples
output_path = "tfidf_keywords_4.txt"

with open(output_path, "w") as file:
    file.write("domain keywords based on TF-IDF:\n")
    for word, score in filtered_keywords:
        file.write(f"{word}: {score:.4f}\n")

print(f"TF-IDF keywords saved to {output_path}")


TF-IDF keywords saved to tfidf_keywords_4.txt


PMI WITH TF-IDF

In [51]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import math
from collections import Counter

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize the Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text_with_pos_and_lemmatization2(text):
    # Convert to lowercase
    text = text.lower()

    # Remove non-alphabetic characters
    text = re.sub(r'\W+', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # POS tagging
    tagged_tokens = pos_tag(tokens)

    # Lemmatization: Keep only nouns and proper nouns and remove stopwords
    filtered_tokens = [
        lemmatizer.lemmatize(word) for word, pos in tagged_tokens
        if word not in stop_words and pos in ['NN', 'NNS', 'NNP', 'NNPS']
    ]

    return filtered_tokens
cleaned_tokens = preprocess_text_with_pos_and_lemmatization2(raw_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
# Calculate PMI for word pairs
def calculate_pmi(tokens, window_size=5):
    word_counts = Counter(tokens)
    total_words = len(tokens)
    cooccurrence = Counter()

    # Sliding window to calculate co-occurrence of word pairs
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + window_size, len(tokens))):
            if tokens[i] != tokens[j]:
                cooccurrence[(tokens[i], tokens[j])] += 1

    pmi_dict = {}
    for (word1, word2), count in cooccurrence.items():
        p_x_y = count / total_words
        p_x = word_counts[word1] / total_words
        p_y = word_counts[word2] / total_words
        if p_x * p_y > 0:
            pmi_dict[(word1, word2)] = math.log(p_x_y / (p_x * p_y))
    return pmi_dict
pmi_dict = calculate_pmi(cleaned_tokens, window_size=5)

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Calculate TF-IDF
def calculate_tfidf5(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    return dict(zip(feature_names, tfidf_scores))

tfidf_dict = calculate_tfidf4(' '.join(cleaned_tokens))


In [54]:
def filter_keywords4(tfidf_dict, threshold=0.08):
    keywords = {word: score for word, score in tfidf_dict.items() if score >= threshold}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)
filtered_keywords = filter_keywords4(tfidf_dict, threshold=0.08)

In [55]:
output_path = "tfidf_keywords_with_pmi.txt"

with open(output_path, "w") as file:
    file.write("Domain keywords based on TF-IDF and PMI:\n")

    # Write TF-IDF results
    file.write("\nTF-IDF Keywords:\n")
    for word, score in filtered_keywords:
        file.write(f"{word}: {score:.4f}\n")

    # Write PMI results
    file.write("\nPMI Results:\n")
    for (word1, word2), pmi in pmi_dict.items():
        file.write(f"PMI({word1}, {word2}): {pmi:.4f}\n")

print(f"TF-IDF keywords with PMI saved to {output_path}")


TF-IDF keywords with PMI saved to tfidf_keywords_with_pmi.txt


**SPACY**

In [56]:
# STEP 1: Install necessary packages
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz
!pip install PyMuPDF  # for PDF reading


Collecting spacy<3.8.0,>=3.7.0 (from scispacy)
  Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.0->scispacy)
  Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (920 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.1.12
    Uninstalling thinc-8.1.12:
      Successfully uninstalled thinc-8.1.12
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo



In [57]:
!pip uninstall -y numpy
!pip install numpy==1.23.5 --no-cache-dir
!pip install -U spacy


[31mERROR: Operation cancelled by user[0m[31m
[0m^C
^C
^C


In [1]:
!pip install PyMuPDF



In [2]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz

Collecting spacy<3.8.0,>=3.7.0 (from scispacy)
  Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.0->scispacy)
  Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (920 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.1.12
    Uninstalling thinc-8.1.12:
      Successfully uninstalled thinc-8.1.12
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [3]:
# STEP 2: Import libraries
import spacy
import fitz  # PyMuPDF

# Load the best model
import en_core_sci_scibert
nlp = en_core_sci_scibert.load()


In [4]:
# STEP 3: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

# Use your textbook
pdf_path = "/content/Class-XI-Biology.pdf"
raw_text = extract_text_from_pdf(pdf_path)


In [5]:
import re

# List of unwanted terms to remove (case-insensitive)
unwanted_terms = [
    "chapter", "figure", "contain", "figures", "table", "list", "example",
    "figure", "image", "note", "section", "text", "caption", "label",
    "reference", "unit", "index", "topic"
]

# Function to clean text
def clean_text(text):
    # Remove Roman numerals (i, ii, iii, iv, etc.) in parentheses
    text = re.sub(r'\([ivxlc]+\)', '', text)

    text = re.sub(r'\b[ivxlc]+\b', '', text, flags=re.IGNORECASE)

    # Remove numeric headings like 3.1.3, 3.4, 3.1, etc.
    text = re.sub(r'\d+(\.\d+)+', '', text)

    # Remove alphabetic labels in parentheses like (a), (b), (c)
    text = re.sub(r'\([a-zA-Z]\)', '', text)

    # Remove unwanted terms (case-insensitive)
    text = ' '.join([word for word in text.split() if word.lower() not in unwanted_terms])

    # Remove any extra spaces left behind
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove duplicate words (preserving order)
    seen = set()
    cleaned_words = []
    for word in text.split():
        if word.lower() not in seen:
            cleaned_words.append(word)
            seen.add(word.lower())

    return ' '.join(cleaned_words)
raw_text = clean_text(raw_text)



In [6]:
# STEP 4: Extract biomedical keywords using spaCy + SciSpaCy
def extract_keywords_spacy(text):
    doc = nlp(text)
    keywords = set()

    # Named biomedical entities (very domain-specific)
    for ent in doc.ents:
        if len(ent.text.strip()) > 2:
            keywords.add(ent.text.strip())

    # Add noun chunks (general keyphrases)
    for chunk in doc.noun_chunks:
        if len(chunk.text.strip()) > 2:
            keywords.add(chunk.text.strip())

    return sorted(keywords)

# Run the keyword extraction
keywords = extract_keywords_spacy(raw_text)
unique_keywords = sorted(set(keywords))

  with torch.cuda.amp.autocast(self._mixed_precision):


In [7]:
# STEP 5: Save to file or print sample
output_path = "biology_keywords_using_spacy.txt"

with open(output_path, "w") as f:
    f.write("Extracted Biology Keywords (SpaCy Only):\n")
    for word in unique_keywords:
        f.write(word + "\n")

print(f"✅ Keywords saved to: {output_path}")


✅ Keywords saved to: biology_keywords_using_spacy.txt


**KeyBERT**

In [8]:
# 1. Install KeyBERT
!pip install keybert


Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers>=0.3.8->keybert)
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.3.8->keybert)
  Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.26.1
    Uninstalling transformers-4.26.1:
      Successfully uninstalled transformers-4.26.1
[31mERROR: pip's dependency resolver does not currently take into a

In [5]:
!pip uninstall -y transformers
!pip install transformers==4.35.2
!pip install keybert
!pip install sentence-transformers


[0mCollecting transformers==4.35.2
  Using cached transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2)
  Using cached tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.35.2-py3-none-any.whl (7.9 MB)
Using cached tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy-transformers 1.2.1 requires transformers<4.27.0,>=3.4.0, but you have transformers 4.35.2 which is incompatible.
sentence-transformers 4.1.0 requires t

In [2]:
# 2. Import necessary libraries
from keybert import KeyBERT


In [6]:
# 3. Load the model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')  # Small, fast, and works great


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
text = raw_text

In [13]:
# 5. Extract keywords/phrases
keywords = kw_model.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 3),     # 1 to 3 words allowed in a phrase
    stop_words='english',             # Remove common words
    use_maxsum=True,                  # Diverse keywords, avoids redundancy
    nr_candidates=20,                 # Number of candidates before filtering
    top_n=10                          # Final top 10 key phrases
)


In [14]:
 # 6. Print the keywords
for keyword, score in keywords:
    print(f"{keyword}: {score:.4f}")


kingdom plantae popularly: 0.6088
biology algae: 0.6121
32 biology algae: 0.6121
kingdom 35 bryophytes: 0.6138
algae bryophytes: 0.6154
organisms consider plant: 0.6164
understanding plant kingdom: 0.6228
placed kingdom cyanobacteria: 0.6435
plantae earlier classifications: 0.6492
kingdom cyanobacteria referred: 0.6633


In [15]:
with open("keybert_keywords.txt", "w") as file:
    file.write("Domain Keywords using KeyBERT:\n")
    for keyword, score in keywords:
        file.write(f"{keyword}: {score:.4f}\n")


**YAKE**

In [16]:
!pip install yake




In [17]:
import yake

# Define language and max number of keywords
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
numOfKeywords = 50

# Initialize the YAKE keyword extractor
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None
)


In [20]:
text = raw_text  # from your earlier extraction


In [18]:
keywords = custom_kw_extractor.extract_keywords(text)

# Print the keywords
for kw, score in keywords:
    print(f"{kw} : {score:.4f}")


PLANT KINGDOM : 0.0058
algae : 0.0066
PLANT : 0.0099
Red algae : 0.0155
Figure : 0.0160
called : 0.0164
plant body : 0.0171
KINGDOM : 0.0194
green algae : 0.0200
water : 0.0200
male : 0.0208
plants : 0.0218
female : 0.0221
form : 0.0232
reproduction : 0.0237
spores : 0.0260
Pteridophytes : 0.0292
Brown algae : 0.0316
Bryophytes : 0.0333
2015-16 : 0.0349
Sexual reproduction : 0.0350
produce : 0.0355
Gymnosperms : 0.0367
classification : 0.0371
gametes : 0.0387
Gametophyte : 0.0393
green : 0.0417
ovules : 0.0422
female gametophytes : 0.0433
called green algae : 0.0440
sex : 0.0441
female sex : 0.0445
body : 0.0445
Red : 0.0450
water algae : 0.0451
Sporophyte : 0.0459
mosses : 0.0459
Angiosperms : 0.0463
main plant body : 0.0473
male gametes : 0.0474
Fresh water algae : 0.0479
called red algae : 0.0484
forms : 0.0497
pollen : 0.0519
multicellular : 0.0523
leaves : 0.0527
cells : 0.0541
vegetative : 0.0559
sexual : 0.0577
develops : 0.0578


In [21]:
!pip install rake-nltk
import nltk
nltk.download('stopwords')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
from rake_nltk import Rake


# Initialize Rake with English stopwords
rake_extractor = Rake()


In [25]:
text = raw_text  # Already extracted using PyMuPDF


In [24]:
# Feed the text to RAKE
rake_extractor.extract_keywords_from_text(text)

# Get ranked keywords (highest ranked first)
rake_keywords = rake_extractor.get_ranked_phrases()

# Optional: remove duplicates and filter out generic terms
unwanted_terms = ['chapter', 'figure', 'table', 'contain', 'includes', 'example']

# Clean keywords
cleaned_rake_keywords = list({kw for kw in rake_keywords if not any(term in kw.lower() for term in unwanted_terms)})

# Display the top 50 cleaned keywords
for kw in cleaned_rake_keywords[:50]:
    print(kw)


meiosis
characterised
largely aquatic
gametophytic plant body
celled egg apparatus – one egg cell
sloth bear ).
form new individuals
gametophyte
mosses
taxa
considered
evolutionarily
monera
proteins
fragmentation
6 life cycle
two cotyledons
useful
reproduce asexually
two fusions
female gamete
fusion
secondary protonema
chemical constituents
pteridophytes
cytotaxonomy
development
brown algae
mouth
highly variable
5 ).
sources
formation
stage bears
fungi
developed
possess chlorophyll
produced either
produce
amount
frond
upright
shady places
unicellular
fragment develops
5 angiosperms
cycas male cones
thalli
colour
germination gives rise


In [26]:
with open("rake_keywords.txt", "w") as f:
    for kw in cleaned_rake_keywords:
        f.write(kw + "\n")


In [27]:
# 1. Install and Import
!pip install rake-nltk

from rake_nltk import Rake
import nltk
nltk.download('stopwords')

# 2. Load Text (your PDF already loaded as raw_text)
text = raw_text

# 3. Initialize RAKE
rake = Rake()

# 4. Extract Keywords
rake.extract_keywords_from_text(text)
keywords = rake.get_ranked_phrases()

# 5. Custom Filtering
unwanted_terms = ['chapter', 'figure', 'table', 'contain', 'includes', 'example', 'iii', 'ii', 'i', 'a-i', 'a-ii', 'a-iii']
filtered_keywords = []

for kw in keywords:
    kw_lower = kw.lower()
    if not any(term in kw_lower for term in unwanted_terms) and len(kw.strip()) > 2:
        filtered_keywords.append(kw.strip())

# 6. Remove Near Duplicates
unique_keywords = list(set(filtered_keywords))
unique_keywords.sort()

# 7. Display Top Keywords
for keyword in unique_keywords[:50]:
    print(keyword)


1 ).
1 algae
1 algae 3
1 algae algae
1 chlorophyceae
100 metres
100 metres ).
1969
1a ).
1b ).
2 ).
2 bryophytes
2 bryophytes 3
2 mosses
2 phaeophyceae
2015
3 ).
3 rhodophyceae
4 ).
4 gymnosperms
4 gymnosperms 3
5 ).
absent fresh water algae
adapted
agar
algae
algae may store food
algae reproduce
also
also frequently grown
also occur
also referred
also used
among
amount
anatomy
anther
anthers
apex
asexual
asexual buds
assumes
attached
banks
bare rocks
bark
based
bear two laterally attached flagella
bears two
become


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
