In [None]:
# Keyword extraction from a paragraph using KeyBERT

!pip install keybert

from keybert import KeyBERT

# Build the extractor; no model argument is passed, so KeyBERT's defaults are used.
kw_model = KeyBERT()
# Sample paragraph to extract keywords from.
doc = """
Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a function from labeled training data consisting of a set of training examples. In supervised learning, each example is a pair consisting of an input object (typically a vector) and a desired output value (also called the supervisory signal). A supervised learning algorithm analyzes the training data and produces an inferred function, which can be used for mapping new inputs to outputs.
"""
# Per the recorded output, the result is a list of (keyword, score) tuples.
keywords = kw_model.extract_keywords(doc)
print(keywords)


Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers, keybert
Successfully installed keybert-0.8.5 sentence-transformers-3.0.1


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[('supervised', 0.6429), ('labeled', 0.4506), ('learning', 0.4269), ('training', 0.3847), ('supervisory', 0.369)]


In [None]:
# Keyword extraction using a traditional approach: word-frequency counts after stop-word removal (NLTK)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Fetch the NLTK resources used below. NLTK >= 3.9 makes word_tokenize load
# 'punkt_tab', while older releases load 'punkt', so both are downloaded;
# 'stopwords' provides the English stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(resource)

def extract_keywords(text, num_keywords=10):
  """Return the most frequent meaningful words in ``text``.

  The text is tokenized with ``word_tokenize``, lower-cased, and filtered to
  drop English stop words and tokens that contain no letter or digit
  (e.g. '.', ',', '(').

  Args:
    text: The string to extract keywords from.
    num_keywords: Maximum number of (word, count) pairs to return.

  Returns:
    A list of ``(word, count)`` tuples ordered from most to least frequent,
    as produced by ``Counter.most_common``. Words are lower-cased.
  """
  stop_words = set(stopwords.words('english'))

  # Normalize case up front so 'Supervised' and 'supervised' share one count.
  tokens = (token.lower() for token in word_tokenize(text))

  # Keep hyphenated words such as 'input-output', but reject pure punctuation,
  # which would otherwise rank among the top "keywords".
  word_counts = Counter(
      token for token in tokens
      if token not in stop_words and any(ch.isalnum() for ch in token)
  )

  return word_counts.most_common(num_keywords)

# Sample paragraph to extract keywords from.
doc = """
Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a function from labeled training data consisting of a set of training examples. In supervised learning, each example is a pair consisting of an input object (typically a vector) and a desired output value (also called the supervisory signal). A supervised learning algorithm analyzes the training data and produces an inferred function, which can be used for mapping new inputs to outputs.
"""

# num_keywords is left at its default of 10; the result is a list of (word, count) tuples.
keywords = extract_keywords(doc)
print(keywords)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[('learning', 5), ('.', 4), ('function', 3), ('training', 3), ('input', 2), ('output', 2), ('example', 2), ('data', 2), ('consisting', 2), ('supervised', 2)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Keyword extraction using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_tfidf(text, num_keywords=10):
  """Return the highest-scoring TF-IDF terms of ``text``.

  The vectorizer is fitted on ``text`` alone (a one-document corpus), with
  scikit-learn's built-in English stop-word list removed.

  Note: because only one document is fitted, every term has the same
  document frequency, so the ranking is driven by term frequency in ``text``.

  Args:
    text: The string to extract keywords from.
    num_keywords: Maximum number of terms to return. Values <= 0 yield an
      empty list.

  Returns:
    A list of plain ``str`` terms ordered from highest to lowest score.

  Raises:
    ValueError: Raised by scikit-learn when ``text`` yields an empty
      vocabulary (e.g. it is empty or contains only stop words).
  """
  # Guard explicitly: slicing with [-0:] would return *every* term.
  if num_keywords <= 0:
    return []

  vectorizer = TfidfVectorizer(stop_words='english')

  # Learn the vocabulary and score the single document in one pass.
  tfidf_matrix = vectorizer.fit_transform([text])

  # Vocabulary terms, aligned with the matrix columns.
  feature_names = vectorizer.get_feature_names_out()

  # Row 0 holds the scores of the only document; sort descending, keep top N.
  scores = tfidf_matrix.toarray()[0]
  top_indices = scores.argsort()[::-1][:num_keywords]

  # Convert numpy string scalars to built-in str for clean printing.
  return [str(feature_names[i]) for i in top_indices]

# Sample paragraph to extract keywords from.
doc = """
Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a function from labeled training data consisting of a set of training examples. In supervised learning, each example is a pair consisting of an input object (typically a vector) and a desired output value (also called the supervisory signal). A supervised learning algorithm analyzes the training data and produces an inferred function, which can be used for mapping new inputs to outputs.
"""

# num_keywords is left at its default of 10; the result is a list of terms.
keywords = extract_keywords_tfidf(doc)
print(keywords)


['learning', 'input', 'training', 'supervised', 'output', 'function', 'consisting', 'data', 'example', 'vector']
