## Using BERT for Keyword Extraction

BERT (Bidirectional Encoder Representations from Transformers) has revolutionized the way we approach natural language processing tasks, including keyword extraction. Its ability to understand context and semantics makes it particularly effective. Here’s how BERT enhances keyword extraction:

* Contextual Understanding: BERT’s bidirectional training allows it to grasp the context of words in a sentence, leading to more accurate keyword identification.
* Fine-tuning: By fine-tuning BERT on specific datasets, we can improve its performance in extracting relevant keywords tailored to particular domains.

In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read and process input text
file_path = "keywords.txt"
with open(file_path, "r") as file:
    text = file.read()

# Tokenize input
inputs = tokenizer(text, return_tensors='pt')

# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

print(last_hidden_states)


  from .autonotebook import tqdm as notebook_tqdm


tensor([[[ 0.2731,  0.0477,  0.0559,  ..., -0.4933,  0.2429,  0.4301],
         [ 0.4000, -0.5024,  0.2372,  ..., -0.3041,  0.7607,  0.1820],
         [ 0.4276, -0.3590,  0.0432,  ..., -0.2957, -0.6141, -0.6569],
         [ 0.9027,  0.0675, -0.0979,  ...,  0.1569, -0.6991, -0.2826]]])


In [4]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
import requests
from bs4 import BeautifulSoup
import re

# Load pre-trained model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Loading the dataset of keywords
datapath = r"dataset.csv"
df = pd.read_csv(datapath)
# Flatten the dataset to create a set of valid keywords
valid_keywords = set()
for column in df.columns:
    valid_keywords.update(df[column].dropna().str.strip().tolist())

def extract_keywords_from_tokens(text, model, tokenizer, num_keywords=5):
    # Tokenize input
    inputs = tokenizer(text, return_tensors='pt')
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state

    # Convert token IDs to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Get the [CLS] token's embedding
    cls_embedding = last_hidden_states[:, 0, :].squeeze()
    
    # Calculate similarity between each token embedding and the [CLS] embedding
    similarities = torch.matmul(last_hidden_states.squeeze(), cls_embedding)
    
    # Get the indices of the top-n tokens with the highest similarity
    top_indices = similarities.topk(min(num_keywords, len(similarities))).indices

    # Extract the corresponding tokens, excluding [CLS] and checking if they are in valid_keywords
    keywords = [tokens[i] for i in top_indices if tokens[i] != '[CLS]' and tokens[i] in valid_keywords]
    
    return keywords

# Read and process input text
file_path = "keywords.txt"
with open(file_path, "r") as file:
    text = file.read()

# Use the function to extract keywords
keywords = extract_keywords_from_tokens(text, model, tokenizer)

# Print extracted keywords
print("Extracted keywords:")
for idx, keyword in enumerate(keywords, start=1):
    print(f"Keyword {idx}: {keyword}")


Extracted keywords:


## Web scraping the results of keywords from the web

In [5]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel, BartForConditionalGeneration, BartTokenizer
import requests
from bs4 import BeautifulSoup

# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained BART model and tokenizer for summarization
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Loading the dataset of keywords
datapath = r"dataset.csv"
df = pd.read_csv(datapath)
# Flatten the dataset to create a set of valid keywords
valid_keywords = set()
for column in df.columns:
    valid_keywords.update(df[column].dropna().str.strip().tolist())

def extract_keywords_from_tokens(text, model, tokenizer, num_keywords=5):
    # Tokenize input
    inputs = tokenizer(text, return_tensors='pt')
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

    # Convert token IDs to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Get the [CLS] token's embedding
    cls_embedding = last_hidden_state[:, 0, :].squeeze()
    
    # Calculate similarity between each token embedding and the [CLS] embedding
    similarities = torch.matmul(last_hidden_state.squeeze(), cls_embedding)
    
    # Get the indices of the top-n tokens with the highest similarity
    top_indices = similarities.topk(num_keywords).indices

    # Extract the corresponding tokens, excluding [CLS] and checking if they are in valid_keywords
    keywords = [tokens[i] for i in top_indices if tokens[i] != '[CLS]' and tokens[i] in valid_keywords]
    
    return keywords

# Read and process input text
file_path = "keywords.txt"
with open(file_path, "r") as file:
    text = file.read()

# Use the function to extract keywords
keywords = extract_keywords_from_tokens(text, bert_model, bert_tokenizer)

# Print extracted keywords
print("Extracted keywords:")
for idx, keyword in enumerate(keywords, start=1):
    print(f"Keyword {idx}: {keyword}")

    # Search the web for the keyword on Wikipedia
    search_url = f"https://en.wikipedia.org/wiki/{keyword}"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the relevant information from the search result
    paragraphs = soup.find_all("p")
    extracted_text = ""
    for paragraph in paragraphs:
        extracted_text += paragraph.get_text() + " "
    extracted_text = extracted_text.strip()

    # Generate a summary using BART
    def generate_summary(text, model, tokenizer):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=1024)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=150, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    # Generate summary
    summary = generate_summary(extracted_text, bart_model, bart_tokenizer)
    print(f"Summary of {keyword}:")
    print(summary)


RuntimeError: selected index k out of range

## This code uses pretrained data of the BERT to generate result so giving gibrish and non meaningful data

In [8]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel, BartForConditionalGeneration, BartTokenizer

# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained BART model and tokenizer for summarization
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Loading the dataset of keywords
datapath = r"C:\Users\Lenovo\Documents\Rohit_AI_ML\SummariseIT\dataset.csv"
df = pd.read_csv(datapath)
# Flatten the dataset to create a set of valid keywords
valid_keywords = set()
for column in df.columns:
    valid_keywords.update(df[column].dropna().str.strip().tolist())

def extract_keywords_from_tokens(text, model, tokenizer, num_keywords=5):
    # Tokenize input
    inputs = tokenizer(text, return_tensors='pt')
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

    # Convert token IDs to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Get the [CLS] token's embedding
    cls_embedding = last_hidden_state[:, 0, :].squeeze()
    
    # Calculate similarity between each token embedding and the [CLS] embedding
    similarities = torch.matmul(last_hidden_state.squeeze(), cls_embedding)
    
    # Get the indices of the top-n tokens with the highest similarity
    top_indices = similarities.topk(num_keywords).indices

    # Extract the corresponding tokens, excluding [CLS] and checking if they are in valid_keywords
    keywords = [tokens[i] for i in top_indices if tokens[i] != '[CLS]' and tokens[i] in valid_keywords]
    
    return keywords

def generate_summary(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=1024)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=150, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Read and process input text
file_path = "C:/Users/Lenovo/Documents/Rohit_AI_ML/SummariseIT/keywords.txt"
with open(file_path, "r") as file:
    text = file.read()

# Use the function to extract keywords
keywords = extract_keywords_from_tokens(text, bert_model, bert_tokenizer)

# Print extracted keywords
print("Extracted keywords:")
for idx, keyword in enumerate(keywords, start=1):
    print(f"Keyword {idx}: {keyword}")

    # Generate summary using the extracted keywords
    summary = generate_summary(f"what is {keyword}", bart_model, bart_tokenizer)
    print(f"Summary of {keyword}:")
    print(summary)

Extracted keywords:
Keyword 1: software
Summary of software:
what is software and how does it work? We look at some of the key features of software. What do you think? Let us know in the comments below. Back to Mail Online home. back to the page you came from."What is software?" is a weekly, interactive look at software.
