In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
import torch
from rake_nltk import Rake

# Checkpoint name shared by the BERT tokenizer and encoder used for text embeddings
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Sentence-BERT encoder used for the context embeddings
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Single RAKE instance reused for every keyword extraction call
# NOTE(review): Rake() presumably needs NLTK corpora to be available locally — confirm
rake = Rake()

def encode_text(text):
    """Embed text with BERT using attention-mask-aware mean pooling.

    The token states of ``last_hidden_state`` are averaged per sequence,
    counting only positions whose attention mask is 1. For a single string
    there is no padding, so this equals a plain mean over tokens; for a
    batch of strings it keeps pad-token states out of the average.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        numpy.ndarray: The pooled embedding(s) with size-1 dimensions
        squeezed out (a 1-D vector when a single text is given).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions contribute zero.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Guard against division by zero should a sequence have no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()

def encode_context(text):
    """Return the embedding that ``sbert_model.encode`` produces for ``text``."""
    context_vector = sbert_model.encode(text)
    return context_vector

def extract_keywords(text):
    """Run RAKE on ``text`` and return its ranked phrases as one string.

    The phrases from ``rake.get_ranked_phrases()`` are joined with single
    spaces, in the order RAKE returns them.
    """
    # The shared ``rake`` instance keeps the extraction result as internal state.
    rake.extract_keywords_from_text(text)
    return " ".join(rake.get_ranked_phrases())

# Load dataset from text file
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = '/Dataset/What is a Binary Tree.txt'  # Replace with your actual file path
# Decode explicitly so the text is read the same way on every platform
# (assumes the dataset is UTF-8 — adjust if it is not).
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Process the dataset: one record per non-blank line; each line doubles as its own context.
# NOTE(review): dataset contexts are the raw lines, whereas the user context below is a
# RAKE keyword string — confirm this asymmetry is intended.
stripped_lines = (raw_line.strip() for raw_line in lines)
data = [{'text': entry, 'context': entry} for entry in stripped_lines if entry]

# Fail early with a clear message instead of a KeyError on the missing 'text' column.
if not data:
    raise ValueError(f"No non-empty lines found in {file_path!r}; nothing to compare against.")

df = pd.DataFrame(data)

# User input
user_input = "Binary trees are used in data structures."

# Extract context from user input
user_context = extract_keywords(user_input)

# Encode dataset texts and user input
df['text_embedding'] = df['text'].apply(encode_text)
user_text_embedding = encode_text(user_input)

# Encode dataset contexts and user context
df['context_embedding'] = df['context'].apply(encode_context)
user_context_embedding = encode_context(user_context)

# Calculate similarity scores for text: one vectorised call instead of one call per row.
df['text_similarity'] = cosine_similarity(
    list(df['text_embedding']), [user_text_embedding]
).ravel().astype(float)

# Calculate similarity scores for context
df['context_similarity'] = cosine_similarity(
    list(df['context_embedding']), [user_context_embedding]
).ravel().astype(float)

# Combine scores: unweighted mean of the two similarities
df['final_score'] = (df['text_similarity'] + df['context_similarity']) / 2

# Display results, best match first
print(df[['text', 'text_similarity', 'context_similarity', 'final_score']].sort_values(by='final_score', ascending=False))


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'sentence_transformers'

In [3]:
%pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
