In [64]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [65]:
# Tokenizer and encoder are both built from the same pretrained checkpoint,
# so the checkpoint name is spelled out once and shared.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

In [66]:
# Load keywords from an Excel file.
# NOTE: the workbook lives under /content/drive, so Google Drive has to be
# mounted before this cell runs.
df = pd.read_excel("/content/drive/MyDrive/data/keywords.xlsx")

# Blank spreadsheet cells are read as NaN (a float) and numeric cells as
# numbers; both would break the tokenizer and the later `keyword.lower()` call.
# Drop the blanks, coerce everything to text, and trim stray whitespace.
keywords = df["Keywords"].dropna().astype(str).str.strip().tolist()

# An empty keyword is a substring of every text, so it would always count as
# "present" in an introduction; discard it.
keywords = [keyword for keyword in keywords if keyword]

In [67]:
# Build one embedding per keyword
def get_keyword_embeddings(keywords):
    """Map each keyword to its mean-pooled encoder embedding.

    Every keyword is tokenized on its own with the module-level ``tokenizer``,
    run through the module-level ``model`` with gradients disabled, and the
    last hidden state is averaged over dim 1 before being converted to a
    NumPy array.

    Args:
        keywords: iterable of keyword strings.

    Returns:
        dict keyed by keyword whose values are the NumPy embeddings. A keyword
        that appears more than once keeps a single entry.
    """
    embeddings_by_keyword = {}
    with torch.no_grad():
        for term in keywords:
            encoded = tokenizer(term, return_tensors='pt', padding=True, truncation=True)
            hidden_states = model(**encoded).last_hidden_state
            embeddings_by_keyword[term] = hidden_states.mean(dim=1).numpy()
    return embeddings_by_keyword

keyword_embeddings = get_keyword_embeddings(keywords)

In [68]:
# Embed a piece of input text
def get_text_embedding(text):
    """Return the mean-pooled encoder embedding of ``text`` as a NumPy array.

    The text is tokenized with truncation enabled, passed through the
    module-level ``model`` without tracking gradients, and the last hidden
    state is averaged over dim 1.

    Args:
        text: the text to embed.

    Returns:
        NumPy array holding the pooled embedding (presumably one row per
        input text -- callers reshape keyword vectors to ``(1, -1)`` to match).
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**encoded)
        pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.numpy()


In [69]:
# Function to evaluate the introduction
def evaluate_introduction(intro, keyword_bonus_threshold=5, bonus_multiplier=1.5, max_score=10):
    """Score an introduction against the module-level keywords.

    The introduction is embedded with ``get_text_embedding`` and compared to
    every entry of ``keyword_embeddings`` by cosine similarity. The
    similarities are min-max normalised, averaged, and scaled to
    ``max_score``. If at least ``keyword_bonus_threshold`` entries of
    ``keywords`` occur (case-insensitively, as substrings) in the
    introduction, the score is multiplied by ``bonus_multiplier`` and capped
    at ``max_score``.

    Args:
        intro: introduction text (must be a ``str``).
        keyword_bonus_threshold: number of keyword hits that triggers the bonus.
        bonus_multiplier: factor applied to the score when the bonus triggers.
        max_score: upper end of the scoring scale.

    Returns:
        The final score rounded to two decimals. Returns ``0.0`` when there
        are no keyword embeddings to compare against.
    """
    intro_lower = intro.lower()

    # Nothing to compare against: averaging an empty list would yield NaN.
    if not keyword_embeddings:
        return 0.0

    intro_embedding = get_text_embedding(intro)

    # Cosine similarity between the introduction and each keyword embedding.
    similarities = np.asarray([
        cosine_similarity(intro_embedding, keyword_embedding.reshape(1, -1))[0][0]
        for keyword_embedding in keyword_embeddings.values()
    ])

    # Min-max normalise; the bounds are computed once rather than once per
    # element.
    lowest = similarities.min()
    spread = similarities.max() - lowest
    if spread > 0:
        relative_score = np.mean((similarities - lowest) / spread)
    else:
        # A single keyword (or identical similarities) leaves no range to
        # normalise over and would divide by zero. Fall back to the raw mean
        # similarity, clipped into [0, 1] so the scale stays the same.
        relative_score = float(np.clip(similarities.mean(), 0.0, 1.0))
    overall_score = relative_score * max_score

    # Count the number of keywords present in the introduction
    keyword_count = sum(1 for keyword in keywords if keyword.lower() in intro_lower)

    # Boost the score when enough keywords are literally present, never
    # exceeding the top of the scale.
    if keyword_count >= keyword_bonus_threshold:
        final_score = min(overall_score * bonus_multiplier, max_score)
    else:
        final_score = overall_score

    return round(final_score, 2)

In [70]:
# Mount Google Drive so files under the mount point are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
# Workbook that holds the transcripts to be scored.
file_path = '/content/drive/MyDrive/data/result.xlsx'
df = pd.read_excel(io=file_path)

In [72]:
# Pull the introductions out of the 'Transcript' column of the loaded workbook.
introductions = df.loc[:, 'Transcript']

In [73]:
# Score every introduction, keeping one entry per row.
scores = []
for intro in introductions:
    if not isinstance(intro, str):
        # Non-string cells get a placeholder so the list stays aligned
        # with the rows it was built from.
        scores.append(None)
        continue
    score = evaluate_introduction(intro)
    scores.append(score)



In [74]:
# Re-read the results workbook into the frame that will receive the scores.
file_path = '/content/drive/MyDrive/data/result.xlsx'
df_result = pd.read_excel(io=file_path)

In [75]:
# Attach the scores as a new column and write the workbook back to Drive.
row_count = len(df_result)
# Only the first `row_count` scores are kept so the list is never longer
# than the frame it is assigned to.
df_result['Relavance_Scores'] = scores[:row_count]
df_result.to_excel("/content/drive/MyDrive/data/result.xlsx", index=False)

print("Scores have been evaluated and saved to the Excel sheet.")

Scores have been evaluated and saved to the Excel sheet.


In [76]:
# Show the computed scores in row order; None marks rows whose transcript
# was not a string.
print(scores)

[4.81, 4.74, 5.11, 5.07, 4.61, 6.99, 4.83, 4.44, 6.73, 4.7, 5.12, 4.58, 4.95, 7.6, 4.94, 5.35, 4.78, 4.51, 4.77, 4.87, 4.78, 4.78, 4.74, 4.62, 4.41, 4.64, 5.25, 4.72, 7.48, 4.77, 6.93, 4.77, 4.59, 4.37, 6.64, 4.85, 4.8, 5.03, 4.88, 4.9, 4.98, 6.71]
