## Similarity Calculations

In [None]:
# Install the transformers and torch packages used by the cells below
pip install transformers torch

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint,
# so the checkpoint identifier is named once and reused for both.
checkpoint_name = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

In [None]:
# Function to extract a text embedding with the given tokenizer and model
def embed_text(text, tokenizer, model):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Args:
        text: Input passed straight to ``tokenizer`` (a string, or a batch of
            strings if the tokenizer accepts one).
        tokenizer: Callable invoked with ``padding=True``, ``truncation=True``,
            ``max_length=128`` and ``return_tensors="pt"``; it must return a
            mapping of model inputs. Because truncation is requested at 128
            tokens, text beyond that limit does not contribute to the result.
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose ``last_hidden_state``
            (assumed to be shaped (batch, seq_len, hidden) -- TODO confirm).

    Returns:
        The hidden states averaged over dimension 1. When the tokenizer output
        contains an ``attention_mask``, positions whose mask is 0 are excluded
        from the average so padding does not dilute the embedding. The tensor
        is computed without gradient tracking.
    """
    import torch  # local import: no import cell is visible in this notebook

    inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Inference only: skipping autograd avoids building a graph nobody uses.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

        if "attention_mask" not in inputs:
            # No mask available: fall back to the plain mean over dimension 1.
            return hidden_states.mean(1)

        # Broadcast the mask over the hidden dimension and average only the
        # positions that are marked as real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # clamp guards against division by zero for an all-zero mask row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts
    return embeddings

In [None]:
def calculate_similarity(embedding1, embedding2):
    """Calculate the cosine similarity between two text embeddings.

    Args:
        embedding1: Tensor holding the first embedding; it is detached, moved
            to host memory and converted to a NumPy array.
        embedding2: Tensor holding the second embedding, handled the same way.

    Returns:
        Element ``[0][0]`` of the matrix returned by ``cosine_similarity``,
        i.e. the similarity between the first row of each input.
    """
    # .cpu() is a no-op for tensors already on the CPU, but without it
    # .numpy() raises for tensors that live on an accelerator device.
    vector1 = embedding1.detach().cpu().numpy()
    vector2 = embedding2.detach().cpu().numpy()
    return cosine_similarity(vector1, vector2)[0][0]

In [None]:
def concatenate_sources(cells):
    """Concatenate the 'source' text from a list of cell dictionaries into a single string.

    Args:
        cells: Iterable of cells; cells without a 'source' key are ignored.
            Each 'source' value must be an iterable of strings.

    Returns:
        The lines of every 'source' joined with single spaces, each cell's
        text stripped, the cells joined with single spaces, and the final
        result stripped. An empty string when no cell has a 'source'.
    """
    # Collect the per-cell text first and join once, instead of growing a
    # string with += inside the loop.
    cell_texts = [
        ' '.join(cell['source']).strip()
        for cell in cells
        if 'source' in cell
    ]
    return ' '.join(cell_texts).strip()

In [None]:
# Clean and normalise a text from the prompt/answer pairs JSON file
def preprocess_text(text):
    """Return ``text`` lower-cased, reduced to letters, without stopwords, stemmed.

    Steps, in order: lower-case the input, delete every character that is not
    an ASCII letter or whitespace, tokenize, drop English stopwords, stem each
    remaining token with a Porter stemmer, and join the stems with spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = word_tokenize(letters_only)

    english_stopwords = set(stopwords.words('english'))
    kept_words = [w for w in words if w not in english_stopwords]

    porter = PorterStemmer()
    return ' '.join(porter.stem(w) for w in kept_words)

# Location of the JSON file that holds the prompt-answer pairs
json_file_path = 'path_to_your_prompt_answer_pairs.json'  # Replace 'path_to_your_prompt_answer_pairs.json' with your file path

with open(json_file_path, 'r', encoding='utf-8') as file:
    formatted_data = json.load(file)

# Run every text of every pair through preprocess_text; entries that have no
# 'pairs' key are left untouched.
for html_id, content in formatted_data.items():
    if 'pairs' not in content:
        continue
    formatted_data[html_id]['pairs'] = [
        [preprocess_text(text) for text in pair]
        for pair in content['pairs']
    ]

# Overwrite the input file with the preprocessed data
with open(json_file_path, 'w', encoding='utf-8') as file:
    json.dump(formatted_data, file, indent=4)

In [None]:
# Normalise the question text taken from the assignment file
def preprocess_text(text):
    """Lower-case, strip non-letters, remove English stopwords and stem ``text``.

    Returns the surviving stems joined by single spaces.
    """
    # Lower-case first, then keep only ASCII letters and whitespace
    cleaned = text.lower()
    cleaned = re.sub(r'[^a-zA-Z\s]', '', cleaned)

    # Split into word tokens and discard English stopwords
    candidate_tokens = word_tokenize(cleaned)
    ignored = set(stopwords.words('english'))

    # Stem what is left with the Porter stemmer
    porter = PorterStemmer()
    stems = []
    for token in candidate_tokens:
        if token in ignored:
            continue
        stems.append(porter.stem(token))

    return ' '.join(stems)

# Path to your JSON file containing questions data
json_file_path = 'path_to_your_questions_data.json'  # Replace 'path_to_your_questions_data.json' with your file path

with open(json_file_path, 'r', encoding='utf-8') as file:
    questions_data = json.load(file)

# Preprocess the text in each question.
# This cell writes its result back to the file it read from, replacing each
# question's list of cell dictionaries with a list of strings. If the cell is
# run a second time it therefore sees strings, not dictionaries; those are
# kept unchanged so a re-run does not wipe the stored text.
for question_id, cells in questions_data.items():
    preprocessed_text = []
    for cell in cells:
        if isinstance(cell, str):
            # Already preprocessed by an earlier run of this cell
            preprocessed_text.append(cell)
        elif 'source' in cell:
            # Concatenate all lines in 'source', then preprocess them together
            cell_text = ' '.join(cell['source'])
            preprocessed_text.append(preprocess_text(cell_text))
    # Replace the question's cells with the list of preprocessed strings
    questions_data[question_id] = preprocessed_text

# Save the preprocessed data back to the same JSON file
with open(json_file_path, 'w', encoding='utf-8') as file:
    json.dump(questions_data, file, indent=4)

In [None]:
import re  # imported here because this cell needs it for the question label below

# Load the data from the 'questions_data.json' file
# (utf-8 is given explicitly to match the cells that wrote these files)
with open('path_to_your_questions_data.json', 'r', encoding='utf-8') as file:
    question_data = json.load(file)

# Load the data from the 'prompt_answer_pairs.json' file
with open('path_to_your_prompt_answer_pairs.json', 'r', encoding='utf-8') as file:
    chat_texts = json.load(file)

# Initialize a dictionary to store similarity scores for each HTML file
html_similarity_scores = {chat_id: {} for chat_id in chat_texts}

# Embed every chat exactly once: the chat embedding does not depend on the
# question, so computing it inside the question loop repeats the same model
# forward pass for each question.
chat_embeddings = {}
for chat_id, chat_data in chat_texts.items():
    if 'pairs' not in chat_data:
        # Entries without 'pairs' are tolerated by the preprocessing cell;
        # they keep an empty score dictionary here instead of raising KeyError.
        continue
    # Concatenate all texts from pairs for embedding
    all_chat_text = ' '.join(' '.join(pair) for pair in chat_data['pairs'])
    chat_embeddings[chat_id] = embed_text(all_chat_text, tokenizer, model)

# Calculate similarities
for q_id, q_contents in question_data.items():
    # Concatenate all texts for each question
    q_text = ' '.join(q_contents)
    q_embedding = embed_text(q_text, tokenizer, model)

    # Label the score with the full trailing number of the question id.
    # Using only the last character would give ids such as '..._1' and
    # '..._11' the same key, so one score would overwrite the other.
    # int() drops leading zeros, so single-digit ids keep their previous key;
    # ids that do not end in a digit fall back to their last character.
    trailing_number = re.search(r'[0-9]+$', q_id)
    q_label = str(int(trailing_number.group())) if trailing_number else q_id[-1]

    for chat_id, chat_embedding in chat_embeddings.items():
        similarity = calculate_similarity(q_embedding, chat_embedding)
        html_similarity_scores[chat_id][f"q_{q_label}_similarity"] = similarity

In [None]:
# Location of the prompt/answer JSON file that receives the scores
formatted_json_path = 'path_to_your_prompt_answer_pairs.json'  # Replace 'path_to_your_prompt_answer_pairs.json' with your file path

# Read the current contents of that file
with open(formatted_json_path, 'r', encoding='utf-8') as file:
    formatted_data = json.load(file)

# Attach the similarity scores, plus their average, to every entry that is
# present in both the score table and the loaded file.
for html_id, scores in html_similarity_scores.items():
    if html_id not in formatted_data:
        continue

    # json cannot serialise NumPy scalars, so cast each score to a plain float
    formatted_scores = {name: float(value) for name, value in scores.items()}

    # The average is taken over the per-question scores only; an entry with
    # no scores gets 0.
    if formatted_scores:
        average_score = sum(formatted_scores.values()) / len(formatted_scores)
    else:
        average_score = 0

    formatted_data[html_id]['scores'] = formatted_scores
    formatted_data[html_id]['scores']['average_similarity'] = average_score

# Write the updated data back over the same JSON file
with open(formatted_json_path, 'w', encoding='utf-8') as file:
    json.dump(formatted_data, file, indent=4)