# Pre-processing

In [1]:
# Following is the preprocessing code
from lxml import etree
import pandas as pd

def extract_data_from_xml(xml_file):
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(xml_file, parser=parser)
    root = tree.getroot()

    data = []

    for row in root.findall('.//row'):
        accepted_answer_id = row.get('AcceptedAnswerId')

        if accepted_answer_id:
            question_id = row.get('Id')
            question_title = row.get('Title')
            question_body = row.get('Body')
            question_tags = row.get('Tags')

            accepted_answer_row = root.find(f'.//row[@Id="{accepted_answer_id}"]')

            if accepted_answer_row is not None:
                accepted_answer_body = accepted_answer_row.get('Body')
                data.append({
                    'Question Title': question_title,
                    'Question Body': question_body,
                    'Question Tags': question_tags,
                    'Accepted Answer Body': accepted_answer_body
                })

    return data

def save_to_excel(data, output_file):
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False)

# Replace 'input.xml' and 'output.xlsx' with your actual file paths
input_xml_file = 'Posts.xml'
output_excel_file = 'output.xlsx'

# Extract data from XML and save to Excel
data = extract_data_from_xml(input_xml_file)
save_to_excel(data, output_excel_file)

OSError: Error reading file 'Posts.xml': failed to load external entity "Posts.xml"

# Evaluating the responses of LLMs and Human

In [4]:
!pip install pandas
!pip install transformers
!pip install sentence-transformers
!pip install bert-score
!pip install nltk
!pip install bert-score
!pip  install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=f9f4590419b8519ab5eb08613e71e7a307aea041bb7b111bfb78247e91d98f15
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [11]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from nltk.translate import meteor_score
from sentence_transformers import SentenceTransformer, util

# Replace 'your_dataset.xlsx' with the actual file path
xlsx_file = 'test.xlsx'

# Read the dataset
df = pd.read_excel(xlsx_file)

# Select the "Human Response" and "LLM Response" columns
human_responses = df['Human Response'].tolist()
llm_responses = df['LLM Response'].tolist()

# Define a function to compute Cosine Similarity using sentence-transformers
def cosine_similarity_score(sentences1, sentences2):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
    return cos_scores.numpy().diagonal()

# Define a function to compute BLEURT score using BERTScore
def bleurt_score(sentences1, sentences2):
    scorer = BERTScorer(lang="en")
    _, _, bleurt_scores = scorer.score(sentences1, sentences2)
    return bleurt_scores.numpy()

# Define a function to compute METEOR score
def meteor_score_func(sentences1, sentences2):
    meteor_scores = [meteor_score.single_meteor_score(s1, s2) for s1, s2 in zip(sentences1, sentences2)]
    return meteor_scores

# Define a function to compute Rouge-L score
def rouge_l_score(sentences1, sentences2):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(s1, s2)['rougeL'].fmeasure for s1, s2 in zip(sentences1, sentences2)]
    return rouge_scores

# Define a function to compute BERTScore
def bert_score(sentences1, sentences2):
    scorer = BERTScorer(lang="en")
    _, _, bert_scores = scorer.score(sentences1, sentences2)
    return bert_scores.numpy()

# Compute scores
cosine_scores = cosine_similarity_score(human_responses, llm_responses)
bleurt_scores = bleurt_score(human_responses, llm_responses)
# meteor_scores = meteor_score_func(human_responses, llm_responses)
rouge_l_scores = rouge_l_score(human_responses, llm_responses)
bert_scores = bert_score(human_responses, llm_responses)

# Display scores
scores_df = pd.DataFrame({
    'Cosine Similarity': cosine_scores,
    'BLEURT': bleurt_scores,
    # 'METEOR': meteor_scores,
    'Rouge-L': rouge_l_scores,
    'BERTScore': bert_scores
})

print(scores_df)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   Cosine Similarity    BLEURT   Rouge-L  BERTScore
0           0.423615  0.862702  0.217391   0.862702
1           0.930341  0.903527  0.381188   0.903527
