In [12]:
!pip install pandas numpy bert-score rouge-score nltk scikit-learn



In [13]:
import pandas as pd

In [None]:
# Read the CSV of questions and the two candidate answers into `df`.
df = pd.read_csv(filepath_or_buffer="/content/quantitative_analysis.csv")

In [15]:
# Preview the first five rows to confirm the expected columns are present.
df.head(n=5)

Unnamed: 0,repo_name,question,documentation,ledge
0,https://github.com/gatsbyjs/gatsby,What does IMergeWorkerQueryState do?,The IMergeWorkerQueryState interface in Gatsby...,The IMergeWorkerQueryState is a TypeScript int...
1,https://github.com/gatsbyjs/gatsby,What is the purpose of packages/gatsby-adapter...,The file route-handler.ts in the gatsby-adapte...,The provided data indicates that the file pack...
2,https://github.com/gatsbyjs/gatsby,How is packages/gatsby-cli/src/reporter/redux/...,The packages/gatsby-cli/src/reporter/redux/act...,The provided data includes a partial source co...
3,https://github.com/gatsbyjs/gatsby,What is the purpose of packages/gatsby-core-ut...,The site-metadata.ts file in Gatsby's gatsby-c...,The file packages/gatsby-core-utils/src/site-m...
4,https://github.com/gatsbyjs/gatsby,What is the purpose of PARTIAL_HYDRATION_CHUNK...,The constant PARTIAL_HYDRATION_CHUNK_REASON in...,PARTIAL_HYDRATION_CHUNK_REASON is a constant s...


In [16]:
import pandas as pd
import numpy as np
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk
import uuid

In [17]:
# Fetch NLTK's 'punkt' data package.
# NOTE(review): presumably required by the word_tokenize calls in later cells — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# Fetch NLTK's 'punkt_tab' data package.
# NOTE(review): presumably the tokenizer tables newer NLTK releases look for — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
def jaccard_similarity(text1, text2):
    """Jaccard similarity between the lower-cased token sets of two texts.

    Args:
        text1: First text (a string).
        text2: Second text (a string).

    Returns:
        ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of tokens produced by
        ``word_tokenize`` on the lower-cased inputs, or ``0.0`` when neither
        text yields any token.
    """
    vocab_a = {token for token in word_tokenize(text1.lower())}
    vocab_b = {token for token in word_tokenize(text2.lower())}

    combined = vocab_a | vocab_b
    if not combined:
        # Both texts are empty after tokenisation; avoid dividing by zero.
        return 0.0

    shared = vocab_a & vocab_b
    return len(shared) / len(combined)

In [20]:
# BERTScorer instances keyed by language. Building a scorer loads the underlying
# transformer model, so it is done once and reused for every row instead of on
# every call.
_BERT_SCORERS = {}


def _get_bert_scorer(lang="en"):
    """Return a cached ``BERTScorer`` for ``lang``, creating it on first use."""
    # Local import so this cell does not depend on edits to the import cell.
    from bert_score import BERTScorer

    if lang not in _BERT_SCORERS:
        _BERT_SCORERS[lang] = BERTScorer(lang=lang)
    return _BERT_SCORERS[lang]


def evaluate_row(row, reference_type='question'):
    """Score the 'documentation' and 'ledge' answers of one row against a reference.

    Args:
        row: Mapping (e.g. a DataFrame row) with string values under the keys
            ``'question'``, ``'documentation'`` and ``'ledge'``.
        reference_type: ``'question'`` uses the question as the reference for
            both answers. Any other value makes each answer the reference for
            the other one (documentation is scored against ledge and ledge
            against documentation).

    Returns:
        ``{'documentation': {...}, 'ledge': {...}}`` where each inner dict has
        the keys ``'BERTScore_F1'``, ``'ROUGE-L_F1'``, ``'BLEU'``,
        ``'Cosine_TFIDF'`` and ``'Jaccard'``. ``'BERTScore_F1'`` is ``None`` for
        both answers when BERTScore raises; the error is printed, not re-raised.
    """
    question = row['question']
    doc = row['documentation']
    ledge = row['ledge']

    use_question = reference_type == 'question'
    doc_ref = question if use_question else ledge
    ledge_ref = question if use_question else doc

    results = {'documentation': {}, 'ledge': {}}

    # --- BERTScore: one batched call on the cached scorer (no model reload per row).
    try:
        _, _, f1 = _get_bert_scorer("en").score(
            [doc, ledge], [doc_ref, ledge_ref], verbose=False
        )
        results['documentation']['BERTScore_F1'] = f1[0].item()
        results['ledge']['BERTScore_F1'] = f1[1].item()
    except Exception as e:
        # Best effort: keep the other metrics even if BERTScore is unavailable.
        print(f"BERTScore error: {e}")
        results['documentation']['BERTScore_F1'] = None
        results['ledge']['BERTScore_F1'] = None

    # --- ROUGE-L: RougeScorer.score takes (target, prediction).
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_doc = scorer.score(doc_ref, doc)
    rouge_ledge = scorer.score(ledge_ref, ledge)
    results['documentation']['ROUGE-L_F1'] = rouge_doc['rougeL'].fmeasure
    results['ledge']['ROUGE-L_F1'] = rouge_ledge['rougeL'].fmeasure

    # --- BLEU: sentence_bleu takes a list of tokenised references, then the hypothesis.
    smoothie = SmoothingFunction().method4
    doc_tokens = [word_tokenize(doc_ref)]
    ledge_tokens = [word_tokenize(ledge_ref)]
    bleu_doc = sentence_bleu(doc_tokens, word_tokenize(doc), smoothing_function=smoothie)
    bleu_ledge = sentence_bleu(ledge_tokens, word_tokenize(ledge), smoothing_function=smoothie)
    results['documentation']['BLEU'] = bleu_doc
    results['ledge']['BLEU'] = bleu_ledge

    # --- TF-IDF cosine. The vectoriser is always fit on the row's three texts and
    # each answer is compared with the matrix row of *its own* reference.
    # Previously both answers were compared with matrix row 0 (doc_ref), so in
    # the non-'question' mode ledge was compared with itself (always 1.0).
    # For reference_type='question' the corpus and results are unchanged.
    question_idx, doc_idx, ledge_idx = 0, 1, 2
    tfidf_matrix = TfidfVectorizer().fit_transform([question, doc, ledge])
    doc_ref_idx = question_idx if use_question else ledge_idx
    ledge_ref_idx = question_idx if use_question else doc_idx
    cosine_doc = cosine_similarity(
        tfidf_matrix[doc_ref_idx:doc_ref_idx + 1],
        tfidf_matrix[doc_idx:doc_idx + 1],
    )[0][0]
    cosine_ledge = cosine_similarity(
        tfidf_matrix[ledge_ref_idx:ledge_ref_idx + 1],
        tfidf_matrix[ledge_idx:ledge_idx + 1],
    )[0][0]
    results['documentation']['Cosine_TFIDF'] = cosine_doc
    results['ledge']['Cosine_TFIDF'] = cosine_ledge

    # --- Jaccard overlap of token sets.
    jaccard_doc = jaccard_similarity(doc_ref, doc)
    jaccard_ledge = jaccard_similarity(ledge_ref, ledge)
    results['documentation']['Jaccard'] = jaccard_doc
    results['ledge']['Jaccard'] = jaccard_ledge

    return results

In [21]:
# Keep only rows where every text field used by evaluate_row is present.
# 'question' is included because it is the reference text for all metrics;
# a missing (NaN) question would otherwise crash tokenisation.
df = df.dropna(subset=['question', 'documentation', 'ledge'])

all_results = []
for _, row in df.iterrows():
    results = evaluate_row(row, reference_type='question')

    # Start with the identifying/text columns of the row ...
    result_entry = {
        'repo_name': row['repo_name'],
        'question': row['question'],
        'documentation': row['documentation'],
        'ledge': row['ledge'],
    }
    # ... then flatten the nested metric dict into '<answer>_<metric>' columns,
    # keeping the documentation/ledge pair of each metric side by side.
    for metric in ('BERTScore_F1', 'ROUGE-L_F1', 'BLEU', 'Cosine_TFIDF', 'Jaccard'):
        for answer in ('documentation', 'ledge'):
            result_entry[f'{answer}_{metric}'] = results[answer][metric]

    all_results.append(result_entry)

# Build the frame once from the list of records.
results_df = pd.DataFrame(all_results)

results_df.to_csv('evaluation_results.csv', index=False)
print("Results saved to 'evaluation_results.csv'")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

Results saved to 'evaluation_results.csv'


In [22]:
# Metric columns of results_df to average per repository.
numerical_columns = [
    'documentation_BERTScore_F1',
    'ledge_BERTScore_F1',
    'documentation_ROUGE-L_F1',
    'ledge_ROUGE-L_F1',
    'documentation_BLEU',
    'ledge_BLEU',
    'documentation_Cosine_TFIDF',
    'ledge_Cosine_TFIDF',
    'documentation_Jaccard',
    'ledge_Jaccard'
]

# A metric that failed is stored as None; if it failed on every row the column
# has object dtype and cannot be averaged. Coerce to numeric so such values
# become NaN and are skipped by mean().
numeric_scores = results_df[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per repository, with repo_name restored as a column.
average_results = (
    numeric_scores
    .groupby(results_df['repo_name'])
    .mean()
    .reset_index()
)

average_results.to_csv('average_evaluation_results.csv', index=False)
print("Average results by repo_name saved to 'average_evaluation_results.csv'")

Average results by repo_name saved to 'average_evaluation_results.csv'
