# Install Necessary Libraries

In [None]:
# Install necessary libraries
!pip install rouge_score swifter

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score, swifter
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=397819f57646dfefe34f7dc7c2d5c00b58413dbfa4b635756f429d6e9c603a95
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16505 sha256=b879b51dfeea09e10908021c19e666c6017143e2f55bcb2dab4244521a8167ac
  Stored in directory: /root/.c

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
import torch
import re
import gc
import torch
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from bs4 import BeautifulSoup
from tqdm import tqdm

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Download NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Read the Excel file

In [None]:
#Load the dataset
data = pd.read_excel('366_ARPs_for_extracting_Issue_Solution_Pairs.xlsx')

In [None]:
# Applying heuristic technique to reduce noice in the data
def clean_dataset(text):
    if not isinstance(text, str):
        return text

    soup = BeautifulSoup(text, "html.parser")

    for a in soup.find_all('a'):
        a.replace_with('[external-link]')

    for img in soup.find_all('img'):
        img.replace_with('[figure]')

    for code in soup.find_all('code'):
        code.replace_with('[code-snippet]')

    for table in soup.find_all('table'):
        table.replace_with('[table]')

    clean_text = soup.get_text()

    return clean_text

# Apply the function to 'Question_body' and 'Answer_body' columns
dataset['Question_body_cleaned'] = dataset['Question_body'].apply(clean_dataset)
dataset['Answer_body_cleaned'] = dataset['Answer_body'].apply(clean_dataset)

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

import swifter
data['Question_body_preprocessed'] = data['Question_body_cleaned'].swifter.apply(lambda x: preprocess_text(x) if pd.notnull(x) else "")
data['Answer_body_preprocessed'] = data['Answer_body_cleaned'].swifter.apply(lambda x: preprocess_text(x) if pd.notnull(x) else "")

print("Preprocessed data:")
print(data[['Question_body_preprocessed', 'Answer_body_preprocessed']].head())

Pandas Apply:   0%|          | 0/366 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/366 [00:00<?, ?it/s]

Preprocessed data:
                          Question_body_preprocessed  \
0  need help architecture pattern use nestjs proj...   
1  part file structure process fromjson tojson bu...   
2  building c net mvc web application would allow...   
3  mobile application scale white label develop r...   
4  im trying properly design application accordin...   

                            Answer_body_preprocessed  
0  first talking strictly cqrs pattern designing ...  
1  purpose clean architecture fromjson tojson met...  
2  easy reliable way without splitting task multi...  
3  simple answer question think tailwind nativewi...  
4  determining source information business logic ...  


# Define the BertSum model

In [None]:
# Define the BertSum model
class BertSum:
    def __init__(self, model_name='bert-base-uncased', num_sentences=6):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()
        self.num_sentences = num_sentences

    def summarize_and_get_embeddings(self, texts):
        summaries = []
        embeddings_list = []

        for text in texts:
            if not text:
                summaries.append("")
                embeddings_list.append(None)
                continue

            sentences = sent_tokenize(text)
            if not sentences:
                summaries.append("")
                embeddings_list.append(None)
                continue

            inputs = self.tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
            outputs = self.model(**inputs)

            # Get the sentence embeddings by averaging over token embeddings
            sentence_embeddings = outputs.last_hidden_state.mean(dim=1)

            # Score sentences based on norm
            scores = torch.norm(sentence_embeddings, dim=1)

            # Select the top N sentences (up to 6 or fewer if less than 6 sentences)
            num_sentences_to_select = min(self.num_sentences, len(sentences))
            top_sentence_idxs = scores.topk(num_sentences_to_select).indices.tolist()

            # Sort indices for natural order and join with proper delimiters
            sorted_top_sentences = sorted([sentences[idx].strip() for idx in top_sentence_idxs])
            summary = '. '.join(sorted_top_sentences)

            if summary and not summary.endswith('.'):
                summary += '.'

            summaries.append(summary)
            embeddings_list.append(sentence_embeddings.detach().cpu().numpy())

        return summaries, embeddings_list

In [None]:
# Initialize the model
bertsum = BertSum()

batch_size = 3

# Prepare lists for storing summaries/issues or solutions and embeddings
question_summaries = []
question_embeddings = []
answer_summaries = []
answer_embeddings = []

# Function to process batches and manage memory
def process_text_column(text_column):
    summaries = []
    embeddings = []

    for i in tqdm(range(0, len(text_column), batch_size), desc="Processing Batches"):
        batch = text_column[i:i + batch_size].tolist()

        with torch.no_grad():
            batch_summaries, batch_embeddings = bertsum.summarize_and_get_embeddings(batch)

        # Convert only PyTorch tensors to CPU before storing
        batch_embeddings = [emb.cpu().numpy() if isinstance(emb, torch.Tensor) else emb for emb in batch_embeddings]

        summaries.extend(batch_summaries)
        embeddings.extend(batch_embeddings)

        # Explicitly clear CUDA cache
        torch.cuda.empty_cache()
        gc.collect()

    return summaries, embeddings

# Process Question Body
print("Processing Question Body...")
question_summaries, question_embeddings = process_text_column(data['Question_body'])

# Process Answer Body
print("Processing Answer Body...")
answer_summaries, answer_embeddings = process_text_column(data['Answer_body'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing Question Body...


Processing Batches: 100%|██████████| 183/183 [08:09<00:00,  2.68s/it]


Processing Answer Body...


Processing Batches: 100%|██████████| 183/183 [09:57<00:00,  3.26s/it]


In [None]:
# Assign results to DataFrame
data['Issue_Extracted'] = question_summaries
data['Question_embeddings'] = question_embeddings
data['Solution_Extracted'] = answer_summaries
data['Answer_embeddings'] = answer_embeddings

# Save to Excel in chunks
output_path = "BertSum_Extracted_Issue_Solution.xlsx"
data.to_excel(output_path, index=False, engine='openpyxl')

# Display the summaries

In [None]:
# Display the summaries/issue–solution pairs for both questions and answers
summaries = data[['Question_title', 'Issue_Extracted', 'Solution_Extracted']]
summaries.head()

Unnamed: 0,Question_title,Issue_Extracted,Solution_Extracted
0,Separation of Students and Users in NestJS Mic...,"Considering this, creating users, contacts, an...","But still, if you inject repository for <em>on..."
1,Flutter Clean Architecture,I created entity class on business layer and a...,Business Layer (Domain Layer):\n\nThe business...
2,Correct .NET Architecture for long running asy...,How do I handle re-attaching the UI web page t...,"<a href=""https://i.sstatic.net/XZbMs.png"" rel=..."
3,Architecture for white-label mobile apps with ...,Advise on what approach to development and sca...,<strong>Use dependency injection</strong> -&gt...
4,Implementing Data Source Selection Logic in Cl...,<strong>Approach 2</strong>\nUserRepository in...,"By moving the selection logic to the UseCase, ..."


# Evaluation of issue-solution pairs extracted by BertSum

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_summaries_at_sentence_level(df, ref_col, gen_col):
    precision_list = []
    recall_list = []
    f1_list = []

    for index, row in df.iterrows():
        ref_summary = row[ref_col]
        gen_summary = row[gen_col]

        if pd.isna(ref_summary) or pd.isna(gen_summary):
            continue

        # List of sentences in the ground-truth benchmark
        ref_sentences = nltk.sent_tokenize(ref_summary)

        # List of sentences in the generated issue and solution pairs
        gen_sentences = nltk.sent_tokenize(gen_summary)

        # Precision, Recall, F1 (binary classification based on exact match)
        ref_sentences_set = set(ref_sentences)
        gen_sentences_set = set(gen_sentences)

        precision = len(ref_sentences_set & gen_sentences_set) / len(gen_sentences_set) if gen_sentences_set else 0
        recall = len(ref_sentences_set & gen_sentences_set) / len(ref_sentences_set) if ref_sentences_set else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    metrics_df = pd.DataFrame({
        'precision': precision_list,
        'recall': recall_list,
        'f1': f1_list
    })

    return metrics_df

# Load your DataFrame
df = pd.read_excel('BertSum_Extracted_Issue_Solution.xlsx')

question_metrics_df = evaluate_summaries_at_sentence_level(df, 'Ground_truth_Issue_Labeled', 'Issue_Extracted')
answer_metrics_df = evaluate_summaries_at_sentence_level(df, 'Ground_truth_Solution_Labeled', 'Solution_Extracted')

question_metrics_df.columns = [f'Question_{col}' for col in question_metrics_df.columns]
answer_metrics_df.columns = [f'Answer_{col}' for col in answer_metrics_df.columns]

combined_metrics_df = pd.concat([question_metrics_df, answer_metrics_df], axis=1)

# Compute overall Precision, Recall, F1 scores
mean_question_metrics = question_metrics_df.mean()
mean_answer_metrics = answer_metrics_df.mean()

print("\nMean Precision, Recall, F1 Scores for \033[31mQuestions\033[0m:")
print(mean_question_metrics)

print("\nMean Precision, Recall, F1 Scores for \033[31mAnswers\033[0m:")
print(mean_answer_metrics)



Mean Precision, Recall, F1 Scores for [31mQuestions[0m:
Question_precision    0.039936
Question_recall       0.026438
Question_f1           0.031186
dtype: float64

Mean Precision, Recall, F1 Scores for [31mAnswers[0m:
Answer_precision    0.036976
Answer_recall       0.026708
Answer_f1           0.030522
dtype: float64
