In [None]:
!pip install contractions colorama rouge-score



# Importing Necessary Libraries

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import contractions
import unicodedata
import chardet
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import networkx as nx
from rouge_score import rouge_scorer
from bs4 import BeautifulSoup
import string

# Seed NumPy's global random generator once, up front, so reruns are repeatable
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Fetch the NLTK data packages used by this notebook. Packages that are already
# present are reported as up-to-date rather than downloaded again.
# 'punkt': sentence-tokenizer models behind sent_tokenize.
nltk.download('punkt')
# 'punkt_tab': presumably the punkt data format that newer NLTK releases look up
# — confirm against the installed NLTK version.
nltk.download('punkt_tab')
# 'stopwords': corpus behind nltk.corpus.stopwords (imported above).
nltk.download('stopwords')
# 'wordnet': lexical database required by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Read the source spreadsheet into the working DataFrame
dataset = pd.read_excel(io='366_ARPs_for_extracting_Issue_Solution_Pairs.xlsx')

# Post Preprocessing

In [None]:
# Heuristic noise reduction: swap non-textual HTML elements for placeholder tokens
def clean_dataset(text):
    """Turn an HTML post body into plain text with placeholder markers.

    Non-string values are handed back unchanged. For strings, every <a>, <img>,
    <code> and <table> element is replaced (in that order) by a bracketed
    placeholder, after which the remaining markup is dropped and only the text
    content is returned.
    """
    if not isinstance(text, str):
        return text

    # (tag name, placeholder) pairs, processed in this exact order
    placeholders = (
        ('a', '[external-link]'),
        ('img', '[figure]'),
        ('code', '[code-snippet]'),
        ('table', '[table]'),
    )

    markup = BeautifulSoup(text, "html.parser")
    for tag_name, token in placeholders:
        for element in markup.find_all(tag_name):
            element.replace_with(token)

    return markup.get_text()

# Derive HTML-free copies of the question and answer bodies, one value at a time
dataset['Question_body_cleaned'] = dataset['Question_body'].map(clean_dataset)
dataset['Answer_body_cleaned'] = dataset['Answer_body'].map(clean_dataset)

In [None]:
# Module-level WordNetLemmatizer instance; preprocess_text below calls
# lemmatizer.lemmatize(...) on it, so it is created once rather than per call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Split `text` into sentences and reduce each one to lemmatized content words.

    Parameters
    ----------
    text : str
        Plain-text post body. Any non-string value (for example the NaN of an
        empty spreadsheet cell, which `clean_dataset` passes through untouched)
        is treated as an empty document instead of raising inside the tokenizer.

    Returns
    -------
    tuple[list[str], list[list[str]]]
        ``(sentences, processed_sentences)``: the sentences exactly as returned
        by `sent_tokenize`, and, position for position, the lower-cased,
        filtered and lemmatized tokens of each sentence.
    """
    if not isinstance(text, str):
        return [], []

    sentences = sent_tokenize(text)
    stop_words = set(ENGLISH_STOP_WORDS)
    punctuation = set(string.punctuation)
    extra_special_characters = set(["''", '``', '##', '>>', '<<', 'e', 'g', 'eg', 'cant', 'cannot', 'isnt', 'would', 'could', 'doesnt', 'hasnt', 'thanks', '-', ')', '\n'])
    special_characters = extra_special_characters.union(punctuation)

    processed_sentences = []

    for sentence in sentences:
        cleaned_words = [
            word
            for word in re.findall(r'\b\w+\b', sentence.lower())
            if word not in stop_words
            and word not in special_characters
            # isalpha() also rejects pure-digit tokens, so no separate isdigit() pass is needed
            and word.isalpha()
        ]

        # Store the lemmas: the lemmatized list used to be computed and then
        # discarded in favour of the raw tokens, so inflected forms of one word
        # were counted as different words downstream.
        processed_sentences.append([lemmatizer.lemmatize(word) for word in cleaned_words])

    return sentences, processed_sentences


# Tokenise the cleaned question and answer bodies; each cell becomes a
# (sentences, processed_sentences) pair
dataset['processed_question'] = dataset['Question_body_cleaned'].map(preprocess_text)
dataset['processed_answer'] = dataset['Answer_body_cleaned'].map(preprocess_text)

# Calculating Word Frequency

In [None]:
# Count how often each content word occurs across a document's tokenised sentences
def calculate_word_frequencies(sentences):
    """Tally alphabetic, non-stop-word tokens over all sentences.

    `sentences` is an iterable of token sequences. The returned dict maps each
    kept token to its number of occurrences, in order of first appearance.
    """
    counts = {}
    for tokens in sentences:
        for token in tokens:
            if token in ENGLISH_STOP_WORDS or not token.isalpha():
                continue
            counts[token] = counts.get(token, 0) + 1
    return counts

# Luhn-style scoring: a sentence's score is the summed frequency of its known words
def score_sentences_luhn(original_sentences, processed_sentences, word_freq):
    """Score every sentence by adding up the frequencies of its tokens.

    Returns a float NumPy array with one slot per entry of `original_sentences`.
    Slot i receives the sum of `word_freq[token]` over the tokens of
    `processed_sentences[i]` that appear in `word_freq`; slots without any
    known token stay at 0.
    """
    totals = np.zeros(len(original_sentences))
    for position, tokens in enumerate(processed_sentences):
        known = [word_freq[token] for token in tokens if token in word_freq]
        if known:
            totals[position] += sum(known)
    return totals

In [None]:
# Build one word-frequency table per row from the tokenised sentences
# (element 1 of each preprocessed pair)
dataset['question_word_freq'] = dataset['processed_question'].map(
    lambda pair: calculate_word_frequencies(pair[1])
)
dataset['answer_word_freq'] = dataset['processed_answer'].map(
    lambda pair: calculate_word_frequencies(pair[1])
)

In [None]:
# Score every sentence of each question and answer with that row's own frequency table.
# Each preprocessed cell is an (original sentences, tokenised sentences) pair, so it
# unpacks straight into the first two arguments of score_sentences_luhn.
dataset['question_scores'] = dataset.apply(
    lambda row: score_sentences_luhn(*row['processed_question'], row['question_word_freq']),
    axis=1,
)
dataset['answer_scores'] = dataset.apply(
    lambda row: score_sentences_luhn(*row['processed_answer'], row['answer_word_freq']),
    axis=1,
)

dataset[['question_scores', 'answer_scores']].head()

Unnamed: 0,question_scores,answer_scores
0,"[12.0, 9.0, 22.0, 7.0, 46.0, 9.0, 26.0, 48.0, ...","[7.0, 34.0, 6.0, 8.0, 47.0, 12.0, 90.0, 17.0, ..."
1,"[5.0, 7.0, 26.0, 11.0, 4.0, 1.0]","[27.0, 3.0, 40.0, 21.0, 39.0, 31.0, 3.0]"
2,"[33.0, 20.0, 30.0, 34.0, 21.0, 5.0, 8.0, 23.0,...","[9.0, 11.0, 11.0, 12.0, 18.0, 23.0, 14.0, 18.0..."
3,"[17.0, 8.0, 6.0, 1.0, 9.0, 5.0, 18.0, 4.0, 4.0]","[3.0, 27.0, 20.0, 20.0, 20.0, 26.0, 22.0, 11.0]"
4,"[36.0, 34.0, 63.0, 20.0, 11.0, 43.0, 17.0, 28....","[21.0, 15.0, 26.0, 22.0, 9.0, 13.0, 11.0]"


# Generate issue/solutions for both questions and answers

In [None]:
# Build an extractive summary (issue or solution) from the best-scoring sentences
def generate_summary(original_sentences, scores, num_sentences=6):
    """Join the highest-scoring sentences into a single string.

    Sentences are taken in descending score order (not document order), at most
    `num_sentences` of them and never more than there are sentences, and are
    separated by single spaces.
    """
    order = np.argsort(scores)[::-1]
    limit = min(num_sentences, len(original_sentences))
    return " ".join(original_sentences[order[rank]] for rank in range(limit))

# Extract the issue from each question and the solution from each answer
dataset['Issue_Extracted'] = dataset.apply(
    lambda row: generate_summary(row['processed_question'][0], row['question_scores']),
    axis=1,
)
dataset['Solution_Extracted'] = dataset.apply(
    lambda row: generate_summary(row['processed_answer'][0], row['answer_scores']),
    axis=1,
)

# Display the generated issues and solutions
summaries = dataset[['Question_title', 'Issue_Extracted', 'Solution_Extracted']]
summaries.head()

Unnamed: 0,Question_title,Issue_Extracted,Solution_Extracted
0,Separation of Students and Users in NestJS Mic...,"I am still able to do stuff I need to do, I am...",If you define relations for user (1:M addresse...
1,Flutter Clean Architecture,I created entity class on business layer and a...,Business Layer (Domain Layer):\n\nThe business...
2,Correct .NET Architecture for long running asy...,"However, if they leave the page and come back ...",The state of the operation should be marked in...
3,Architecture for white-label mobile apps with ...,It becomes a nightmare to move features across...,"Think of how Tailwind or Nativewind works, tak..."
4,Implementing Data Source Selection Logic in Cl...,Which version to show the user depends on othe...,"Your repositories might return the same model,..."


# Saving the extracted issues and solutions to a new Excel file

In [None]:
# Save the extracted issues and solutions to a new Excel file.
# The path is relative to the working directory because the evaluation step reads
# 'Luhn_Extracted_Issue_Solution.xlsx' without a directory; the previous absolute
# '/content/...' path only matched that read when the working directory was /content
# (as is normally the case on Colab).
file_path = 'Luhn_Extracted_Issue_Solution.xlsx'

dataset.to_excel(file_path, index=False, engine='openpyxl')

# Evaluation : Precision, Recall, and F1

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Same 'punkt' package that is requested near the top of the notebook; repeated here,
# presumably so this evaluation cell can be run on its own — nltk.sent_tokenize below needs it.
nltk.download('punkt')

def evaluate_summaries_at_sentence_level(df, ref_col, gen_col):
    """Compute sentence-level precision, recall and F1 for every row of `df`.

    Both texts of a row are split into sentences with `nltk.sent_tokenize` and
    compared as sets, so a sentence counts as a hit only when the generated
    string is exactly equal to a reference sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the reference and the generated texts.
    ref_col : str
        Column with the ground-truth text.
    gen_col : str
        Column with the generated (extracted) text.

    Returns
    -------
    pandas.DataFrame
        Columns ``precision``, ``recall`` and ``f1``, one row per evaluated row
        of `df`. Rows in which either text is missing are skipped. The result
        keeps the index labels of the rows it was computed from, so metrics of
        different column pairs can be aligned with each other (e.g. via
        ``pd.concat(..., axis=1)``) and traced back to `df` even when some rows
        were skipped.
    """
    kept_labels = []
    precision_list = []
    recall_list = []
    f1_list = []

    for label, ref_summary, gen_summary in zip(df.index, df[ref_col], df[gen_col]):
        if pd.isna(ref_summary) or pd.isna(gen_summary):
            continue

        # Unique sentences of the ground-truth benchmark
        ref_sentences_set = set(nltk.sent_tokenize(ref_summary))
        # Unique sentences of the generated issue / solution
        gen_sentences_set = set(nltk.sent_tokenize(gen_summary))

        matched = len(ref_sentences_set & gen_sentences_set)

        # An empty sentence set or a zero denominator scores 0 instead of dividing by zero
        precision = matched / len(gen_sentences_set) if gen_sentences_set else 0
        recall = matched / len(ref_sentences_set) if ref_sentences_set else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0

        kept_labels.append(label)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    metrics_df = pd.DataFrame(
        {
            'precision': precision_list,
            'recall': recall_list,
            'f1': f1_list,
        },
        index=kept_labels,
    )

    return metrics_df

# Load DataFrame containing the extracted issues and solutions
df = pd.read_excel('Luhn_Extracted_Issue_Solution.xlsx')

# Per-row metrics: extracted issues vs. labelled issues, extracted solutions vs. labelled solutions
question_metrics_df = evaluate_summaries_at_sentence_level(df, 'Ground_truth_Issue_Labeled', 'Issue_Extracted')
answer_metrics_df = evaluate_summaries_at_sentence_level(df, 'Ground_truth_Solution_Labeled', 'Solution_Extracted')

# Prefix the column names so both metric sets can sit side by side in one table
question_metrics_df.columns = [f'Question_{col}' for col in question_metrics_df.columns]
answer_metrics_df.columns = [f'Answer_{col}' for col in answer_metrics_df.columns]

combined_metrics_df = pd.concat([question_metrics_df, answer_metrics_df], axis=1)

# Compute overall Precision, Recall, F1 scores
mean_question_metrics = question_metrics_df.mean()
mean_answer_metrics = answer_metrics_df.mean()

# \033[31m ... \033[0m are ANSI escape codes that print the enclosed label in red
print("\nMean Precision, Recall, F1 Scores for \033[31mQuestions/Issues\033[0m:")
print(mean_question_metrics)

# Answers are evaluated against the labelled solutions, so the label reads
# "Answers/Solutions" (it used to say "Answers/Issue")
print("\nMean Precision, Recall, F1 Scores for \033[31mAnswers/Solutions\033[0m:")
print(mean_answer_metrics)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Mean Precision, Recall, F1 Scores for [31mQuestions[0m:
Question_precision    0.601776
Question_recall       0.592653
Question_f1           0.595404
dtype: float64

Mean Precision, Recall, F1 Scores for [31mAnswers[0m:
Answer_precision    0.591894
Answer_recall       0.572834
Answer_f1           0.580010
dtype: float64
