<a href="https://colab.research.google.com/github/JeanMusenga/ASSORT-Automatic-Summarization-of-Stack-Overflow-Posts/blob/main/BertSum_Revised_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers torch nltk openpyxl swifter

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertModel
import torch
import re

In [3]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the Excel file
file_path = './saved_file'
file_path = ('366_ARPs_for_extracting_Issue_Solution_Pairs.xlsx')
data = pd.read_excel(file_path)

In [None]:
# Display the first few rows of the dataset
print(data.head())

In [6]:
#Preprocess the text data. Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define the preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers (optional, depending on your data)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize into words
    words = word_tokenize(text)
    # Remove stopwords and perform lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Preprocess using swifter for faster processing on large datasets
import swifter

data['Question_body'] = data['Question_body'].swifter.apply(lambda x: preprocess_text(x) if pd.notnull(x) else "")
data['Answer_body'] = data['Answer_body'].swifter.apply(lambda x: preprocess_text(x) if pd.notnull(x) else "")

# Verify preprocessing by printing the tokens
print("Preprocessed data:")
print(data[['Question_body', 'Answer_body']].head())

Pandas Apply:   0%|          | 0/366 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/366 [00:00<?, ?it/s]

Preprocessed data:
                                       Question_body  \
0  need help architecture pattern use nestjs proj...   
1  part file structure process fromjson tojson bu...   
2  building c net mvc web application would allow...   
3  mobile application scale white label develop r...   
4  im trying properly design application accordin...   

                                         Answer_body  
0  first talking strictly cqrs pattern designing ...  
1  purpose clean architecture fromjson tojson met...  
2  easy reliable way without splitting task multi...  
3  simple answer question think strongtailwindstr...  
4  determining source information business logic ...  


In [16]:
class BertSum:
    def __init__(self, model_name='bert-base-uncased', num_sentences=5):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.num_sentences = num_sentences

    def summarize(self, text):
        sentences = sent_tokenize(text)
        if not sentences:
            return ""

        # Tokenize sentences and obtain embeddings
        inputs = self.tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
        outputs = self.model(**inputs)
        sentence_embeddings = outputs.last_hidden_state.mean(dim=1)

        # Score sentences based on norm
        scores = torch.norm(sentence_embeddings, dim=1)

        # Select the top N sentences (up to 5 or fewer if less than 5 sentences)
        num_sentences_to_select = min(self.num_sentences, len(sentences))
        top_sentence_idxs = scores.topk(num_sentences_to_select).indices.tolist()

        # Join the top sentences to form the summary
        summary = '. '.join([sentences[idx].strip() for idx in top_sentence_idxs])

        # Ensure the summary ends with a period
        if not summary.endswith('.'):
            summary += '.'

        return summary

In [17]:
# Initialize the model
bertsum = BertSum()

In [18]:
# Apply the model to the Question_body and Answer_body column
data['Question_summary'] = data['Question_body'].apply(lambda x: bertsum.summarize(x) if pd.notnull(x) else "")
data['Answer_summary'] = data['Answer_body'].apply(lambda x: bertsum.summarize(x) if pd.notnull(x) else "")

In [19]:
# Display the summaries
summaries = data[['Question_title', 'Question_summary', 'Answer_summary']]
summaries.head()

Unnamed: 0,Question_title,Question_summary,Answer_summary
0,Separation of Students and Users in NestJS Mic...,need help architecture pattern use nestjs proj...,first talking strictly cqrs pattern designing ...
1,Flutter Clean Architecture,part file structure process fromjson tojson bu...,purpose clean architecture fromjson tojson met...
2,Correct .NET Architecture for long running asy...,building c net mvc web application would allow...,easy reliable way without splitting task multi...
3,Architecture for white-label mobile apps with ...,mobile application scale white label develop r...,simple answer question think strongtailwindstr...
4,Implementing Data Source Selection Logic in Cl...,im trying properly design application accordin...,determining source information business logic ...


In [20]:
# Display the summarized data
#print("Summarized data:")
#print(data[['Question_body', 'Question_summary', 'Answer_body', 'Answer_summary']].head())

In [21]:
# Save the summarized data to an Excel file
output_path = '/content/BertSumSummarizedData.xlsx'
data[['Question_summary', 'Answer_summary']].to_excel(output_path, index=False, engine='openpyxl')

# Verify that the file has been saved correctly. Load and verify the saved data
saved_data = pd.read_excel(output_path)
print("Saved summarized data:")
print(saved_data.head())

Saved summarized data:
                                    Question_summary  \
0  need help architecture pattern use nestjs proj...   
1  part file structure process fromjson tojson bu...   
2  building c net mvc web application would allow...   
3  mobile application scale white label develop r...   
4  im trying properly design application accordin...   

                                      Answer_summary  
0  first talking strictly cqrs pattern designing ...  
1  purpose clean architecture fromjson tojson met...  
2  easy reliable way without splitting task multi...  
3  simple answer question think strongtailwindstr...  
4  determining source information business logic ...  
