<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/LSA_TextSummarizationWith.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD

In [3]:
# Load the pilot sample workbook; the later cells read its Question_body and
# Answer_body columns.
data = pd.read_excel('DataSampePilot.xlsx')

# Preprocessing function

In [4]:
# Preprocessing function
def preprocess_text(text):
    """Split ``text`` into sentences and tokenize each sentence.

    Sentences are split wherever '.', '!' or '?' is followed by whitespace
    (spaces, tabs or newlines). Each sentence is then lower-cased, broken
    into word tokens and filtered against ``ENGLISH_STOP_WORDS``.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (how pandas typically represents an
        empty cell) are treated as empty text. Any other non-string value
        still raises ``TypeError`` from ``re.split``.

    Returns
    -------
    tuple[list[str], list[list[str]]]
        ``(sentences, processed_sentences)``: the original sentences with
        surrounding whitespace removed, and the matching list of token
        lists. Both lists have the same length and order; a sentence made up
        only of stop words keeps its slot with an empty token list.
    """
    # Missing values are not strings, and re.split would raise TypeError on
    # them; NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return [], []

    # Split on any whitespace after sentence-ending punctuation so sentences
    # separated by newlines are not fused together, then drop blank pieces.
    stripped = (fragment.strip() for fragment in re.split(r'(?<=[.!?])\s+', text))
    sentences = [sentence for sentence in stripped if sentence]

    processed_sentences = []
    for sentence in sentences:
        words = re.findall(r'\b\w+\b', sentence.lower())  # Tokenization
        processed_sentences.append(
            [word for word in words if word not in ENGLISH_STOP_WORDS]
        )

    return sentences, processed_sentences

# Build term-document matrix function

In [5]:
# Build term-document matrix function
def build_term_document_matrix(processed_sentences):
    """Turn tokenized sentences into a TF-IDF matrix.

    Each token list is re-joined with single spaces so it can be fed to a
    freshly created ``TfidfVectorizer`` (default settings). The vectorizer is
    fitted on those strings and the matrix it produces is returned.
    """
    documents = []
    for tokens in processed_sentences:
        documents.append(" ".join(tokens))
    return TfidfVectorizer().fit_transform(documents)

# Apply LSA

In [8]:
# Apply LSA
def apply_lsa(tfidf_matrix, num_topics=1, random_state=42):
    """Fit a truncated SVD (LSA) model on a TF-IDF matrix.

    Parameters
    ----------
    tfidf_matrix : matrix accepted by ``TruncatedSVD.fit_transform``
        The TF-IDF matrix to decompose, e.g. the output of
        ``build_term_document_matrix``.
    num_topics : int, default 1
        Number of SVD components to keep.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD``. The default solver is randomized,
        so fixing the seed keeps the scores, and therefore the summaries,
        reproducible across runs. Pass ``None`` to leave it unseeded.

    Returns
    -------
    tuple
        ``(lsa, lsa_matrix)``: the fitted ``TruncatedSVD`` estimator and the
        transformed matrix returned by ``fit_transform``.
    """
    lsa = TruncatedSVD(
        n_components=num_topics,
        n_iter=100,
        random_state=random_state,
    )
    lsa_matrix = lsa.fit_transform(tfidf_matrix)
    return lsa, lsa_matrix

# Rank sentences

In [6]:
# Rank sentences
def rank_sentences(lsa_matrix):
    """Score each row of ``lsa_matrix`` by the mean of its values.

    The matrix is averaged along axis 1, so the result holds one score per
    row of the input.
    """
    return np.mean(lsa_matrix, axis=1)

# Generate summary function

In [7]:
# Generate summary function
def generate_summary(original_sentences, scores, num_sentences=3):
    """Join the highest-scoring sentences into a single summary string.

    Sentences are taken in descending score order (not document order). At
    most ``num_sentences`` are used, capped at the number of sentences
    available, and the chosen sentences are joined with single spaces.
    """
    # argsort is ascending, so reverse it to put the best score first.
    best_first = np.argsort(scores)[::-1]
    how_many = min(num_sentences, len(original_sentences))
    chosen = [original_sentences[best_first[position]] for position in range(how_many)]
    return " ".join(chosen)

In [9]:
# Run preprocess_text over the question and answer bodies; each new column
# stores the value preprocess_text returns for that row.
for body_column, processed_column in (
    ('Question_body', 'processed_question'),
    ('Answer_body', 'processed_answer'),
):
    data[processed_column] = data[body_column].apply(preprocess_text)

In [10]:
# Build one TF-IDF matrix per row from the tokenized sentences, which are
# element 1 of the stored preprocessing result.
for processed_column, tfidf_column in (
    ('processed_question', 'question_tfidf_matrix'),
    ('processed_answer', 'answer_tfidf_matrix'),
):
    data[tfidf_column] = data[processed_column].apply(
        lambda processed: build_term_document_matrix(processed[1])
    )


In [11]:
# Fit LSA on every TF-IDF matrix. apply_lsa returns a (model, matrix) pair per
# row; zip(*...) regroups those pairs into one tuple of models and one tuple
# of transformed matrices, which are stored as separate columns.
question_models, question_matrices = zip(*data['question_tfidf_matrix'].apply(apply_lsa))
data['question_lsa'] = question_models
data['question_lsa_matrix'] = question_matrices

answer_models, answer_matrices = zip(*data['answer_tfidf_matrix'].apply(apply_lsa))
data['answer_lsa'] = answer_models
data['answer_lsa_matrix'] = answer_matrices


In [12]:
# Score the sentences of every question and answer from its LSA matrix
for matrix_column, scores_column in (
    ('question_lsa_matrix', 'question_scores'),
    ('answer_lsa_matrix', 'answer_scores'),
):
    data[scores_column] = data[matrix_column].apply(rank_sentences)


In [13]:
# Build the summaries row by row: pass each row's original sentences
# (element 0 of the preprocessing result) and its sentence scores to
# generate_summary, first for the questions and then for the answers.
for processed_column, scores_column, summary_column in (
    ('processed_question', 'question_scores', 'question_summary'),
    ('processed_answer', 'answer_scores', 'answer_summary'),
):
    data[summary_column] = data.apply(
        lambda row: generate_summary(row[processed_column][0], row[scores_column]),
        axis=1,
    )


In [14]:
# Keep only the two generated summary columns and preview the first rows
summary_columns = ['question_summary', 'answer_summary']
summaries = data[summary_columns]
summaries.head()

Unnamed: 0,question_summary,answer_summary
0,Would it be all about querying some API over H...,"You send a request, you get a response. In ord..."
1,I have some spring boot microservices and I wa...,HTML pages generated by Spring MVC may still c...
2,On which layer should I implement the logic fo...,Determining the source of the information is b...
3,I can imagine some ways that I could call an A...,if you can't find it I can try to put of a blo...
4,Also I would like to know what is standard/pre...,It would also facilitate SSL (in addition to t...
