<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/LSA_TextSummarizationWith.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Important resources:
https://chatgpt.com/share/0fde4435-eb1d-4ab1-9149-78c0899251e0

# Importing necessary libraries

In [1]:
pip install contractions



In [2]:
pip install chardet



In [3]:
import re
import string
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
import contractions
import unicodedata
import chardet

In [29]:
dataset=pd.read_excel('DataSampePilot.xlsx')

In [30]:
# converting all object types in string
dataset['Question_body'] = dataset['Question_body'].astype(str)
dataset['Answer_body'] = dataset['Answer_body'].astype(str)

# Preprocessing function

In [31]:
# Preprocessing function
def preprocess_text(text):
    sentences = re.split(r'(?<=[.!?]) +', text)  # Simple sentence tokenizer
    #sentences = text.replace('\r', '').replace('\n', '').replace('\t', '')
    #sentences = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|-|\#)*\b(\/)?', '', text, flags=re.MULTILINE)  # repacing URL
    #sentences = re.sub(r'(\/.*?\/)+.*? ', '', text, flags=re.MULTILINE)
    #sentences = re.sub(r'(www\.)\S+', '', text, flags=re.MULTILINE)
    #sentences = re.sub(r'  ', ' ', text, flags=re.MULTILINE)
    #sentences = text.replace('\r', '').replace('\n', '').replace('\t', '')
    processed_sentences = []

    for sentence in sentences:
        words = re.findall(r'\b\w+\b', sentence.lower())  # Tokenization
        cleaned_words = [word for word in words if word not in ENGLISH_STOP_WORDS]
        punctuation = list(set(string.punctuation))
        extra_special_characters = ["''", '``', '##','>>', '<<','e', 'g', 'eg', 'cant', 'cannot', 'isnt', 'would', 'could', 'doesnt', 'hasnt', 'thanks', '-', ')', '\n']
        special_characters = [c for c in extra_special_characters if c not in punctuation] + punctuation
        cleaned_words = [word for word in cleaned_words if word not in special_characters]  # Remove special characters
        cleaned_words = [word for word in cleaned_words if not word.isdigit()]              # Remove numbers
        cleaned_words = [word for word in cleaned_words if word.encode('utf-8').isalpha()]  # Remove special variable names
        processed_sentences.append(words)

    return sentences, processed_sentences

# Build term-document matrix function

In [12]:
# Build term-document matrix function
def build_term_document_matrix(processed_sentences):
    sentences = [" ".join(sentence) for sentence in processed_sentences]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    return tfidf_matrix

# Apply LSA

In [13]:
# Apply LSA
def apply_lsa(tfidf_matrix, num_topics=1):
    lsa = TruncatedSVD(n_components=num_topics, n_iter=100)
    lsa_matrix = lsa.fit_transform(tfidf_matrix)
    return lsa, lsa_matrix

# Rank sentences

In [14]:
# Rank sentences
def rank_sentences(lsa_matrix):
    scores = np.mean(lsa_matrix, axis=1)
    return scores

# Generate summary function

In [15]:
# Generate summary function
def generate_summary(original_sentences, scores, num_sentences=10):
    ranked_sentence_indices = np.argsort(scores)[::-1]
    summary = []
    for i in range(min(num_sentences, len(original_sentences))):
        summary.append(original_sentences[ranked_sentence_indices[i]])
    return " ".join(summary)

# Preprocess the Question_body and Answer_body columns

In [16]:
# Preprocess the Question_body and Answer_body columns
dataset['processed_question'] = dataset['Question_body'].apply(preprocess_text)
dataset['processed_answer'] = dataset['Answer_body'].apply(preprocess_text)

# Apply term-document matrix building

In [17]:
# Apply term-document matrix building
dataset['question_tfidf_matrix'] = dataset['processed_question'].apply(lambda x: build_term_document_matrix(x[1]))
dataset['answer_tfidf_matrix'] = dataset['processed_answer'].apply(lambda x: build_term_document_matrix(x[1]))


# Apply LSA to the term-document matrices for both questions and answers

In [18]:
# Apply LSA to the term-document matrices for both questions and answers
dataset['question_lsa'], dataset['question_lsa_matrix'] = zip(*dataset['question_tfidf_matrix'].apply(apply_lsa))
dataset['answer_lsa'], dataset['answer_lsa_matrix'] = zip(*dataset['answer_tfidf_matrix'].apply(apply_lsa))


  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var


# Rank sentences

In [19]:
# Rank sentences
dataset['question_scores'] = dataset['question_lsa_matrix'].apply(rank_sentences)
dataset['answer_scores'] = dataset['answer_lsa_matrix'].apply(rank_sentences)


# Generate summaries for both questions and answers

In [20]:
# Generate summaries for both questions and answers
dataset['question_summary'] = dataset.apply(lambda x: generate_summary(x['processed_question'][0], x['question_scores']), axis=1)
dataset['answer_summary'] = dataset.apply(lambda x: generate_summary(x['processed_answer'][0], x['answer_scores']), axis=1)


In [21]:
# Display the summaries
summaries = dataset[['question_summary', 'answer_summary']]
summaries.head()

Unnamed: 0,question_summary,answer_summary
0,What would trigger that 2nd Lamba (if weâ€TMr...,Therefore it will be super-expensive compare t...
1,If I use Spring MVC for designing the client s...,"It can be part of your Spring MVC application,..."
2,On which layer should I implement the logic fo...,"Your repositories might return the same model,..."
3,"Instead, it is really the WebAPI that should d...",if you cannot find it I can try to put of a bl...
4,"I am trying to learn AWS services, and now it ...",It would also facilitate SSL (in addition to t...


In [22]:
# Function to detect encoding and decode
def detect_and_fix_encoding(text):
    result = chardet.detect(text.encode())
    encoding = result['encoding']
    return text.encode(encoding).decode('utf-8')

# Apply detect_and_fix_encoding to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(detect_and_fix_encoding)
dataset['answer_summary'] = dataset['answer_summary'].apply(detect_and_fix_encoding)

In [23]:
# Normalize unicode characters
def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text)

# Apply normalize_unicode to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(normalize_unicode)
dataset['answer_summary'] = dataset['answer_summary'].apply(normalize_unicode)

In [24]:
# Apply contractions.fix to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(lambda x: contractions.fix(x))
dataset['answer_summary'] = dataset['answer_summary'].apply(lambda x: contractions.fix(x))

In [25]:
# Function to replace problematic characters
def replace_problematic_characters(text):
    replacements = {
        'â€™': "'",
        'â€œ': '"',
        'â€': '"',
        'â€TM': "'",
        'â€¦': " ",


        # Add more replacements if necessary
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

In [26]:
# Apply replace_problematic_characters to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(replace_problematic_characters)
dataset['answer_summary'] = dataset['answer_summary'].apply(replace_problematic_characters)

In [27]:
# Save the summarized data to a new Excel file
file_path = '/content/TextRank_LSA_SummarizedData.xlsx'

dataset.to_excel(file_path, index=False, engine='openpyxl')

In [32]:
# Verify that the file has been saved correctly
saved_data = pd.read_excel(file_path)
print(saved_data[['question_summary', 'answer_summary']].head())

                                    question_summary  \
0  What would trigger that 2nd Lamba (if we're do...   
1  If I use Spring MVC for designing the client s...   
2  On which layer should I implement the logic fo...   
3  Instead, it is really the WebAPI that should d...   
4  I am trying to learn AWS services, and now it ...   

                                      answer_summary  
0  Therefore it will be super-expensive compare t...  
1  It can be part of your Spring MVC application,...  
2  Your repositories might return the same model,...  
3  if you cannot find it I can try to put of a bl...  
4  It would also facilitate SSL (in addition to t...  
