<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextRank_TextSummarizationWith.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import pandas as pd

In [None]:
# Fetch the NLTK resources used below: Punkt sentence/word tokenizer data and
# the English stop-word list. Recent NLTK releases load the 'punkt_tab'
# resource for tokenization, while older releases load 'punkt', so both are
# requested to keep the notebook runnable on a fresh kernel either way.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Preprocessing the Text
First, let's preprocess the text data in the Question_body and Answer_body columns.

In [None]:
# Load the pilot sample; later cells read its Question_body and Answer_body columns.
dataset = pd.read_excel(
    'DataSampePilot.xlsx'
)

In [None]:
def preprocess_text(text):
    """Split a text into sentences and into cleaned word lists per sentence.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the ``NaN``
        pandas uses for an empty cell) is treated as "no text".

    Returns
    -------
    tuple[list[str], list[list[str]]]
        ``(sentences, processed_sentences)`` where ``sentences`` are the
        original sentences as returned by ``sent_tokenize`` and
        ``processed_sentences[i]`` holds the lower-cased tokens of
        ``sentences[i]`` with English stop words and punctuation-only tokens
        removed. Both lists are empty when there is no text.
    """
    # Missing cells arrive as None/NaN; tokenizing them would raise, so
    # return the empty result before touching the tokenizer.
    if not isinstance(text, str):
        return [], []

    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    processed_sentences = []

    for sentence in sentences:
        words = [
            word
            for word in word_tokenize(sentence.lower())
            if word not in stop_words
            # Drop tokens made up solely of punctuation characters. A plain
            # `word in string.punctuation` is a substring test and would let
            # multi-character tokens such as "..." or "``" through.
            and not all(char in punctuation_chars for char in word)
        ]
        processed_sentences.append(words)

    return sentences, processed_sentences

# Run preprocess_text over each body column; every cell of the new column
# holds the (sentences, processed_sentences) tuple returned for that row.
for source_column, target_column in (
    ('Question_body', 'processed_question'),
    ('Answer_body', 'processed_answer'),
):
    dataset[target_column] = dataset[source_column].apply(preprocess_text)

dataset[['processed_question', 'processed_answer']].head()


Unnamed: 0,processed_question,processed_answer
0,"([Kinda new to AWS., I have this high-level qu...","([You send a request, you get a response., In ..."
1,([I have some spring boot microservices and I ...,([<blockquote>\ntl;dr: Spring MVC will not con...
2,([I'm trying to properly design an application...,([Determining the source of the information is...
3,([I heard that for .NET8 Microsoft gifted us w...,([I have always asked myself this very same qu...
4,"([I am trying to learn AWS services, and now i...","([Short answer is: no, you don't have to but y..."


# Building the Similarity Matrix
Next, we'll build the similarity matrix for the preprocessed sentences using TF-IDF and calculate cosine similarities.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_similarity_matrix(processed_sentences):
    """Return the pairwise cosine similarity of sentences under TF-IDF.

    Parameters
    ----------
    processed_sentences : list[list[str]]
        One list of tokens per sentence.

    Returns
    -------
    numpy.ndarray
        An ``n x n`` matrix, ``n = len(processed_sentences)``, whose entry
        ``[i, j]`` is the cosine similarity between the TF-IDF vectors of
        sentences ``i`` and ``j``. When no sentence contributes any term to
        the vocabulary the matrix is all zeros.
    """
    sentences = [" ".join(sentence) for sentence in processed_sentences]
    sentence_count = len(sentences)

    # With no sentences, or only empty ones, there is nothing to vectorize.
    if not any(sentence.strip() for sentence in sentences):
        return np.zeros((sentence_count, sentence_count))

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # TfidfVectorizer re-tokenizes the joined strings and raises
        # ValueError when that leaves an empty vocabulary; treat such input
        # as "no sentence resembles any other" instead of aborting the run.
        return np.zeros((sentence_count, sentence_count))

    return cosine_similarity(tfidf_matrix)

# Each processed_* cell is a (sentences, processed_sentences) tuple; the
# similarity matrix is built from the token lists at index 1.
dataset['question_similarity_matrix'] = dataset['processed_question'].apply(
    lambda pair: build_similarity_matrix(pair[1])
)
dataset['answer_similarity_matrix'] = dataset['processed_answer'].apply(
    lambda pair: build_similarity_matrix(pair[1])
)

dataset[['question_similarity_matrix', 'answer_similarity_matrix']].head()


Unnamed: 0,question_similarity_matrix,answer_similarity_matrix
0,"[[1.0000000000000002, 0.0, 0.0, 0.195561002883...","[[1.0000000000000002, 0.28406646613058584, 0.0..."
1,"[[1.0, 0.36705691615797376, 0.0865249736265545...","[[1.0, 0.03225581303269778, 0.0, 0.24721627282..."
2,"[[1.0, 0.09260356758339026, 0.0327609480967487...","[[1.0, 0.12601290906721013, 0.1169803562364746..."
3,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12...","[[1.0000000000000004, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[[1.0, 0.15402619283158595, 0.0874518910758340...","[[1.0, 0.09744854253490878, 0.0, 0.0, 0.0, 0.0..."


# Applying TextRank Algorithm

In [None]:
import numpy as np

def text_rank(similarity_matrix, damping_factor=0.85, max_iter=100, tol=1e-4):
    """Score sentences with weighted TextRank over a similarity graph.

    Each sentence ``i`` is updated as

        score[i] = (1 - d) + d * sum_j (w[j, i] / sum_k w[j, k]) * score[j]

    so a neighbour ``j`` spreads its score over its edges in proportion to
    their weight. Dividing by the receiving row's own sum instead makes the
    update row-stochastic, whose fixed point is the same score for every
    sentence, which is why the normalisation is done per neighbour here.

    Parameters
    ----------
    similarity_matrix : array-like, shape (n, n)
        Pairwise sentence similarities. The input is not modified.
    damping_factor : float
        The damping factor ``d`` in the formula above.
    max_iter : int
        Maximum number of update sweeps.
    tol : float
        Iteration stops once the Euclidean norm of the change in scores
        between two sweeps falls below this value.

    Returns
    -------
    numpy.ndarray, shape (n,)
        One score per sentence. Sentences with no similar neighbour receive
        the base score ``1 - damping_factor``; the result never contains NaN
        for finite input.
    """
    # Work on a float copy so the caller's matrix is left untouched.
    weights = np.array(similarity_matrix, dtype=float)
    n = weights.shape[0]
    if n == 0:
        return np.zeros(0)

    # Self-similarity carries no ranking information and would let each
    # sentence vote for itself.
    np.fill_diagonal(weights, 0.0)

    # Total outgoing weight of every sentence. Rows that sum to zero (a
    # sentence sharing no term with any other) are divided by 1 instead,
    # leaving an all-zero row rather than producing 0/0 = NaN.
    out_weight = weights.sum(axis=1)
    safe_out_weight = np.where(out_weight > 0, out_weight, 1.0)
    transition = weights / safe_out_weight[:, None]

    scores = np.ones(n) / n
    for _ in range(max_iter):
        prev_scores = scores
        # transition.T @ prev_scores gathers, for each sentence i, the share
        # of score every neighbour j sends along edge j -> i.
        scores = (1 - damping_factor) + damping_factor * (transition.T @ prev_scores)
        if np.linalg.norm(scores - prev_scores) < tol:
            break
    return scores

# Apply TextRank to the similarity matrices for both questions and answers.
# Both columns are read from `dataset`, the only DataFrame this notebook
# defines, so the cell runs on a fresh kernel.
dataset['question_scores'] = dataset['question_similarity_matrix'].apply(text_rank)
dataset['answer_scores'] = dataset['answer_similarity_matrix'].apply(text_rank)

dataset[['question_scores', 'answer_scores']].head()


Unnamed: 0,question_scores,answer_scores
0,"[0.9998933023916312, 0.9998933023916314, 0.999...","[0.9998559017305455, 0.9998559017305453, 0.999..."
1,"[0.99976338826903, 0.99976338826903, 0.9997633...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2,"[0.9998769696694945, 0.9998769696694942, 0.999...","[0.9998168371689724, 0.9998168371689723, 0.999..."
3,"[0.9998764871976102, 0.9998764871976104, 0.999...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,"[0.9997905000298701, 0.9997905000298701, 0.999...","[0.9998348759326341, 0.9998348759326342, 0.999..."


# Generating the Summary

In [None]:
def generate_summary(original_sentences, scores, num_sentences=3):
    """Join the highest-scoring sentences into one summary string.

    Parameters
    ----------
    original_sentences : sequence of str
        Candidate sentences, indexed in step with ``scores``.
    scores : array-like
        One score per sentence.
    num_sentences : int
        Upper bound on how many sentences to keep; fewer are used when the
        text has fewer sentences.

    Returns
    -------
    str
        The selected sentences separated by single spaces, ordered from the
        highest score downwards.
    """
    # argsort is ascending, so reverse it to walk from best to worst.
    order = np.argsort(scores)[::-1]
    top_count = min(num_sentences, len(original_sentences))
    chosen = [original_sentences[order[position]] for position in range(top_count)]
    return " ".join(chosen)

# Build one summary per row: index 0 of each processed_* tuple holds the
# original sentences, which are ranked by the matching *_scores column.
dataset['question_summary'] = dataset.apply(
    lambda row: generate_summary(row['processed_question'][0], row['question_scores']),
    axis=1,
)
dataset['answer_summary'] = dataset.apply(
    lambda row: generate_summary(row['processed_answer'][0], row['answer_scores']),
    axis=1,
)

# Show the first few summaries.
dataset[['question_summary', 'answer_summary']].head()


Unnamed: 0,question_summary,answer_summary
0,"While waiting for a response, the react app wi...","If it waits until something is resolved, you w..."
1,"I mean, designing the client side app using Sp...",</blockquote> Any basic REST API can encapsula...
2,"I'm not sure if it's relevant to my question, ...",Put in an interface for RemoteUserContactsData...
3,I can imagine some ways that I could call an A...,:-) Feel free to include a sign-up flow too if...
4,"For instance, lets consider these services are...",Imagine the case where someone deploys an inse...
