<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextRank_TextSummarization_WithEffectiveToknization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Important resources:
https://chatgpt.com/share/0fde4435-eb1d-4ab1-9149-78c0899251e0

# Importing necessary libraries

In [None]:
pip install contractions

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import pandas as pd
import contractions
import unicodedata
import chardet

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Preprocessing the Text
First, let's preprocess the text data in the Question_body and Answer_body columns.

In [None]:
dataset=pd.read_excel('DataSampePilot.xlsx')

In [None]:
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    processed_sentences = []

    for sentence in sentences:
        cleaned_words = word_tokenize(sentence.lower())
        cleaned_words = [word for word in cleaned_words if word not in stop_words and word not in string.punctuation]

        punctuation = list(set(string.punctuation))
        extra_special_characters = ["''", '``', '##','>>', '<<','e', 'g', 'eg', 'cant', 'cannot', 'isnt', 'would', 'could', 'doesnt', 'hasnt', 'thanks', '-', ')', '\n']
        special_characters = [c for c in extra_special_characters if c not in punctuation] + punctuation
        cleaned_words = [word for word in cleaned_words if word not in special_characters]  # Remove special characters
        cleaned_words = [word for word in cleaned_words if not word.isdigit()]              # Remove numbers
        cleaned_words = [word for word in cleaned_words if word.encode('utf-8').isalpha()]  # Remove special variable names

        processed_sentences.append(cleaned_words)

    return sentences, processed_sentences

In [None]:
# Preprocess the Question_body and Answer_body columns
dataset['processed_question'] = dataset['Question_body'].apply(preprocess_text)
dataset['processed_answer'] = dataset['Answer_body'].apply(preprocess_text)

dataset[['processed_question', 'processed_answer']].head()

Unnamed: 0,processed_question,processed_answer
0,"([Kinda new to AWS., I have this high-level qu...","([You send a request, you get a response., In ..."
1,([I have some spring boot microservices and I ...,([<blockquote>\ntl;dr: Spring MVC will not con...
2,([I'm trying to properly design an application...,([Determining the source of the information is...
3,([I heard that for .NET8 Microsoft gifted us w...,([I have always asked myself this very same qu...
4,"([I am trying to learn AWS services, and now i...","([Short answer is: no, you don't have to but y..."


# Building the Similarity Matrix
Next, we built the similarity matrix for the preprocessed sentences using TF-IDF and calculate cosine similarities.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_similarity_matrix(processed_sentences):
    sentences = [" ".join(sentence) for sentence in processed_sentences]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    return similarity_matrix

# Build similarity matrices for both processed questions and answers
dataset['question_similarity_matrix'] = dataset['processed_question'].apply(lambda x: build_similarity_matrix(x[1]))
dataset['answer_similarity_matrix'] = dataset['processed_answer'].apply(lambda x: build_similarity_matrix(x[1]))

dataset[['question_similarity_matrix', 'answer_similarity_matrix']].head()


Unnamed: 0,question_similarity_matrix,answer_similarity_matrix
0,"[[1.0, 0.0, 0.0, 0.22805744768427957, 0.0, 0.0...","[[1.0000000000000002, 0.3056819385972427, 0.0,..."
1,"[[1.0000000000000002, 0.4306667306746475, 0.10...","[[1.0, 0.0, 0.0, 0.24362602763568547, 0.045602..."
2,"[[1.0, 0.07100128918503626, 0.0, 0.30960776281...","[[1.0000000000000002, 0.17911946739679466, 0.0..."
3,"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08...","[[1.0000000000000002, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"[[1.0, 0.15402619283158595, 0.0861583441336023...","[[1.0, 0.10358784971785488, 0.0, 0.0, 0.0, 0.0..."


# Applying TextRank Algorithm

In [None]:
import numpy as np

def text_rank(similarity_matrix, damping_factor=0.85, max_iter=100, tol=1e-4):
    n = similarity_matrix.shape[0]
    scores = np.ones(n) / n
    for _ in range(max_iter):
        prev_scores = scores.copy()
        for i in range(n):
            scores[i] = (1 - damping_factor) + damping_factor * np.sum(similarity_matrix[i] * prev_scores / np.sum(similarity_matrix[i]))
        if np.linalg.norm(scores - prev_scores) < tol:
            break
    return scores

# Apply TextRank to the similarity matrices for both questions and answers
dataset['question_scores'] = dataset['question_similarity_matrix'].apply(text_rank)
dataset['answer_scores'] = dataset['answer_similarity_matrix'].apply(text_rank)

dataset[['question_scores', 'answer_scores']].head()


  scores[i] = (1 - damping_factor) + damping_factor * np.sum(similarity_matrix[i] * prev_scores / np.sum(similarity_matrix[i]))
  scores[i] = (1 - damping_factor) + damping_factor * np.sum(similarity_matrix[i] * prev_scores / np.sum(similarity_matrix[i]))


Unnamed: 0,question_scores,answer_scores
0,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[0.9998559017305455, 0.9998559017305455, 0.999..."
1,"[0.9997633882690298, 0.9997633882690299, 0.999...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2,"[0.9998769696694947, 0.9998769696694949, 0.999...","[0.9998168371689721, 0.999816837168972, 0.9998..."
3,"[0.9998764871976101, 0.9998764871976102, 0.999...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,"[0.9997905000298699, 0.9997905000298698, 0.999...","[0.9998348759326342, 0.999834875932634, 0.9998..."


# Generating the Summary

In [None]:
def generate_summary(original_sentences, scores, num_sentences=10):
    ranked_sentence_indices = np.argsort(scores)[::-1]

    summary = []
    for i in range(min(num_sentences, len(original_sentences))):
        summary.append(original_sentences[ranked_sentence_indices[i]])

    return " ".join(summary)

# Generate summaries for both questions and answers
dataset['question_summary'] = dataset.apply(lambda x: generate_summary(x['processed_question'][0], x['question_scores']), axis=1)
dataset['answer_summary'] = dataset.apply(lambda x: generate_summary(x['processed_answer'][0], x['answer_scores']), axis=1)

# Display the summaries
dataset[['question_summary', 'answer_summary']].head()


Unnamed: 0,question_summary,answer_summary
0,Thanks! One that sends a queue and the other t...,Lambda is charged per miliseconds running. The...
1,"<a href=""https://i.sstatic.net/BnByRPzu.png"" r...",</blockquote> Any basic REST API can encapsula...
2,"I'm not sure if it's relevant to my question, ...",This &quot;feels&quot; wrong as it seems like ...
3,I can imagine some ways that I could call an A...,:-) Feel free to include a sign-up flow too if...
4,Expose service as REST endpoints\nFeature like...,"In any cases, don't do security at the service..."


In [None]:
# Function to detect encoding and decode
def detect_and_fix_encoding(text):
    result = chardet.detect(text.encode())
    encoding = result['encoding']
    return text.encode(encoding).decode('utf-8')

# Apply detect_and_fix_encoding to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(detect_and_fix_encoding)
dataset['answer_summary'] = dataset['answer_summary'].apply(detect_and_fix_encoding)

In [None]:
# Normalize unicode characters
def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text)

# Apply normalize_unicode to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(normalize_unicode)
dataset['answer_summary'] = dataset['answer_summary'].apply(normalize_unicode)

In [None]:
# Apply contractions.fix to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(lambda x: contractions.fix(x))
dataset['answer_summary'] = dataset['answer_summary'].apply(lambda x: contractions.fix(x))

In [None]:
# Function to replace problematic characters
def replace_problematic_characters(text):
    replacements = {
        'â€™': "'",
        'â€œ': '"',
        'â€': '"',
        'â€TM': "'",
        'â€¦': " ",


        # Add more replacements if necessary
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

In [None]:
# Apply replace_problematic_characters to each element in the specified columns
dataset['question_summary'] = dataset['question_summary'].apply(replace_problematic_characters)
dataset['answer_summary'] = dataset['answer_summary'].apply(replace_problematic_characters)

In [None]:
# Save the summarized data to a new Excel file
file_path = '/content/TextRank_SummarizedData.xlsx'

dataset.to_excel(file_path, index=False, engine='openpyxl')

In [None]:
# Verify that the file has been saved correctly
saved_data = pd.read_excel(file_path)
print(saved_data[['question_summary', 'answer_summary']].head())

                                    question_summary  \
0  Thanks! One that sends a queue and the other t...   
1  <a href="https://i.sstatic.net/BnByRPzu.png" r...   
2  I am not sure if it is relevant to my question...   
3  I can imagine some ways that I could call an A...   
4  Expose service as REST endpoints\nFeature like...   

                                      answer_summary  
0  Lambda is charged per miliseconds running. The...  
1  </blockquote> Any basic REST API can encapsula...  
2  This &quot;feels&quot; wrong as it seems like ...  
3  :-) Feel free to include a sign-up flow too if...  
4  In any cases, do not do security at the servic...  
