<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/TextRank_Using_SpaCy_and_pytextrank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://chatgpt.com/share/2cc1719b-b534-4cbb-9a11-aece17986c97

# Step 1: Install Necessary Libraries

In [None]:
pip install spacy pytextrank

In [9]:
!python -m spacy download en_core_web_sm

In [13]:
# Import libraries
import pandas as pd
import spacy
import pytextrank

Step 2: Load the Dataset

In [None]:
# Load the dataset
file_path = 'DataSampePilot.xlsx'  # Update with the correct path
dataset = pd.read_excel(file_path)

In [None]:
# Display the first few rows of the dataset
print(dataset.head())

In [14]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [15]:
# Add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7f538a3af250>

# Step 3: Tokenize the Text

In [21]:
# Define a function to tokenize text
def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [22]:
# Apply tokenization to 'Question_body' and 'Answer_body' columns
dataset['question_tokens'] = dataset['Question_body'].apply(lambda x: tokenize(str(x)))
dataset['answer_tokens'] = dataset['Answer_body'].apply(lambda x: tokenize(str(x)))


In [17]:
# Apply tokenization and summarization to 'Question_body' and 'Answer_body' columns
dataset['question_tokens'], dataset['question_summary'] = zip(*dataset['Question_body'].apply(lambda x: tokenize_and_summarize(str(x))))
dataset['answer_tokens'], dataset['answer_summary'] = zip(*dataset['Answer_body'].apply(lambda x: tokenize_and_summarize(str(x))))


In [None]:
# Display the first few rows with tokens
dataset[['Question_body', 'question_tokens', 'Answer_body', 'answer_tokens']].head()

# Step 4: Create a Similarity Matrix

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define a function to create a similarity matrix
def create_similarity_matrix(texts):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix

# Create similarity matrices for 'Question_body' and 'Answer_body'
question_similarity_matrix = create_similarity_matrix(dataset['Question_body'])
answer_similarity_matrix = create_similarity_matrix(dataset['Answer_body'])

# Display the similarity matrices
print(question_similarity_matrix)
print(answer_similarity_matrix)


[[1.         0.22920805 0.30994537 0.34297233 0.20943778 0.1890436
  0.14886688 0.22215655 0.19942164 0.2188573 ]
 [0.22920805 1.         0.24870993 0.33226641 0.21089174 0.15187851
  0.15261209 0.22135724 0.31583627 0.17991187]
 [0.30994537 0.24870993 1.         0.38091193 0.18968868 0.2188276
  0.23539308 0.26591497 0.20386423 0.38101829]
 [0.34297233 0.33226641 0.38091193 1.         0.17714062 0.25682604
  0.23080007 0.35354224 0.25680798 0.25952111]
 [0.20943778 0.21089174 0.18968868 0.17714062 1.         0.13272595
  0.13610595 0.10805898 0.20648205 0.12768302]
 [0.1890436  0.15187851 0.2188276  0.25682604 0.13272595 1.
  0.18570336 0.18345517 0.1337157  0.18679124]
 [0.14886688 0.15261209 0.23539308 0.23080007 0.13610595 0.18570336
  1.         0.18015524 0.12890661 0.2078603 ]
 [0.22215655 0.22135724 0.26591497 0.35354224 0.10805898 0.18345517
  0.18015524 1.         0.19497718 0.20035726]
 [0.19942164 0.31583627 0.20386423 0.25680798 0.20648205 0.1337157
  0.12890661 0.19497718

# Step 5: Apply the TextRank Algorithm

In [27]:
import numpy as np

def text_rank(similarity_matrix, damping_factor=0.85, max_iter=100, tol=1e-4):
    n = similarity_matrix.shape[0]
    scores = np.ones(n) / n
    for _ in range(max_iter):
        prev_scores = scores.copy()
        for i in range(n):
            scores[i] = (1 - damping_factor) + damping_factor * np.sum(similarity_matrix[i] * prev_scores / np.sum(similarity_matrix[i]))
        if np.linalg.norm(scores - prev_scores) < tol:
            break
    return scores

# Apply TextRank to the similarity matrices
question_scores = text_rank(question_similarity_matrix)
answer_scores = text_rank(answer_similarity_matrix)

# Add scores to the dataset
dataset['question_scores'] = question_scores
dataset['answer_scores'] = answer_scores

# Display the first few rows with scores
dataset[['Question_body', 'question_scores', 'Answer_body', 'answer_scores']].head()


Unnamed: 0,Question_body,question_scores,Answer_body,answer_scores
0,Kinda new to AWS. I have this high-level quest...,0.999837,"You send a request, you get a response. In ord...",0.999837
1,I have some spring boot microservices and I wa...,0.999837,<blockquote>\ntl;dr: Spring MVC will not contr...,0.999837
2,I'm trying to properly design an application a...,0.999837,Determining the source of the information is b...,0.999837
3,I heard that for .NET8 Microsoft gifted us wit...,0.999837,I have always asked myself this very same ques...,0.999837
4,"I am trying to learn AWS services, and now it ...",0.999837,"Short answer is: no, you don't have to but you...",0.999837


# Step 6: Summarize the Text

In [28]:
import spacy
import pytextrank

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

# Define a function to summarize text
def summarize_text(text, limit_phrases=15, limit_sentences=3):
    doc = nlp(text)
    summary = " ".join([sent.text for sent in doc._.textrank.summary(limit_phrases=limit_phrases, limit_sentences=limit_sentences)])
    return summary

# Apply summarization to 'Question_body' and 'Answer_body' columns
dataset['question_summary'] = dataset['Question_body'].apply(lambda x: summarize_text(str(x)))
dataset['answer_summary'] = dataset['Answer_body'].apply(lambda x: summarize_text(str(x)))

# Display the first few rows with summaries
dataset[['Question_body', 'question_summary', 'Answer_body', 'answer_summary']].head()


Unnamed: 0,Question_body,question_summary,Answer_body,answer_summary
0,Kinda new to AWS. I have this high-level quest...,So thatâ€™s how Iâ€™d build it: React calls AP...,"You send a request, you get a response. In ord...",Therefore it will be super-expensive compare t...
1,I have some spring boot microservices and I wa...,If I use Spring MVC for designing the client s...,<blockquote>\ntl;dr: Spring MVC will not contr...,"It can be part of your Spring MVC application,..."
2,I'm trying to properly design an application a...,"Then, as I understand it, there are several ap...",Determining the source of the information is b...,"Your repositories might return the same model,..."
3,I heard that for .NET8 Microsoft gifted us wit...,There is the further complication that even if...,I have always asked myself this very same ques...,The most challenging aspect here is to configu...
4,"I am trying to learn AWS services, and now it ...",Expose service as REST endpoints\nFeature like...,"Short answer is: no, you don't have to but you...",I would say that you best practice in AWS woul...


# Step 7: Save the Output

In [29]:
# Save the dataset with summaries to a new Excel file
output_path = '/content/DataSamplePilot_with_summaries.xlsx'  # Update with the correct path if necessary
dataset.to_excel(output_path, index=False)
print(f"Dataset saved to {output_path}")


Dataset saved to /content/DataSamplePilot_with_summaries.xlsx
