In [6]:
import nltk
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import string

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Function to clean and tokenize sentences
def preprocess_text(text):
    sentences = sent_tokenize(text)  # Split into sentences
    tokenized_sentences = []
    
    for sent in sentences:
        words = word_tokenize(sent.lower())  # Convert to lowercase and tokenize
        words = [word for word in words if word.isalnum() and word not in stop_words]  # Remove punctuation and stopwords
        tokenized_sentences.append(words)
    
    return sentences, tokenized_sentences


[nltk_data] Downloading package punkt to /home/am44/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/am44/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_similarity_matrix(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Compute cosine similarity between all sentences
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return similarity_matrix


In [10]:
def textrank_summarize(text, num_sentences=3):
    sentences, tokenized_sentences = preprocess_text(text)
    
    if len(sentences) < num_sentences:
        return " ".join(sentences)  # Return full text if too short

    similarity_matrix = build_similarity_matrix([" ".join(words) for words in tokenized_sentences])
    
    # Build a graph and apply PageRank
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    
    # Rank sentences based on PageRank scores
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    
    # Extract the top N sentences
    summary = " ".join([ranked_sentences[i][1] for i in range(num_sentences)])
    
    return summary


In [14]:
# Load dataset
data = pd.read_csv('prepared_data.csv')
df = data.head(10)

# Apply summarization to each row
df['summary'] = df['content'].apply(lambda x: textrank_summarize(str(x), num_sentences=3))

# Save results
df.to_csv('summarized_data_2.csv', index=False)

print("Summarization completed. Check summarized_data.csv")


Summarization completed. Check summarized_data.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = df['content'].apply(lambda x: textrank_summarize(str(x), num_sentences=3))


In [16]:
df = pd.read_csv('summarized_data_2.csv')
df

Unnamed: 0,title,content,summary
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...,"At first, we need to generate a format of ‘lis..."
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...","In this blog post, we will be using PyTorch an..."
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...,The Grammar of Graphics\n\nIn case you should ...
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...,DBFS FileStore is where you create folders and...
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,Let’s derive the formula for calculating gradi...
5,An Easy Introduction to SQL for Data Scientists,Want to be inspired? Come join my Super Quotes...,"To install a MySQL server, you can run the fol..."
6,Hypothesis testing visualized,Hypothesis testing visualized\n\nIn this artic...,"For a general hypothesis testing problem, we n..."
7,Introduction to Latent Matrix Factorization Re...,Latent Factors are “Hidden Factors” unseen in ...,When working with an User-Item matrix of ratin...
8,Which 2020 Candidate is the Best at Twitter?,Which 2020 Candidate is the Best at Twitter?\n...,Which 2020 Candidate is the Best at Twitter? E...
9,What if AI model understanding were easy?,Irreverent Demystifiers\n\nWhat if AI model un...,About the What-If Tool\n\nThe What-If Tool is ...
