In [17]:
import nltk
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import string

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Function to clean and tokenize sentences
def preprocess_text(text):
    sentences = sent_tokenize(text)  # Split into sentences
    tokenized_sentences = []
    
    for sent in sentences:
        words = word_tokenize(sent.lower())  # Convert to lowercase and tokenize
        words = [word for word in words if word.isalnum() and word not in stop_words]  # Remove punctuation and stopwords
        tokenized_sentences.append(words)
    
    return sentences, tokenized_sentences


[nltk_data] Downloading package punkt to /home/am44/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/am44/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_similarity_matrix(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Compute cosine similarity between all sentences
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return similarity_matrix


In [21]:
def textrank_summarize(text, num_sentences=3):
    sentences, tokenized_sentences = preprocess_text(text)
    
    if len(sentences) < num_sentences:
        return " ".join(sentences)  # Return full text if too short

    similarity_matrix = build_similarity_matrix([" ".join(words) for words in tokenized_sentences])
    
    # Build a graph and apply PageRank
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    
    # Rank sentences based on PageRank scores
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    
    # Extract the top N sentences
    summary = " ".join([ranked_sentences[i][1] for i in range(num_sentences)])
    
    return summary


In [23]:
# Load dataset
df = pd.read_csv('prepared_data.csv')

# Apply summarization to each row
df['summary'] = df['content'].apply(lambda x: textrank_summarize(str(x), num_sentences=3))

# Save results
df.to_csv('summarized_data.csv', index=False)

print("Summarization completed. Check summarized_data.csv")


Summarization completed. Check summarized_data.csv


In [25]:
df = pd.read_csv('summarized_data.csv')
df

Unnamed: 0,title,content,summary
0,A Beginner’s Guide to Word Embedding with Gens...,1. Introduction of Word2vec\n\nWord2vec is one...,"At first, we need to generate a format of ‘lis..."
1,Hands-on Graph Neural Networks with PyTorch & ...,"In my last article, I introduced the concept o...","In this blog post, we will be using PyTorch an..."
2,How to Use ggplot2 in Python,Introduction\n\nThanks to its strict implement...,The Grammar of Graphics\n\nIn case you should ...
3,Databricks: How to Save Data Frames as CSV Fil...,Photo credit to Mika Baumeister from Unsplash\...,DBFS FileStore is where you create folders and...
4,A Step-by-Step Implementation of Gradient Desc...,A Step-by-Step Implementation of Gradient Desc...,Let’s derive the formula for calculating gradi...
...,...,...,...
192387,Why do you need a cleaning service?,What could be more important than having a tid...,"So, whether you live in Sydney or North Shore,..."
192388,Daily cleaning and maintenance of bedding,Daily cleaning and maintenance of bedding\n\nW...,General the washing temperature is not exceed ...
192389,Beneficial Advice on Bond Cleaning!,The most important chore at the end is bond cl...,The most important chore at the end is bond cl...
192390,How I Learned Romanian in 37 Easy Steps,How I Learned Romanian in 37 Easy Steps\n\nHey...,"Step 3 — Go to Romania, meet 5,012 people who ..."


In [41]:
print(df['content'][1])

In my last article, I introduced the concept of Graph Neural Network (GNN) and some recent advancements of it. Since this topic is getting seriously hyped up, I decided to make this tutorial on how to easily implement your Graph Neural Network in your project. You will learn how to construct your own GNN with PyTorch Geometric, and how to use GNN to solve a real-world problem (Recsys Challenge 2015).

In this blog post, we will be using PyTorch and PyTorch Geometric (PyG), a Graph Neural Network framework built on top of PyTorch that runs blazingly fast. It is several times faster than the most well-known GNN framework, DGL.

Aside from its remarkable speed, PyG comes with a collection of well-implemented GNN models illustrated in various papers. Therefore, it would be very handy to reproduce the experiments with PyG.


In [43]:
print(df['summary'][1])

In this blog post, we will be using PyTorch and PyTorch Geometric (PyG), a Graph Neural Network framework built on top of PyTorch that runs blazingly fast. In my last article, I introduced the concept of Graph Neural Network (GNN) and some recent advancements of it. You will learn how to construct your own GNN with PyTorch Geometric, and how to use GNN to solve a real-world problem (Recsys Challenge 2015).


In [37]:
print(df['content'][10])

What I Learned from (Two-time) Kaggle Grandmaster Abhishek Thakur

Photo by Georgie Cobbs on Unsplash

Quick Bio

Before his many data scientist stints in companies scattered throughout Germany, Abhishek Thakur earned his bachelor’s in electrical engineering at NIT Surat and his master’s in computer science at the University of Bonn. Currently, he holds the title of Chief Data Scientist at Norway’s boost.ai, a “software company that specializes in conversational artificial intelligence (AI).” But I’m most impressed by Abhishek’s Kaggle clout.

You can visit his Kaggle profile here. Here’s a snapshot of his accolades:

Competitions Grandmaster (17 gold medals and an all-time high rank of #3 in the world)

Kernels Expert (he’s well within the top 1% of Kagglers)

Discussion Grandmaster (65 gold medals and an all-time high rank of #2 in the world)

I want to take a look at Abhishek’s tutorial, Approaching (Almost) Any NLP Problem on Kaggle. I’ve selected this kernel of Abhishek’s because 

In [39]:
print(df['summary'][10])

Since Abhishek is a pro and this is an NLP problem, the exploratory data analysis (you’ll most often see this referred to as EDA) is shallow compared to problems involving numerical data. And if you really want a firmer grasp of NLP or data science in general, be sure that you understand every line of Abhishek’s code by writing it yourself as you go through his kernel. Exploring the Data and Understanding the Problem

After importing the necessary Python modules and the data, Abhishek calls the head() method on the data to see what the first five rows look like.


In [8]:
import pandas as pd
import nltk
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import string

df = pd.read_csv('summarized_data.csv')

# Add these imports at the top
from rouge_score import rouge_scorer

# Add this function to your code
def evaluate_with_rouge(references, summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for ref, summ in zip(references, summaries):
        scores = scorer.score(ref, summ)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    
    return {
        'rouge1': np.mean(rouge1_scores),
        'rouge2': np.mean(rouge2_scores),
        'rougeL': np.mean(rougeL_scores)
    }

# After generating summaries, evaluate them
references = df['content'].apply(str).tolist()
summaries = df['summary'].tolist()

rouge_scores = evaluate_with_rouge(references, summaries)
print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

ROUGE Scores:
ROUGE-1: 0.2912
ROUGE-2: 0.2827
ROUGE-L: 0.2438
