# Text summarization using NLTK

In [None]:
# Import the necessary libraries

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [2]:
# Input document for the summarizer: a short multi-line paragraph about NLP.
# The literal keeps its line breaks, so the tokenized sentences contain '\n'.
sample_text = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction 
between computers and humans using natural language. The ultimate objective of NLP is to enable computers 
to understand, interpret, and generate human-like text. In recent years, NLP has seen tremendous advancements 
with the development of deep learning models and large-scale language models like GPT-3. These models have 
revolutionized various NLP tasks, including machine translation, sentiment analysis, and text summarization.
"""

In [3]:
# Step 1: Tokenization -- split the raw text into a list of sentences
sentences = sent_tokenize(sample_text, language="english")

In [4]:
# Display the list of sentences produced by sent_tokenize
sentences

['\nNatural language processing (NLP) is a field of artificial intelligence that focuses on the interaction \nbetween computers and humans using natural language.',
 'The ultimate objective of NLP is to enable computers \nto understand, interpret, and generate human-like text.',
 'In recent years, NLP has seen tremendous advancements \nwith the development of deep learning models and large-scale language models like GPT-3.',
 'These models have \nrevolutionized various NLP tasks, including machine translation, sentiment analysis, and text summarization.']

In [5]:
# Word-tokenize every sentence; the result is one token list per sentence
words = list(map(word_tokenize, sentences))

In [6]:
# Display the per-sentence token lists produced by word_tokenize
words

[['Natural',
  'language',
  'processing',
  '(',
  'NLP',
  ')',
  'is',
  'a',
  'field',
  'of',
  'artificial',
  'intelligence',
  'that',
  'focuses',
  'on',
  'the',
  'interaction',
  'between',
  'computers',
  'and',
  'humans',
  'using',
  'natural',
  'language',
  '.'],
 ['The',
  'ultimate',
  'objective',
  'of',
  'NLP',
  'is',
  'to',
  'enable',
  'computers',
  'to',
  'understand',
  ',',
  'interpret',
  ',',
  'and',
  'generate',
  'human-like',
  'text',
  '.'],
 ['In',
  'recent',
  'years',
  ',',
  'NLP',
  'has',
  'seen',
  'tremendous',
  'advancements',
  'with',
  'the',
  'development',
  'of',
  'deep',
  'learning',
  'models',
  'and',
  'large-scale',
  'language',
  'models',
  'like',
  'GPT-3',
  '.'],
 ['These',
  'models',
  'have',
  'revolutionized',
  'various',
  'NLP',
  'tasks',
  ',',
  'including',
  'machine',
  'translation',
  ',',
  'sentiment',
  'analysis',
  ',',
  'and',
  'text',
  'summarization',
  '.']]

In [7]:
# Step 2: Lowercasing and Removing Stopwords
# Load NLTK's English stop-word list into a set, then display it.
stop_words = {*stopwords.words('english')}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [8]:
# Per sentence, keep the lowercased form of every token that is purely
# alphanumeric and is not a stop word.
# Note: str.isalnum() rejects punctuation but also hyphenated tokens, so
# 'human-like', 'large-scale' and 'GPT-3' do not appear in the result.
filtered_words = [
    [
        token.lower()
        for token in sentence_words
        if token.isalnum() and token.lower() not in stop_words
    ]
    for sentence_words in words
]

In [9]:
# Display the tokens that passed the isalnum and stop-word filters
filtered_words

[['natural',
  'language',
  'processing',
  'nlp',
  'field',
  'artificial',
  'intelligence',
  'focuses',
  'interaction',
  'computers',
  'humans',
  'using',
  'natural',
  'language'],
 ['ultimate',
  'objective',
  'nlp',
  'enable',
  'computers',
  'understand',
  'interpret',
  'generate',
  'text'],
 ['recent',
  'years',
  'nlp',
  'seen',
  'tremendous',
  'advancements',
  'development',
  'deep',
  'learning',
  'models',
  'language',
  'models',
  'like'],
 ['models',
  'revolutionized',
  'various',
  'nlp',
  'tasks',
  'including',
  'machine',
  'translation',
  'sentiment',
  'analysis',
  'text',
  'summarization']]

In [10]:
# Step 3: Calculating Word Frequencies
# First merge the per-sentence token lists into one flat list of tokens.
flat_list = []
for sentence_tokens in filtered_words:
    flat_list.extend(sentence_tokens)
flat_list

['natural',
 'language',
 'processing',
 'nlp',
 'field',
 'artificial',
 'intelligence',
 'focuses',
 'interaction',
 'computers',
 'humans',
 'using',
 'natural',
 'language',
 'ultimate',
 'objective',
 'nlp',
 'enable',
 'computers',
 'understand',
 'interpret',
 'generate',
 'text',
 'recent',
 'years',
 'nlp',
 'seen',
 'tremendous',
 'advancements',
 'development',
 'deep',
 'learning',
 'models',
 'language',
 'models',
 'like',
 'models',
 'revolutionized',
 'various',
 'nlp',
 'tasks',
 'including',
 'machine',
 'translation',
 'sentiment',
 'analysis',
 'text',
 'summarization']

In [11]:
# Count how many times each filtered token occurs across the whole text
word_frequencies = FreqDist(samples=flat_list)

In [12]:
# Display the word-frequency table
word_frequencies

FreqDist({'nlp': 4, 'language': 3, 'models': 3, 'natural': 2, 'computers': 2, 'text': 2, 'processing': 1, 'field': 1, 'artificial': 1, 'intelligence': 1, ...})

In [13]:
# Largest value held by any single word in the frequency table
max_frequency = max(word_frequencies[term] for term in word_frequencies)
max_frequency

4

In [None]:
# Normalize each word frequency by the highest frequency, so the most frequent word gets a weight of 1.0

In [14]:
# Build the normalized table from a fresh count of `flat_list` instead of
# dividing `word_frequencies` in place: the in-place version shrinks every
# value again each time this cell is re-run, which silently changes the
# sentence scores computed from it.
raw_counts = FreqDist(flat_list)
# default=1 keeps an empty token list from raising; the result is then an
# empty table rather than a ValueError from max().
highest_count = max(raw_counts.values(), default=1)
# Each weight is count / highest count, so the most frequent word maps to 1.0.
word_frequencies = FreqDist(
    {word: count / highest_count for word, count in raw_counts.items()}
)

In [15]:
# Display the word frequencies after scaling by the maximum frequency
word_frequencies

FreqDist({'nlp': 1.0, 'language': 0.75, 'models': 0.75, 'natural': 0.5, 'computers': 0.5, 'text': 0.5, 'processing': 0.25, 'field': 0.25, 'artificial': 0.25, 'intelligence': 0.25, ...})

In [16]:
# Step 4: Scoring Sentences based on Word Frequencies
# Every sentence position starts at 0 and accumulates the frequency weight of
# each of its filtered tokens, in token order.
sentence_scores = dict.fromkeys(range(len(filtered_words)), 0)
for position, tokens in enumerate(filtered_words):
    for token in tokens:
        sentence_scores[position] += word_frequencies[token]

In [17]:
# Display the score computed for each sentence index
sentence_scores

{0: 6.0, 1: 3.5, 2: 5.5, 3: 4.5}

In [33]:
# Selecting Top N Sentences for Summarization
# Rank (index, score) pairs from highest to lowest score and keep the indices
# of the first `num_sentences` entries; ties keep their original index order.
num_sentences = 2
selected_sentences_indices = [
    index
    for index, _score in sorted(
        sentence_scores.items(), key=lambda item: item[1], reverse=True
    )[:num_sentences]
]

In [34]:
# Display the indices of the sentences chosen for the summary
selected_sentences_indices

[0, 2]

In [35]:
# Generating the Summary
# The selected indices arrive ranked by score; sort them so the summary keeps
# the sentences in the order they appear in the source text.
# Reuse the `sentences` list from Step 1 (the same list the scores are indexed
# against) instead of re-running sent_tokenize on the full text once per index.
summary_sentences = [sentences[i] for i in sorted(selected_sentences_indices)]
# NOTE: detokenize() is given whole sentences here rather than word tokens; the
# printed summary shows them joined by a single space with the leading newline
# of the first sentence removed.
summary = TreebankWordDetokenizer().detokenize(summary_sentences)

In [36]:
# Displaying the Original Text and Summary
# print() separates each heading from the text that follows it with one space.
for heading, body in (
    ("Original Text:\n", sample_text),
    ("\nSummary:\n", summary),
):
    print(heading, body)

Original Text:
 
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction 
between computers and humans using natural language. The ultimate objective of NLP is to enable computers 
to understand, interpret, and generate human-like text. In recent years, NLP has seen tremendous advancements 
with the development of deep learning models and large-scale language models like GPT-3. These models have 
revolutionized various NLP tasks, including machine translation, sentiment analysis, and text summarization.


Summary:
 Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction 
between computers and humans using natural language. In recent years, NLP has seen tremendous advancements 
with the development of deep learning models and large-scale language models like GPT-3.
