# Basic step to follow for text summaries 

In [2]:
# text cleaning
# sentence tokenization
# word tokenisation
# work frequency table
# summarization

# Installation of useful Libraries

In [3]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

# Input text

In [8]:
text = """
Machine learning (ML) is a field of inquiry devoted to understanding and building methods that 'learn', 
that is, methods that leverage data to improve performance on some set of tasks.[1] 
It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, 
known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] 
Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, 
speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms 
to perform the needed tasks.[3]

A subset of machine learning is closely related to computational statistics, which focuses on making predictions 
using computers, but not all machine learning is statistical learning. The study of mathematical optimization 
delivers methods, theory and application domains to the field of machine learning. Data mining is a related field 
of study, focusing on exploratory data analysis through unsupervised learning.[5][6] Some implementations of machine 
learning use data and neural networks in a way that mimics the working of a biological brain.[7][8] In its 
application across business problems, machine learning is also referred to as predictive analytics.
"""

# importing important libraries 

In [9]:
import spacy
#import spacy.lang.en.stop_words as STOP_WORDS


In [10]:
from string import punctuation
import nltk
from nltk.corpus import stopwords

# downloading stopwords from nltk lib

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayaverma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Cleaning/ Preprocessing 

In [12]:
stops = list(stopwords.words('english'))
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
nlp = spacy.load('en_core_web_sm')

In [14]:
doc = nlp(text)

In [15]:
tokens = [token.text for token in doc]
print(tokens)

['\n', 'Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'inquiry', 'devoted', 'to', 'understanding', 'and', 'building', 'methods', 'that', "'", 'learn', "'", ',', 'that', 'is', ',', 'methods', 'that', 'leverage', 'data', 'to', 'improve', 'performance', 'on', 'some', 'set', 'of', 'tasks.[1', ']', 'It', 'is', 'seen', 'as', 'a', 'part', 'of', 'artificial', 'intelligence', '.', 'Machine', 'learning', 'algorithms', 'build', 'a', 'model', 'based', 'on', 'sample', 'data', ',', 'known', 'as', 'training', 'data', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'do', 'so.[2', ']', 'Machine', 'learning', 'algorithms', 'are', 'used', 'in', 'a', 'wide', 'variety', 'of', 'applications', ',', 'such', 'as', 'in', 'medicine', ',', 'email', 'filtering', ',', 'speech', 'recognition', ',', 'and', 'computer', 'vision', ',', 'where', 'it', 'is', 'difficult', 'or', 'unfeasible', 'to', 'develop', 'conventional', 'algorit

In [16]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
punctuation = punctuation + '\n'

In [18]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

# Word frequency 

In [19]:
word_frequencies = {}
for word in doc:
    if word.text.lower() not in stops:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text]=1
            else:
                word_frequencies[word.text]+=1

In [20]:
word_frequencies

{'Machine': 3,
 'learning': 9,
 'ML': 1,
 'field': 3,
 'inquiry': 1,
 'devoted': 1,
 'understanding': 1,
 'building': 1,
 'methods': 3,
 'learn': 1,
 'leverage': 1,
 'data': 5,
 'improve': 1,
 'performance': 1,
 'set': 1,
 'tasks.[1': 1,
 'seen': 1,
 'part': 1,
 'artificial': 1,
 'intelligence': 1,
 'algorithms': 3,
 'build': 1,
 'model': 1,
 'based': 1,
 'sample': 1,
 'known': 1,
 'training': 1,
 'order': 1,
 'make': 1,
 'predictions': 2,
 'decisions': 1,
 'without': 1,
 'explicitly': 1,
 'programmed': 1,
 'so.[2': 1,
 'used': 1,
 'wide': 1,
 'variety': 1,
 'applications': 1,
 'medicine': 1,
 'email': 1,
 'filtering': 1,
 'speech': 1,
 'recognition': 1,
 'computer': 1,
 'vision': 1,
 'difficult': 1,
 'unfeasible': 1,
 'develop': 1,
 'conventional': 1,
 'perform': 1,
 'needed': 1,
 'tasks.[3': 1,
 '\n\n': 1,
 'subset': 1,
 'machine': 5,
 'closely': 1,
 'related': 2,
 'computational': 1,
 'statistics': 1,
 'focuses': 1,
 'making': 1,
 'using': 1,
 'computers': 1,
 'statistical': 1,
 'st

In [21]:
max_frequency = max(word_frequencies.values())

In [22]:
max_frequency

9

In [23]:
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word]/max_frequency

In [24]:
print(word_frequencies)

{'Machine': 0.3333333333333333, 'learning': 1.0, 'ML': 0.1111111111111111, 'field': 0.3333333333333333, 'inquiry': 0.1111111111111111, 'devoted': 0.1111111111111111, 'understanding': 0.1111111111111111, 'building': 0.1111111111111111, 'methods': 0.3333333333333333, 'learn': 0.1111111111111111, 'leverage': 0.1111111111111111, 'data': 0.5555555555555556, 'improve': 0.1111111111111111, 'performance': 0.1111111111111111, 'set': 0.1111111111111111, 'tasks.[1': 0.1111111111111111, 'seen': 0.1111111111111111, 'part': 0.1111111111111111, 'artificial': 0.1111111111111111, 'intelligence': 0.1111111111111111, 'algorithms': 0.3333333333333333, 'build': 0.1111111111111111, 'model': 0.1111111111111111, 'based': 0.1111111111111111, 'sample': 0.1111111111111111, 'known': 0.1111111111111111, 'training': 0.1111111111111111, 'order': 0.1111111111111111, 'make': 0.1111111111111111, 'predictions': 0.2222222222222222, 'decisions': 0.1111111111111111, 'without': 0.1111111111111111, 'explicitly': 0.1111111111

In [25]:
# sentence tokenization and sentence score

In [26]:
sentence_tokens = [sent for sent in doc.sents]
print(sentence_tokens)

[
Machine learning (ML) is a field of inquiry devoted to understanding and building methods that 'learn', that is, methods that leverage data to improve performance on some set of tasks.[1] It is seen as a part of artificial intelligence., Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3]

A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning., The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning., Data mining is a r

In [27]:
sentence_score = {}
for sent in sentence_tokens:
    for word in sent:
        if word.text.lower() in word_frequencies.keys():
            if sent not in sentence_score.keys():
                sentence_score[sent]= word_frequencies[word.text.lower()]
            else:
                sentence_score[sent]+= word_frequencies[word.text.lower()]

In [28]:
sentence_score

{
 Machine learning (ML) is a field of inquiry devoted to understanding and building methods that 'learn', that is, methods that leverage data to improve performance on some set of tasks.[1] It is seen as a part of artificial intelligence.: 4.666666666666665,
 Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3]
 
 A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning.: 14.555555555555541,
 The study of mathematical optimization delivers methods, theory and application domains to the fi

In [29]:
from heapq import nlargest

In [30]:
select_length = int(len(sentence_tokens)*0.3)
select_length

1

# Summary generation

In [31]:
summary = nlargest(select_length, sentence_score, key = sentence_score.get)

In [32]:
print(summary)

[Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3]

A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning.]


In [33]:
final_summary = [word.text for word in summary]

In [34]:
summary = ' '. join(final_summary)

In [35]:
print(summary)

Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3]

A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning.


In [36]:
print(len(text))

1333


In [37]:
print(len(summary))

617


In [38]:
compresion_ratio = len(summary)/len(text)

In [39]:
compresion_ratio

0.4628657164291073