In [249]:
# import required libraries

import nltk # Natural Language Tool Kit (Natural Language Processing - NLP)

# Need the below libraries to build an effective summarizer 

from nltk.corpus import stopwords # corpus - collection of text

from nltk.tokenize import word_tokenize, sent_tokenize 

# tokenize - divides text into a number of tokens (Word & Scentence tokenizer)
# word_tokenize - split sentences or paragraphs into words
# sent_tokenizer - split paragraphs into sentences and keep track of them

# BETA

import pandas as pd

import markovify

import matplotlib as plt

# BETA

In [278]:
# Input data  
input = """The Indian Premier League (IPL) is a professional Twenty20 cricket league in India contested during April and May of every year by 8 teams representing 8 cities of India. The league was founded by the Board of Control for Cricket in India (BCCI) in 2008, and is regarded as the brainchild of Lalit Modi, the founder and former commissioner of the league. IPL has an exclusive window in ICC Future Tours Programme.

The IPL is the most-attended cricket league in the world and in 2014 ranked sixth by average attendance among all sports leagues. In 2010, the IPL became the first sporting event in the world to be broadcast live on YouTube. The brand value of IPL in 2018 was US$6.3 billion, according to Duff & Phelps. According to BCCI, the 2015 IPL season contributed ₹11.5 billion (US$182 million) to the GDP of the Indian economy.

There have been eleven seasons of the IPL tournament. The current IPL title holders are the Chennai Super Kings, who won the 2018 season."""

In [279]:
count_before = len(input) # Counts words in input                       
print("INPUT COUNT :" + str(count_before))  # Prints the count

INPUT COUNT :973


In [280]:
# Text Tokenizing

Stop_Words = set(stopwords.words("english")) # Assign a varaible to store stopwords


In [281]:
print(stopwords.words("english")) # These are the stopwords

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [282]:
Words = word_tokenize(input) # Assigning a variable to filter all the words in a text

In [283]:
print(Words) # Prints the words present in input

['The', 'Indian', 'Premier', 'League', '(', 'IPL', ')', 'is', 'a', 'professional', 'Twenty20', 'cricket', 'league', 'in', 'India', 'contested', 'during', 'April', 'and', 'May', 'of', 'every', 'year', 'by', '8', 'teams', 'representing', '8', 'cities', 'of', 'India', '.', 'The', 'league', 'was', 'founded', 'by', 'the', 'Board', 'of', 'Control', 'for', 'Cricket', 'in', 'India', '(', 'BCCI', ')', 'in', '2008', ',', 'and', 'is', 'regarded', 'as', 'the', 'brainchild', 'of', 'Lalit', 'Modi', ',', 'the', 'founder', 'and', 'former', 'commissioner', 'of', 'the', 'league', '.', 'IPL', 'has', 'an', 'exclusive', 'window', 'in', 'ICC', 'Future', 'Tours', 'Programme', '.', 'The', 'IPL', 'is', 'the', 'most-attended', 'cricket', 'league', 'in', 'the', 'world', 'and', 'in', '2014', 'ranked', 'sixth', 'by', 'average', 'attendance', 'among', 'all', 'sports', 'leagues', '.', 'In', '2010', ',', 'the', 'IPL', 'became', 'the', 'first', 'sporting', 'event', 'in', 'the', 'world', 'to', 'be', 'broadcast', 'live'

In [284]:
Sentences = sent_tokenize(input) # Store the scentences in a variable

In [285]:
print(Sentences) # Prints all scentences in the input

['The Indian Premier League (IPL) is a professional Twenty20 cricket league in India contested during April and May of every year by 8 teams representing 8 cities of India.', 'The league was founded by the Board of Control for Cricket in India (BCCI) in 2008, and is regarded as the brainchild of Lalit Modi, the founder and former commissioner of the league.', 'IPL has an exclusive window in ICC Future Tours Programme.', 'The IPL is the most-attended cricket league in the world and in 2014 ranked sixth by average attendance among all sports leagues.', 'In 2010, the IPL became the first sporting event in the world to be broadcast live on YouTube.', 'The brand value of IPL in 2018 was US$6.3 billion, according to Duff & Phelps.', 'According to BCCI, the 2015 IPL season contributed ₹11.5 billion (US$182 million) to the GDP of the Indian economy.', 'There have been eleven seasons of the IPL tournament.', 'The current IPL title holders are the Chennai Super Kings, who won the 2018 season.']


In [286]:
# Frequency table    
# This table keeps track of how many times each word appears in the summarizer                
   
Frequency_Table = dict() # Dictionaries are used to find which sentences has more content (key:value pairs)

In [287]:
print(dict()) # Haven't put anything in it, so NULL

{}


In [288]:

for word in Words:   # If a word (any word) is encountered in Words (has all words of input)
    
    word = word.lower() # Returns lowercased words (If there are any uppercase words then lower() convert them to lowercase)     
    
    if word in Stop_Words:  # If a word in Stop_Words array is encountered ignore it
        continue          
    
    if word in Frequency_Table:  # If a word is already present in Frequency_Table then, the number associated with the word is incremented by 1
        Frequency_Table[word] += 1  # '+=' FUNCTION IS THAT when x = 3 ; x += 2 ; print x >>> 5 
    
    else: 
        Frequency_Table[word] = 1   # Else if, it is a new word then it id added to Frequency_Table

In [289]:
print(Frequency_Table) # Gives the contents of Frequency_Table

{'indian': 2, 'premier': 1, 'league': 5, '(': 3, 'ipl': 8, ')': 3, 'professional': 1, 'twenty20': 1, 'cricket': 3, 'india': 3, 'contested': 1, 'april': 1, 'may': 1, 'every': 1, 'year': 1, '8': 2, 'teams': 1, 'representing': 1, 'cities': 1, '.': 9, 'founded': 1, 'board': 1, 'control': 1, 'bcci': 2, '2008': 1, ',': 6, 'regarded': 1, 'brainchild': 1, 'lalit': 1, 'modi': 1, 'founder': 1, 'former': 1, 'commissioner': 1, 'exclusive': 1, 'window': 1, 'icc': 1, 'future': 1, 'tours': 1, 'programme': 1, 'most-attended': 1, 'world': 2, '2014': 1, 'ranked': 1, 'sixth': 1, 'average': 1, 'attendance': 1, 'among': 1, 'sports': 1, 'leagues': 1, '2010': 1, 'became': 1, 'first': 1, 'sporting': 1, 'event': 1, 'broadcast': 1, 'live': 1, 'youtube': 1, 'brand': 1, 'value': 1, '2018': 2, 'us': 2, '$': 2, '6.3': 1, 'billion': 2, 'according': 2, 'duff': 1, '&': 1, 'phelps': 1, '2015': 1, 'season': 2, 'contributed': 1, '₹11.5': 1, '182': 1, 'million': 1, 'gdp': 1, 'economy': 1, 'eleven': 1, 'seasons': 1, 'tourn

In [290]:
# A dictionary to store scentences and values linked to it 
 

Sentences = sent_tokenize(input) # Storing the sentences tokens

Sentence_Value = dict()  # We need a value for each sentence to get the avg from them (To put important sentences together) 
                         # These values are stored in a dictionary (Dict can hold any data type here KEY:VALUE pairs)

    

In [291]:
print(Sentence_Value)    # NULL because we havent put anything in it

{}


In [292]:

for sentence in Sentences: # If a sentence  in the Sentences (has all sentences from input)
   
  for word, Frequency in Frequency_Table.items(): # .items() method lists all KEY:VALUE pairs from the dictionary
        
        if word in sentence.lower(): # If a word in sentence is lowercase
        
          if sentence in Sentence_Value: # If you encounter a sentence from Sentence_Value
              
             Sentence_Value[sentence] += Frequency # Frequency is added to Sentence_Value of the sentence (to pointed sentence from input)
           
          else: 
                Sentence_Value[sentence] = Frequency # Else Sentence_Value of a sentence is equated to frequency
            
            #Sentence_Value[sentence]  =  Sentence_Value[sentence] + Frequency                
            
            #Sentence_Value = Add numbers from Frequnecy_Table and also rest of the words (like, of, which etc) STOP WORDS
            
             #EXAMPLE:
                # In an attempt(2) to build(5) an AI-ready(2) workforce(2) ,(14)
                # Microsoft(4) announced(2) Intelligent(2) Cloud(5) Hub(3) 
                # which has been launched(1) to empower(1) the next(1) generation(1)
                # of students(2) with AI-ready(2) skills(5) .(12)': 78
                # 2+5+2+2+14+4+2+2+5+3+1+1+1+1+2+2+5+12 = 66 + remaining words(11) = 77 + 1(Frequency) =>> 78
                # 78 = 78 !!!!!!!!!!!!!!     
       

In [293]:
print(Frequency)

1


In [294]:
 print(Sentence_Value)

{'The Indian Premier League (IPL) is a professional Twenty20 cricket league in India contested during April and May of every year by 8 teams representing 8 cities of India.': 49, 'The league was founded by the Board of Control for Cricket in India (BCCI) in 2008, and is regarded as the brainchild of Lalit Modi, the founder and former commissioner of the league.': 47, 'IPL has an exclusive window in ICC Future Tours Programme.': 25, 'The IPL is the most-attended cricket league in the world and in 2014 ranked sixth by average attendance among all sports leagues.': 36, 'In 2010, the IPL became the first sporting event in the world to be broadcast live on YouTube.': 33, 'The brand value of IPL in 2018 was US$6.3 billion, according to Duff & Phelps.': 41, 'According to BCCI, the 2015 IPL season contributed ₹11.5 billion (US$182 million) to the GDP of the Indian economy.': 55, 'There have been eleven seasons of the IPL tournament.': 22, 'The current IPL title holders are the Chennai Super Ki

In [295]:

Sum_Values = 0 # Taking Sum as 0 

for sentence in Sentence_Value: # A sentence in Sentence_Value
    
    Sum_Values += Sentence_Value[sentence] # 0 += Value of all sentences in Sentence_Value
    

In [296]:
print(Sum_Values) # Sum of all sentences from Sentence_Value

343


In [297]:
print (len(Sentence_Value)) # This indicates the toatal number of lines in Sentence_Value

9


In [298]:
# Finding Average of Sentences from input
    
Avg = int(Sum_Values / len(Sentence_Value))


In [299]:
print(Avg) 

38


In [309]:
# Assigning a variable to store sentences 

Summary = '' # NULL !!!! 

for sentence in Sentences: 
    
    if (sentence in Sentence_Value) and (Sentence_Value[sentence] > (0.9 * Avg)):  # Helps us to get all sentences above Sentence_Value 59
        
        Summary += "" + sentence # Assigning a variable summary

In [310]:
print(" -------------------------------------------------------SUMMARY------------------------------------------------------------")

print(Summary) # Prints the summary of input

print(" -------------------------------------------------------SUMMARY------------------------------------------------------------")

 -------------------------------------------------------SUMMARY------------------------------------------------------------
The Indian Premier League (IPL) is a professional Twenty20 cricket league in India contested during April and May of every year by 8 teams representing 8 cities of India.The league was founded by the Board of Control for Cricket in India (BCCI) in 2008, and is regarded as the brainchild of Lalit Modi, the founder and former commissioner of the league.The IPL is the most-attended cricket league in the world and in 2014 ranked sixth by average attendance among all sports leagues.The brand value of IPL in 2018 was US$6.3 billion, according to Duff & Phelps.According to BCCI, the 2015 IPL season contributed ₹11.5 billion (US$182 million) to the GDP of the Indian economy.The current IPL title holders are the Chennai Super Kings, who won the 2018 season.
 -------------------------------------------------------SUMMARY------------------------------------------------------

In [302]:
count_after = len(Summary) # Count words in summary
print("OUTPUT COUNT :" + str(count_after) ) # Prints the count

OUTPUT COUNT :468


In [213]:
#BETA

Example =""" Envisioned as a three-year collaborative program,
Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, 
course content and curriculum, developer support,
development tools and give students access to cloud and AI services. 
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services,
Bot Services and Azure Machine Learning. According to Manish Prakash,
Country General Manager-PS, Health and Education,
Microsoft India, said, "With AI being the defining technology of our time,
it is transforming lives and industry and the jobs of tomorrow will require a different skillset."""

count = len(Example)
print(count)

653
