### Import our essential Librairies

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx # NetworkX is a package for the Python programming language that's used to create, manipulate, and study the structure, dynamics, and functions of complex graph networks

### Create our function 'Sentences' for a later use

In [25]:
def read_article(file_name):
    sentences = []
    file = open(file_name, 'r') 
    f_data = file.readlines()
    f_data = [x for x in f_data if x != '\n'] # it should remove any break present
    f_data = [x.replace('\n',' ') for x in f_data] #this would remove that end of line
    f_data = ''.join(f_data) 
    article = f_data.split('. ') 
    for sentence in article:
        sentences.append(sentence.replace("^[a-zA-Z0-9!@#$&()-`+,/\"]", " ").split(" "))
    return sentences

### Define a Cosine Similarity matrix

In [26]:
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))  # Create an empty similarity matrix
    
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: # ignore if both are same sentences
                continue 
                similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix

### We start our Algo step by step

In [53]:
def generate_summary(file_name, top_n=5):
    
    # nltk.download("stopwords")    ### if not already installed, delete the # and run the code one time
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Input Article and split it into Sentences
    sentences =  read_article(file_name)

    # Step 2 - Build a Similary Martix across sentences & remove Stop Words
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Generate rank based on Matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - output the summarized text
    print("Summarize Text: \n", ". ".join(summarize_text))


generate_summary("Readme_test.txt",3) # we choose randomly 3 to have the top 3 sentences as output. you can choose more or less also

Indexes of top ranked_sentence order are  [(0.06666666666666665, ['Therefore,', 'blockchains', 'are', 'resistant', 'to', 'modification', 'of', 'their', 'data', 'because', 'once', 'recorded,', 'the', 'data', 'in', 'any', 'given', 'block', 'cannot', 'be', 'altered', 'retroactively', 'without', 'altering', 'all', 'subsequent', 'blocks']), (0.06666666666666665, ['The', 'timestamp', 'proves', 'that', 'the', 'transaction', 'data', 'existed', 'when', 'the', 'block', 'was', 'published', 'in', 'order', 'to', 'get', 'into', 'its', 'hash']), (0.06666666666666665, ['The', 'invention', 'of', 'the', 'blockchain', 'for', 'bitcoin', 'made', 'it', 'the', 'first', 'digital', 'currency', 'to', 'solve', 'the', 'double-spending', 'problem', 'without', 'the', 'need', 'of', 'a', 'trusted', 'authority', 'or', 'central', 'server']), (0.06666666666666665, ['The', 'identity', 'of', 'Satoshi', 'Nakamoto', 'remains', 'unknown', 'to', 'date']), (0.06666666666666665, ['The', 'blockchain', 'was', 'popularized', 'by',