In [0]:
# BrambleXu's implementation of textrank
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style ranking.

    Words that pass the part-of-speech and stop-word filters become graph
    nodes; two words are linked when they occur within ``window_size``
    positions of each other in the same filtered sentence. Node scores are
    obtained by iterating ``pr = (1 - d) + d * dot(G, pr)`` on the
    column-normalized adjacency matrix ``G``.

    Relies on the module-level spaCy pipeline ``nlp`` and on ``STOP_WORDS``.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # maximum number of iterations
        self.node_weight = None  # word -> score mapping, filled by analyze()

        # [word, score] pairs produced by the latest get_keywords() call
        self.keywords = []

    def set_stopwords(self, stopwords):
        """Flag the default stop words plus ``stopwords`` in the ``nlp`` vocabulary.

        The flags are written to the shared module-level ``nlp.vocab``, so they
        remain set for any later use of that pipeline object.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            nlp.vocab[word].is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Keep, per sentence, only the non-stop words whose POS is in ``candidate_pos``.

        Args:
            doc: parsed document exposing ``.sents``; each token must expose
                ``pos_``, ``is_stop`` and ``text``.
            candidate_pos: container of POS tags to keep.
            lower: when truthy, the kept words are lowercased.

        Returns:
            A list with one list of words per sentence (possibly empty lists).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, numbered in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the unique ordered word pairs that co-occur inside a window.

        For the word at position ``i`` the partners are the words at positions
        ``i + 1 .. i + window_size - 1`` of the same sentence.

        Returns:
            A list of ``(word, later_word)`` tuples, without duplicates, in
            order of first occurrence.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list alone keeps the order
        for sentence in sentences:
            length = len(sentence)
            for i, word in enumerate(sentence):
                stop = min(i + window_size, length)
                for j in range(i + 1, stop):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalized co-occurrence matrix.

        Args:
            vocab: mapping word -> row/column index.
            token_pairs: iterable of ``(word1, word2)`` pairs; both words must
                be keys of ``vocab``.

        Returns:
            A ``(len(vocab), len(vocab))`` float array. Every column with a
            non-zero sum is divided by that sum; columns of words that take
            part in no pair are all zeros.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be pre-filled: with `where`
        # alone, the skipped entries (zero-sum columns) are left uninitialized.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Return the ``number`` highest-scored words as ``[word, score]`` lists.

        The result is rebuilt from ``self.node_weight`` on every call (and also
        stored in ``self.keywords``), so repeated calls do not accumulate
        entries. ``analyze()`` must have been called first; otherwise
        ``self.node_weight`` is ``None`` and an ``AttributeError`` is raised.
        """
        ranked = sorted(self.node_weight.items(), key=lambda item: item[1], reverse=True)
        self.keywords = [
            [word, weight]
            for position, (word, weight) in enumerate(ranked)
            if position < number
        ]
        return self.keywords

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Rank the words of ``text`` and store the scores in ``self.node_weight``.

        Args:
            text: raw text, parsed with the module-level ``nlp`` pipeline.
            candidate_pos: POS tags of the words to keep as graph nodes.
            window_size: co-occurrence window passed to ``get_token_pairs``.
            lower: when truthy, words are lowercased before ranking.
            stopwords: extra stop words, in addition to ``STOP_WORDS``.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.array([1] * len(vocab))

        # Iterate until the total score changes by less than min_diff
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            current_pr = sum(pr)
            if abs(previous_pr - current_pr) < self.min_diff:
                break
            previous_pr = current_pr

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [6]:
# Hard-coded sample passage 1: a description of intelligent virtual assistants.
text1 = '''An intelligent virtual assistant (IVA) or intelligent personal assistant (IPA) is a software agent that can perform tasks or services for an individual based on commands or questions. Sometimes the term "chatbot" is used to refer to virtual assistants generally or specifically accessed by online chat. In some cases, online chat programs are exclusively for entertainment purposes. Some virtual assistants are able to interpret human speech and respond via synthesized voices. Users can ask their assistants questions, control home automation devices and media playback via voice, and manage other basic tasks such as email, to-do lists, and calendars with verbal (spoken?) commands.[1] A similar concept, however with differences, lays under the dialogue systems.[2]

As of 2017, the capabilities and usage of virtual assistants are expanding rapidly, with new products entering the market and a strong emphasis on both email and voice user interfaces. Apple and Google have large installed bases of users on smartphones. Microsoft has a large installed base of Windows-based personal computers, smartphones and smart speakers. Amazon has a large install base for smart speakers.[3] Conversica has over 100 million engagements via its email and sms interface Intelligent Virtual Assistants for business.'''
# Hard-coded sample passage 2: a description of Google Assistant.
text2 = '''Google Assistant is an artificial intelligence–powered[1] virtual assistant developed by Google that is primarily available on mobile and smart home devices. Unlike the company's previous virtual assistant, Google Now, the Google Assistant can engage in two-way conversations.

Assistant initially debuted in May 2016 as part of Google's messaging app Allo, and its voice-activated speaker Google Home. After a period of exclusivity on the Pixel and Pixel XL smartphones, it began to be deployed on other Android devices in February 2017, including third-party smartphones and Android Wear (now Wear OS), and was released as a standalone app on the iOS operating system in May 2017. Alongside the announcement of a software development kit in April 2017, the Assistant has been further extended to support a large variety of devices, including cars and third party smart home appliances. The functionality of the Assistant can also be enhanced by third-party developers.

Users primarily interact with the Google Assistant through natural voice, though keyboard input is also supported. In the same nature and manner as Google Now, the Assistant is able to search the Internet, schedule events and alarms, adjust hardware settings on the user's device, and show information from the user's Google account. Google has also announced that the Assistant will be able to identify objects and gather visual information through the device's camera, and support purchasing products and sending money, as well as identifying songs.

At CES 2018, the first Assistant-powered smart displays (smart speakers with video screens) were announced, with the first one being released in July 2018.[2] In 2020, Google Assistant is already available on more than 1 billion devices.[3] Google Assistant is available in more than 90 countries and in over 30 languages,[4] and is used by more than 500 million users monthly.[5]'''

# read text from pages
# `with` closes each file handle as soon as its content has been read,
# instead of leaving it open for the rest of the kernel session
with open('virtual assistant.txt', 'r', encoding='utf-8') as virtualFile:
    # converts to lowercase
    page1 = virtualFile.read().lower()

with open('google assistant.txt', 'r', encoding='utf-8') as googleFile:
    # converts to lowercase
    page2 = googleFile.read().lower()

# use files instead of strings above
#text1 = page1
#text2 = page2

# how many top-ranked keywords to take from each page
keywordsCount = 100

# part-of-speech tags that are allowed to become keywords
extracted_pos = ['NOUN', 'PROPN','VERB']

# rank the words of the first text (BrambleXu's textrank implementation)
tr4w1 = TextRank4Keyword()
tr4w1.analyze(text1, window_size=4, lower=False, candidate_pos=extracted_pos)
keywords1 = tr4w1.get_keywords(keywordsCount)
print("keywords of text1\n",keywords1)

# rank the words of the second text with a fresh ranker
tr4w2 = TextRank4Keyword()
tr4w2.analyze(text2, window_size=4, lower=False, candidate_pos=extracted_pos)
keywords2 = tr4w2.get_keywords(keywordsCount)
print("keywords of text2\n",keywords2)

# glue the keywords of each page into one string, every word followed by a space
sentence1 = "".join(entry[0] + " " for entry in keywords1)

print("sentence1 : \n",sentence1)

sentence2 = "".join(entry[0] + " " for entry in keywords2)

print("sentence2 : \n",sentence2)

# lemmatization
import nltk

# first-time use only: fetch the NLTK data used below
# ('wordnet' for lemmatization, 'punkt' for tokenization)
for _resource in ('wordnet', 'punkt'):
    nltk.download(_resource)

lemmer = nltk.stem.WordNetLemmatizer()

# build a customized tokenizer  
def LemTokens(tokens):
    """Lemmatize every token with the module-level ``lemmer``.

    Returns a list holding ``lemmer.lemmatize(token)`` for each item of
    ``tokens``, in the same order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmer.lemmatize(token))
    return lemmas

def LemNormalize(text):
    """Lowercase ``text``, split it with ``nltk.word_tokenize`` and lemmatize via ``LemTokens``."""
    lowered = text.lower()
    tokens = list(nltk.word_tokenize(lowered))
    return LemTokens(tokens)

# Vectorize both keyword sentences with TF-IDF, tokenizing through LemNormalize
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVec = TfidfVectorizer(tokenizer=LemNormalize)
tfidf = TfidfVec.fit_transform([sentence1, sentence2])

# Cosine similarity between the two TF-IDF rows (row 0: page 1, row 1: page 2)
from sklearn.metrics.pairwise import cosine_similarity
metrix = cosine_similarity(tfidf[0], tfidf[1])

relatedness_percent = metrix[0][0] * 100
print("Semantic relatedness between the two pages is ", relatedness_percent, "%")

keywords of text1
 [['assistants', 2.9623285243306072], ['email', 2.6014164562289563], ['tasks', 1.8026550174362672], ['based', 1.7905509870530705], ['voice', 1.6160261994949492], ['chat', 1.4798499228395063], ['base', 1.474827417027417], ['questions', 1.4043983686067016], ['smartphones', 1.328450333694084], ['interface', 1.1842828282828282], ['sms', 1.1502828282828284], ['installed', 1.0994527597402597], ['bases', 1.0994527597402597], ['differences', 1.0814583333333332], ['lays', 1.0814583333333332], ['dialogue', 1.0814583333333332], ['assistant', 1.071659722222222], ['speech', 1.065813910934744], ['respond', 1.065813910934744], ['Intelligent', 1.0277411616161616], ['software', 1.025762941919192], ['entering', 1.0136300304633639], ['agent', 1.010029356060606], ['expanding', 1.0068562510020844], ['control', 1.0028733666025333], ['refer', 1.0025822310405643], ['products', 1.0002916466249798], ['home', 0.9931606140772806], ['Windows', 0.9823370490620491], ['computers', 0.9823370490620491