# Universal Sentence Encoder

In [1]:
#Libraries
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import sys
tf.disable_eager_execution()

In [2]:
# Create the TF-Hub module for the Universal Sentence Encoder (version 2 URL).
# It is built once here and reused by the embedding cell below.
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

In [9]:
# Sentences to embed; `embeddings` ends up holding one vector per sentence,
# in the same order as `title_list`.
title_list = ['Dad is cooking something', 'Dad is preparing something']
embeddings = []

try:
    # Reduce logging output.
    tf.logging.set_verbosity(tf.logging.ERROR)
    with tf.Session() as session:
        # Variables and lookup tables must be initialised before the module
        # can be evaluated in this session.
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(title_list))
except Exception as exc:
    # Still best-effort (the cell does not abort), but the failure is now
    # reported instead of being swallowed silently, so an empty `embeddings`
    # list in later cells can be traced back to its cause.
    print("Failed to compute sentence embeddings:", repr(exc), file=sys.stderr)
else:
    # Only reached when the session ran successfully.
    embeddings.extend(message_embeddings)

In [10]:
# Dot product of the two sentence embeddings, used as a similarity score.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-length — confirm for this model before interpreting the value.
np.dot(embeddings[0], embeddings[1])

0.8476613

# Keywords Extraction

In [None]:
!pip install fuzzywuzzy
!pip install unidecode
!pip install rake-nltk

## TextRank4Keyword

In [13]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style graph ranking.

    https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0

    Typical use: call ``analyze(text, ...)`` and then ``get_keywords()`` to
    read back the candidate words and their scores.
    """

    def __init__(self, iter_steps = 10):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = iter_steps  # maximum number of iteration steps
        self.node_weight = None  # word -> score, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Mark the default stop words plus ``stopwords`` as stop words.

        This flags the lexemes on the module-level ``nlp`` vocabulary, so the
        change is visible to every user of that pipeline, not only this
        instance.
        """
        for word in STOP_WORDS.union(stopwords):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of ``doc``, the words kept as candidates.

        A token is kept when its ``pos_`` tag is in ``candidate_pos`` and it
        is not flagged as a stop word. When ``lower`` is true the kept words
        are lower-cased.

        Returns a list with one list of words per sentence (possibly empty).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the distinct co-occurrence pairs found inside each sentence.

        Each word is paired with the following words of the same sentence
        that fall inside a window of ``window_size`` words starting at it
        (i.e. up to ``window_size - 1`` successors). Pairs are ordered
        ``(earlier_word, later_word)`` and returned in order of first
        occurrence, without duplicates.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list alone keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                last = min(i + window_size, len(sentence))
                for j in range(i + 1, last):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalized, symmetric adjacency matrix.

        ``vocab`` maps words to row/column indices and ``token_pairs`` lists
        the connected word pairs. Every column that has at least one
        connection is scaled to sum to 1; columns of words without any
        connection stay all zero.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be pre-filled: with `where`
        # alone, entries of skipped (zero-sum) columns are left uninitialized.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, mean_threshold=False):
        """Return ``(keywords, scores)`` from the last ``analyze`` call.

        With ``mean_threshold=False`` all words are returned in vocabulary
        order (not sorted by score). With ``mean_threshold=True`` only words
        whose score is at least the mean score are returned, sorted by
        descending score (ties keep vocabulary order).

        Both lists hold plain Python values. ``([], [])`` is returned when
        there is nothing to report (no analysis yet, or no candidate words).
        """
        if not self.node_weight:
            return [], []

        keywords = list(self.node_weight)
        scores = [float(score) for score in self.node_weight.values()]
        if not mean_threshold:
            return keywords, scores

        mean_score = np.mean(scores)
        ranked = sorted(
            ((word, score) for word, score in zip(keywords, scores)
             if score >= mean_score),
            key=lambda item: item[1],
            reverse=True,
        )
        return [word for word, _ in ranked], [score for _, score in ranked]

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Main function to analyze text.

        Parses ``text`` with the module-level ``nlp`` pipeline, builds the
        word co-occurrence graph and runs the damped ranking iteration. The
        resulting word -> score mapping is stored in ``self.node_weight``;
        nothing is returned.

        Parameters:
            text: the text to analyze.
            candidate_pos: POS tags of the tokens to keep as candidates.
            window_size: co-occurrence window, see ``get_token_pairs``.
            lower: lower-case the candidate words when true.
            stopwords: extra stop words, see ``set_stopwords``.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or steps run out
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [16]:
# Rank the candidate words of the first example sentence. Verbs and
# adjectives are included as candidates here, and words are lower-cased.
tr4w = TextRank4Keyword()
tr4w.analyze('Dad is cooking something delicious', candidate_pos = ['NOUN', 'PROPN', 'VERB', 'ADJ'], window_size=4, lower=True)
# With mean_threshold=False every candidate word is returned together with
# its score, as a (keywords, scores) pair.
keywords = tr4w.get_keywords(mean_threshold = False)
print(keywords)

(['dad', 'cooking', 'delicious'], [1.0, 1.0, 1.0])


In [17]:
# Same analysis for the second example sentence ("preparing" instead of
# "cooking"), using a fresh instance so no scores carry over.
tr4w = TextRank4Keyword()
tr4w.analyze('Dad is preparing something delicious', candidate_pos = ['NOUN', 'PROPN', 'VERB', 'ADJ'], window_size=4, lower=True)
# (keywords, scores) pair for all candidate words.
keywords = tr4w.get_keywords(mean_threshold = False)
print(keywords)

(['dad', 'preparing', 'delicious'], [1.0, 1.0, 1.0])


In [26]:
text = """Diablo was conceived by David Brevik during the development of the fighting game Justice League Task Force (1995), 
developed by Japanese studio Sunsoft with two American studios, Condor Games (later Blizzard North) on the Sega Genesis version 
and Silicon & Synapse on the SNES version, which by the end of Justice League Task Force's development, had renamed themselves 
Blizzard Entertainment.[18] Brevik's concept was a personal computer game based heavily on the roguelike genre that featured 
turn-based gameplay,[19] but he wanted to improve how quickly the player would be able to get into the game compared to typical 
role-playing games. Brevik was inspired by NHL '94 and similar sports games to make it so that players only had to select a 
pre-determined class and would be able to jump into the game with minimal interactions. Brevik also wanted these classes to be 
combinations of typical character classes so that players would be not overly restricted in what type of attacks or equipment 
they could use. A further departure from the roguelike approach was to make the loot system from felled monsters more expansive.
[20] According to Matt Barton, the game Telengard, released by Avalon Hill in 1982, influenced the development of Diablo.[21] 
Barton and Bill Loguidice also cite The Legend of Zelda series as an influence on Diablo, particularly its move towards real-time 
action, away from the stat-heavy, turn-based gameplay of earlier computer RPGs.[22] Brevik also wanted a "modern and cool" 
interface intended to bring the quick directness of console games as well as Doom (1993) to computer RPGs.[23] He had named the 
game idea Diablo based on Mount Diablo, which was where Brevik lived when he conceived of the game idea.[20] Condor started 
development while pitching the game to publishers. Bill Roper said that the team's "initial pitch, in a nutshell, was to take 
the excitement and randomness of games like Moria, Nethack, and Rogue, and bring them into the 1990s with fantastic graphics and 
sound".[24] Condor Games had tried to shop Brevik's idea to other publishers but was turned down on the grounds that "RPGs are 
dead".[23] While Condor sought a publisher for Brevik's idea, Blizzard had completed their next game, the real-time strategy 
Warcraft: Orcs & Humans (1994) for the personal computer. Condor Games saw Blizzard shared their same interest in personal 
computer games and pitched Brevik's idea to them. About three to four months into development, Blizzard offered to work with 
them on Diablo but requested two major changes, to make it real-time and to have multiplayer, having had done these steps during 
their work in creating Warcraft.[23] Brevik did not want to change the turn-based nature of his game at first fearing that it would 
add extra time onto development, and Condor's development team put the idea to a vote, with the real-time approach winning out.[20] 
Brevik prepared Blizzard for the potential added development time and to ask for extra development costs, and then on a Friday 
afternoon as everyone had left, started to toy around with converting the game from turn-based to real-time; within hours he had 
the basics of the system in place, and was able to show this to the rest of Condor by the following Monday. They kept this news 
from Blizzard for a short while but eventually revealed how fast they had come up with the change, which Blizzard readily accepted.[20] 
The game was also originally conceived to be made in claymation (much like ClayFighter), but they decided to have a 3D isometric 
style instead.[25][26][27] In 1996, as development on Diablo continued, Blizzard acquired Condor Games. Blizzard named their Irving, 
California location to Blizzard South and Condor Games to Blizzard north to distinguish their studios.[28] Around eight months before 
the planned release, Blizzard South were finishing up Warcraft II, and began focusing on the upcoming release of Diablo. Blizzard did 
not want to rely on existing online gaming services like Total Entertainment Network for matchmaking. While Blizzard North finished up 
Diablo, Blizzard South began development of the basis of Battle.net. According to Brevik, when Blizzard South came up to start to see 
how Diablo's multiplayer code would incorporate with Battle.net, they discovered then that Diablo had no multiplayer code as Brevik 
nor others had any idea how to write such code. Blizzard South sent employees up to Blizzard North, including Mike O'Brien, at that 
point to incorporate multiplayer for Diablo and interface with Battle.net over the last six months of development.[29]"""

# Run the keyword ranking on the long passage in `text`, keeping only nouns
# and proper nouns as candidates and lower-casing them.
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=True)
# (keywords, scores) pair for all candidate words; not sorted by score.
keywords = tr4w.get_keywords(mean_threshold = False)
print(keywords)

(['diablo', 'david', 'brevik', 'development', 'fighting', 'game', 'justice', 'league', 'task', 'force', 'studio', 'sunsoft', 'studios', 'condor', 'games', 'blizzard', 'north', 'sega', 'genesis', 'version', 'silicon', 'synapse', 'snes', 'end', 'entertainment.[18', 'concept', 'computer', 'genre', 'turn', 'gameplay,[19', 'player', 'role', 'nhl', 'sports', 'players', 'class', 'interactions', 'classes', 'combinations', 'character', 'type', 'attacks', 'equipment', 'departure', 'approach', 'loot', 'system', 'monsters', 'matt', 'barton', 'telengard', 'avalon', 'hill', 'diablo.[21', 'bill', 'loguidice', 'legend', 'zelda', 'series', 'influence', 'time', 'action', 'gameplay', 'rpgs.[22', 'interface', 'directness', 'console', 'doom', 'rpgs.[23', 'idea', 'mount', 'publishers', 'roper', 'team', 'pitch', 'nutshell', 'excitement', 'randomness', 'moria', 'nethack', 'rogue', 'graphics', 'grounds', 'rpgs', 'dead".[23', 'publisher', 'strategy', 'warcraft', 'orcs', 'humans', 'interest', 'months', 'changes'

In [27]:
# Reorder the keywords by descending score: argsort gives ascending score
# indices, [::-1] reverses them.
np.array(keywords[0])[np.argsort(keywords[1])[::-1]]

array(['blizzard', 'game', 'games', 'development', 'diablo', 'brevik',
       'time', 'condor', 'idea', 'computer', 'south', 'turn',
       'multiplayer', 'players', 'system', 'north', 'version', 'team',
       'approach', 'classes', 'total', 'justice', 'task', 'bill',
       'league', 'barton', 'warcraft', 'code', 'moria', 'entertainment',
       'publishers', 'force', 'interface', 'services', 'randomness',
       'battle.net', 'legend', 'excitement', 'place', 'rest', 'character',
       'nethack', 'pitch', 'avalon', 'nutshell', 'telengard', 'type',
       'network', 'zelda', 'console', 'release', 'synapse', 'series',
       'snes', 'loot', 'silicon', 'loguidice', 'vote', 'months', 'sega',
       'influence', 'genesis', 'gaming', 'end', 'studios', 'rogue',
       'action', 'attacks', 'interest', 'changes', 'class', 'hill',
       'combinations', 'location', 'afternoon', 'friday', 'sunsoft',
       'directness', 'fearing', 'matchmaking', 'steps', 'studio',
       'sports', 'fighting', 