In [1]:
from gensim.models import fasttext as ft
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
import operator
import re

In [2]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('wordnet')    # data used by the WordNet lemmatizer
nltk.download('stopwords')  # stopwords.words("english")
# rank() calls nltk.word_tokenize, which needs the Punkt tokenizer models;
# without them a fresh environment fails with a LookupError on first use.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' -
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# https://fasttext.cc/docs/en/crawl-vectors.html
# Load the fastText vectors stored in binaries/data/cc.en.300.bin. The path is
# resolved against the current working directory, so the notebook has to be
# started from the directory that contains the "binaries" folder.
model=ft.load_facebook_vectors(os.path.join(os.getcwd(),"binaries/data/cc.en.300.bin"))

  "C extension not loaded, training will be slow. "


In [4]:
# Number of neighbouring words on each side that rank() compares a word with.
window_size=3
# NOTE: despite its name this is a lemmatizer, not a stemmer; the name is kept
# because rank() and collapsePhrases() refer to it.
# The class is reached through nltk.stem, its documented location, rather than
# through the incidental `nltk.wordnet` re-export.
stemmer = nltk.stem.WordNetLemmatizer()
# English stop words as a set for O(1) membership tests.
stops = set(stopwords.words("english"))

In [5]:
def rank(text,top=0,collapse=True):
    """Score the words of ``text`` by embedding similarity to their neighbours.

    The text is lower-cased, tokenized, lemmatized and stripped of stop words.
    Each remaining word is compared with the words up to ``window_size``
    positions away; every similarity ``s`` returned by the model is mapped to
    ``(s + 1) / 2`` and added to both words of the pair. The total of one
    occurrence is divided by (number of occurrences of the word * size of its
    window), and a word that occurs several times keeps its smallest value.

    Parameters
    ----------
    text : str
        Text to analyse.
    top : int, optional
        Maximum number of entries to return; ``0`` (default) returns all.
    collapse : bool, optional
        When True (default), the first third of the ranked words is handed to
        ``collapsePhrases`` and the resulting phrases are returned instead of
        single words.

    Returns
    -------
    list of [str, float]
        ``[name, score]`` pairs sorted by ascending score (lowest first).
        Word names have every non-letter character removed, and names with
        fewer than three letters are omitted.
    """
    # `model` is created by load_facebook_vectors, so it already is the
    # keyed-vectors object and the deprecated `.wv` indirection is not needed.
    # Gensim 4 replaced the `vocab` dict with `key_to_index`; use whichever the
    # installed version provides.
    vocab = getattr(model, "key_to_index", None)
    if vocab is None:
        vocab = model.vocab
    non_alpha = re.compile('[^a-zA-Z]')

    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stemmed_words = [stemmer.lemmatize(word) for word in tokens]
    cleaned_words = [word for word in stemmed_words if word not in stops]

    # Keep words with at least three letters, plus shorter ones the model
    # knows; pure punctuation reduces to "" and is dropped here.
    filtered_words = []
    counts = dict()
    for word in cleaned_words:
        filtered = non_alpha.sub('', word)
        if len(filtered) >= 3 or filtered in vocab:
            filtered_words.append(word)
            counts[word] = counts.get(word, 0) + 1

    total = len(filtered_words)
    scoreList = np.zeros(total)
    scores = dict()
    for i, word in enumerate(filtered_words):
        leftBound = max(0, i - window_size)
        rightBound = min(total - 1, i + window_size)
        contextSize = rightBound - leftBound + 1
        # Only look forward: pairs with earlier words were already added to
        # scoreList[i] when those earlier positions were processed.
        for j in range(i + 1, rightBound + 1):
            similarity_score = model.similarity(word, filtered_words[j])
            similarity_score = (similarity_score + 1) / 2.0
            scoreList[i] += similarity_score
            scoreList[j] += similarity_score
        wordScore = scoreList[i] / (counts[word] * contextSize)
        if word in scores:
            scores[word] = min(scores[word], wordScore)
        else:
            scores[word] = wordScore

    # Ascending order: the lowest score is ranked first.
    filteredWordScores = []
    for word, score in sorted(scores.items(), key=lambda item: item[1]):
        filtered = non_alpha.sub('', word)
        if len(filtered) > 2:
            filteredWordScores.append([filtered, score])

    if collapse:
        # Only the first third of the ranking may form phrases.
        ranked = collapsePhrases(tokens, scores, filteredWordScores[:len(filteredWordScores)//3], cleaned_words)
    else:
        ranked = filteredWordScores
    if top == 0:
        top = len(ranked)
    return ranked[:top]

In [6]:
def collapsePhrases(tokens,scores,filteredWordScores,cleaned_words):
    """Merge neighbouring keywords into phrases and score each phrase.

    Consecutive tokens whose lemma is a keyword are joined into one phrase.
    A stop word that follows a started phrase is bridged (kept inside the
    phrase together with any further skipped tokens) when the next token whose
    lemma occurs in ``cleaned_words`` is itself a keyword; otherwise the phrase
    ends there. Any other token ends the current phrase.

    Parameters
    ----------
    tokens : list of str
        Tokens of the text in their original order.
    scores : dict
        Maps a lemmatized word to its score.
    filteredWordScores : list of [str, float]
        Keywords allowed inside phrases; only the word (index 0) is used.
    cleaned_words : list of str
        Lemmatized tokens of the text without stop words.

    Returns
    -------
    list of [str, float]
        ``[phrase, score]`` pairs sorted by ascending score. A phrase's score
        is the mean score of its keywords (bridged tokens do not count). If the
        same phrase occurs more than once, the last occurrence's score is kept.
    """
    dictionary = set(cleaned_words)
    bagOfWords = {entry[0] for entry in filteredWordScores}
    phrases = dict()
    phrase_tokens = []    # tokens of the phrase currently being built
    keyword_scores = []   # scores of the keywords inside that phrase

    def flush():
        # Store the phrase under construction (if any) and start a new one.
        if keyword_scores:
            phrases[" ".join(phrase_tokens)] = sum(keyword_scores) / len(keyword_scores)
            del keyword_scores[:]
            del phrase_tokens[:]

    token_count = len(tokens)
    i = 0
    while i < token_count:
        token = tokens[i]
        word = stemmer.lemmatize(token)
        if (word in stops or token in stops) and phrase_tokens:
            # Skip ahead to the next token that survived stop-word removal.
            j = i + 1
            while j < token_count and stemmer.lemmatize(tokens[j]) not in dictionary:
                j += 1
            if j < token_count and stemmer.lemmatize(tokens[j]) in bagOfWords:
                # The gap leads to another keyword: keep the skipped tokens in
                # the phrase and resume at that keyword on the next iteration.
                phrase_tokens.extend(tokens[i:j])
                i = j - 1
            else:
                flush()
        elif word in bagOfWords and word in scores:
            # bagOfWords holds punctuation-stripped names while `scores` is
            # keyed by raw lemmas, so membership in `scores` is checked too
            # instead of risking a KeyError.
            keyword_scores.append(scores[word])
            phrase_tokens.append(token)
        else:
            flush()
        i += 1
    # A phrase that runs up to the very last token has no following token to
    # terminate it, so store it here.
    flush()

    # Ascending order: the lowest score is ranked first.
    return [[phrase, score] for phrase, score in sorted(phrases.items(), key=lambda item: item[1])]

In [7]:
# Demo: score every word of a short passage, without collapsing into phrases.
rank("""
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages. fastText uses a neural network for word embedding.
""", top=0, collapse=False)

  # This is added back by InteractiveShellApp.init_path()


[['word', 0.1458354284365972],
 ['learning', 0.16090598447933896],
 ['fasttext', 0.19167518741596723],
 ['facebook', 0.23167448633882617],
 ['model', 0.24257811784212077],
 ['library', 0.4467983305454254],
 ['embedding', 0.45143986865878105],
 ['created', 0.4735349738704307],
 ['network', 0.47493385321771103],
 ['available', 0.48399459198117256],
 ['classification', 0.48418148620320217],
 ['pretrained', 0.4844980499308024],
 ['language', 0.4857285368655409],
 ['make', 0.4907319114676544],
 ['research', 0.4967011834627816],
 ['neural', 0.4974703682320459],
 ['obtaining', 0.49845168300505194],
 ['lab', 0.5003112197986671],
 ['vector', 0.509221863001585],
 ['text', 0.5125238767692021],
 ['representation', 0.5140907155748989],
 ['create', 0.5158568439739091],
 ['allows', 0.5190341147993293],
 ['algorithm', 0.5199467938925538],
 ['embeddings', 0.523207523460899],
 ['supervised', 0.5408256676580224],
 ['unsupervised', 0.5429242226694312]]