In [ ]:
from gensim.models import fasttext as ft
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
import operator
import re

In [ ]:
# Fetch the NLTK data this notebook relies on:
#   - 'wordnet'   backs the WordNet lemmatizer,
#   - 'stopwords' backs the English stop-word list,
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize, which rank() calls.
# Both tokenizer ids are requested because which one word_tokenize loads
# depends on the installed NLTK release.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [ ]:
# Pre-trained English fastText vectors, downloadable from
# https://fasttext.cc/docs/en/crawl-vectors.html
# The binary is looked up relative to the current working directory.
# The globals() guard keeps a re-run of this cell from loading it again.
if "model" not in globals():
    model = ft.load_facebook_vectors(
        os.path.join(os.getcwd(), "binaries/data/cc.en.300.bin")
    )

In [ ]:
# Number of following words each word is compared against in rank().
window_size = 3
# NOTE: despite the name, this is a WordNet lemmatizer, not a stemmer.
stemmer = nltk.wordnet.WordNetLemmatizer()
# English stop words, held in a set for fast membership tests.
stops = set(stopwords.words("english"))

In [ ]:
def rank(text, top=0, collapse=False):
    """Rank the words (or phrases) of *text* by a context-similarity score.

    The text is lower-cased, tokenized, lemmatized and stripped of stop
    words.  A word is kept when it has at least three letters (after
    removing every non-letter character) or when its letters-only form is
    in the embedding vocabulary.

    Each kept word is compared with the ``window_size`` words that follow
    it.  The similarity ``s`` of a pair is rescaled with ``(s + 1) / 2``
    (which maps a cosine similarity in [-1, 1] onto [0, 1]) and added to
    the running totals of both words, so by the time a position is scored
    its total holds contributions from both sides of its window.  The total
    is divided by (occurrences of the word in the text) * (window span,
    including the word itself).  A word occurring several times keeps the
    smallest of its per-position scores.

    Args:
        text: The text to analyse.
        top: Maximum number of entries to return; ``0`` returns all.
        collapse: When true, the lowest-scoring third of the ranked words
            is merged into phrases by ``collapsePhrases`` and the phrases
            are returned instead of single words.

    Returns:
        A list of ``[word, score]`` (or ``[phrase, score]``) lists sorted by
        ascending score.  Words are reported in their letters-only form and
        entries with fewer than three letters are omitted.
    """
    # Work with both a full model (vectors under ``.wv``) and bare keyed
    # vectors.  gensim 4 renamed the vocabulary mapping to ``key_to_index``;
    # gensim 3 exposes it as ``vocab``.
    vectors = getattr(model, "wv", model)
    vocab = getattr(vectors, "key_to_index", None)
    if vocab is None:
        vocab = vectors.vocab
    non_letters = re.compile('[^a-zA-Z]')

    tokens = nltk.word_tokenize(text.lower())
    lemmas = [stemmer.lemmatize(token) for token in tokens]
    cleaned_words = [lemma for lemma in lemmas if lemma not in stops]

    # Keep the original (unstripped) lemma; only the test uses the
    # letters-only form.
    filtered_words = []
    counts = {}
    for word in cleaned_words:
        letters = non_letters.sub('', word)
        if len(letters) >= 3 or letters in vocab:
            filtered_words.append(word)
            counts[word] = counts.get(word, 0) + 1

    last_index = len(filtered_words) - 1
    score_list = np.zeros(len(filtered_words))
    scores = {}
    for i, word in enumerate(filtered_words):
        left_bound = max(0, i - window_size)
        right_bound = min(last_index, i + window_size)
        context_size = right_bound - left_bound + 1
        # Only look forward: pairs with earlier words were already credited
        # to this position when those words were visited.
        for j in range(i + 1, right_bound + 1):
            similarity = vectors.similarity(word, filtered_words[j])
            similarity = (similarity + 1) / 2.0
            score_list[i] += similarity
            score_list[j] += similarity
        word_score = score_list[i] / (counts[word] * context_size)
        if word in scores:
            scores[word] = min(scores[word], word_score)
        else:
            scores[word] = word_score

    ranked = sorted(map(list, scores.items()), key=operator.itemgetter(1))
    filteredWordScores = []
    for word, score in ranked:
        letters = non_letters.sub('', word)
        if len(letters) > 2:
            filteredWordScores.append([letters, score])

    if collapse:
        lowest_third = filteredWordScores[:len(filteredWordScores) // 3]
        results = collapsePhrases(tokens, scores, lowest_third, cleaned_words)
    else:
        results = filteredWordScores
    if top == 0:
        top = len(results)
    return results[:top]

In [ ]:
def _store_phrase(phrases, parts, total_score, word_count, used_tokens):
    """Record the finished run *parts* in *phrases* if it qualifies."""
    if word_count == 0:
        return
    text = " ".join(parts)
    # A run with a single scored word is kept only when its text is not
    # itself one of the tokens already consumed by a phrase.
    if word_count > 1 or text not in used_tokens:
        phrases[text] = total_score / word_count


def collapsePhrases(tokens, scores, filteredWordScores, cleaned_words):
    """Merge adjacent ranked words of a token stream into scored phrases.

    The tokens are scanned left to right.  Consecutive tokens whose lemma
    is one of the ranked words form a run.  A stop word inside a run does
    not end it when the next token whose lemma occurs in *cleaned_words* is
    again a ranked word; the tokens in between are then included in the
    phrase text.  Any other token ends the run.  A phrase's score is the
    mean of the scores of its ranked words (bridging tokens do not count).

    Args:
        tokens: Tokens of the text, in order.
        scores: Mapping from lemma to score.
        filteredWordScores: Sequence of ``[word, score]`` entries; only the
            words are used, as the set of ranked words.
        cleaned_words: Lemmas of the text that are not stop words.

    Returns:
        A list of ``[phrase, score]`` lists sorted by ascending score.

    Raises:
        KeyError: If a ranked word found in the text has no entry in
            *scores*.
    """
    dictionary = set(cleaned_words)
    bag_of_words = {entry[0] for entry in filteredWordScores}
    used_tokens = set()
    phrases = {}

    parts = []        # tokens of the run currently being built
    total_score = 0   # sum of the scores of the ranked words in the run
    word_count = 0    # number of ranked words in the run

    n_tokens = len(tokens)
    i = 0
    while i < n_tokens:
        token = tokens[i]
        word = stemmer.lemmatize(token)
        if parts and (word in stops or token in stops):
            # Look ahead to the next token that is a known content word.
            j = i + 1
            while j < n_tokens and stemmer.lemmatize(tokens[j]) not in dictionary:
                j += 1
            if j < n_tokens and stemmer.lemmatize(tokens[j]) in bag_of_words:
                # The run continues: absorb the bridging tokens and resume
                # at the ranked word (i is advanced to j below).
                bridge = tokens[i:j]
                parts.extend(bridge)
                used_tokens.update(bridge)
                i = j - 1
            else:
                _store_phrase(phrases, parts, total_score, word_count, used_tokens)
                parts, total_score, word_count = [], 0, 0
        elif word in bag_of_words:
            total_score += scores[word]
            word_count += 1
            used_tokens.add(token)
            parts.append(token)
        else:
            _store_phrase(phrases, parts, total_score, word_count, used_tokens)
            parts, total_score, word_count = [], 0, 0
        i += 1

    # A run that reaches the end of the text has no terminating token, so
    # it must be stored here or it would be lost.
    _store_phrase(phrases, parts, total_score, word_count, used_tokens)

    return sorted(map(list, phrases.items()), key=lambda entry: entry[1])