In [ ]:
from gensim.models import fasttext as ft
import numpy as np
import os
import nltk
import operator
import re
from nltk.stem.porter import *

In [ ]:
# WordNet data backs the WordNetLemmatizer created further down.
nltk.download('wordnet')
nltk.download('stopwords')
# rank() tokenizes with nltk.word_tokenize, which relies on the Punkt sentence
# tokenizer data. Newer NLTK releases look the data up as 'punkt_tab', older
# ones as 'punkt'; requesting both keeps the notebook working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [ ]:
# Pre-trained fastText vectors, see https://fasttext.cc/docs/en/crawl-vectors.html
# The guard skips the load when `model` already exists in the notebook
# namespace (e.g. the cell is re-run), so the binary is read at most once.
if "model" not in globals():
    model = ft.load_facebook_vectors(
        os.path.join(os.getcwd(), "binaries/data/cc.en.300.bin")
    )

In [ ]:
# Number of neighbouring words considered on each side of a word in rank().
window_size=3
# NOTE: despite its name this is a WordNet *lemmatizer*, not a stemmer; the
# name is kept because rank() and collapsePhrases() refer to it.
stemmer = nltk.wordnet.WordNetLemmatizer()
porterStemmer=PorterStemmer()
# Stop-word list, one word per line: https://github.com/Alir3z4/stop-words
# The context manager makes sure the file handle is closed once the set is built.
with open(os.path.join(os.getcwd(),"binaries/data/stopwords-en.txt"),encoding='utf8') as stopwords_file:
    stops={line.strip() for line in stopwords_file}

In [ ]:
def rank(text,top=None,collapse=False):
    """Rank the content words of ``text`` by similarity to their neighbours.

    The text is lower-cased, tokenized, lemmatized and stripped of stop words.
    Every remaining word is compared (via ``model.similarity``, rescaled from
    [-1, 1] to [0, 1]) with the words up to ``window_size`` positions on either
    side of it.  The summed similarity of an occurrence is divided by the
    word's frequency times the size of its window; a word that occurs several
    times keeps the smallest such value.  Results are sorted in ascending
    order of score, so the lowest-scoring words come first.

    Args:
        text: Raw input text.
        top: Number of entries to return.  When ``None`` it defaults to a
            third of the ranked words, or ``min(10, words // 3)`` when
            ``collapse`` is set.
        collapse: When true, the top words are merged into phrases with
            ``collapsePhrases`` and re-scored with ``diversifyResults``.

    Returns:
        A list of ``[word, score]`` pairs, or ``[phrase, score]`` pairs when
        ``collapse`` is true, truncated to ``top`` entries.
    """
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stemmed_words = [stemmer.lemmatize(word) for word in tokens]
    cleaned_words = [
        word for word in stemmed_words
        if porterStemmer.stem(word) not in stops and word not in stops
    ]

    # gensim 4 replaced KeyedVectors.vocab with key_to_index (accessing .vocab
    # raises AttributeError there); fall back to .vocab for gensim 3.x models.
    vocabulary = getattr(model, "key_to_index", None)
    if vocabulary is None:
        vocabulary = model.vocab

    # Keep only the letters of each word; short leftovers survive only when
    # the model knows them as a word of their own.
    non_letters = re.compile('[^a-zA-Z]')
    filtered_words = []
    counts = dict()
    for word in cleaned_words:
        filtered = non_letters.sub('', word)
        if len(filtered) >= 3 or filtered in vocabulary:
            filtered_words.append(filtered)
            counts[filtered] = counts.get(filtered, 0) + 1

    word_total = len(filtered_words)
    scoreList = np.zeros(word_total)
    scores = dict()
    for i, current in enumerate(filtered_words):
        leftBound = max(0, i - window_size)
        rightBound = min(word_total - 1, i + window_size)
        contextSize = rightBound - leftBound + 1
        # Only look forward: pairs with earlier words were already credited to
        # scoreList[i] while those earlier words were being processed.
        for j in range(i + 1, rightBound + 1):
            similarity_score = model.similarity(current, filtered_words[j])
            similarity_score = (similarity_score + 1) / 2.0
            scoreList[i] += similarity_score
            scoreList[j] += similarity_score
        wordScore = scoreList[i] / (counts[current] * contextSize)
        # A repeated word keeps the lowest score of all its occurrences.
        if current not in scores:
            scores[current] = wordScore
        else:
            scores[current] = min(scores[current], wordScore)

    # Ascending order; the keys are already letters-only, so only the length
    # filter is needed to drop the short words admitted via the vocabulary.
    ranked = sorted(scores.items(), key=lambda item: item[1])
    filteredWordScores = [[word, score] for word, score in ranked if len(word) > 2]

    if collapse:
        if top is None:
            top = min(10, len(filtered_words) // 3)
        phraseScores = collapsePhrases(tokens, scores, filteredWordScores[:top], cleaned_words)
        phraseScores = diversifyResults([x[0] for x in phraseScores])
        return phraseScores[:top]

    if top is None:
        top = len(filteredWordScores) // 3
    return filteredWordScores[:top]

In [None]:
def diversifyResults(phrases):
    """Score each phrase by its mean similarity to all the other phrases.

    Similarities come from ``model.similarity`` and are rescaled from
    [-1, 1] to [0, 1] before averaging.

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        A list of ``[phrase, mean_similarity]`` pairs sorted by descending
        score.  With fewer than two phrases there is nothing to compare
        against, so each phrase is returned with a score of ``0.0``.
    """
    scored = [[phrase, 0.0] for phrase in phrases]
    total = len(scored)
    if total < 2:
        # Avoids dividing by (total - 1) == 0 for a single phrase.
        return scored
    # Similarity is symmetric, so visit every unordered pair exactly once and
    # credit both members; this yields the true mean rather than twice it.
    for first in range(total):
        for second in range(first + 1, total):
            score = model.similarity(scored[first][0], scored[second][0])
            score = (score + 1) / 2.0
            scored[first][1] += score
            scored[second][1] += score
    for entry in scored:
        entry[1] = entry[1] / (total - 1)
    return sorted(scored, key=lambda x: x[1], reverse=True)

In [ ]:
def collapsePhrases(tokens,scores,filteredWordScores,cleaned_words,sort=False):
    """Merge adjacent keywords of the original token stream into phrases.

    Walks ``tokens`` in order.  Consecutive tokens whose lemma is one of the
    selected keywords are joined into a phrase.  A stop word inside a phrase
    is bridged (kept, together with any following tokens that are not in
    ``cleaned_words``) when the next word from ``cleaned_words`` is again a
    keyword; otherwise the phrase ends.  Each phrase is scored with the mean
    of ``scores`` over its keywords.

    Args:
        tokens: Tokens of the text, in order.
        scores: Mapping from keyword to score; must contain every keyword
            listed in ``filteredWordScores``.
        filteredWordScores: Sequence of ``[word, score]`` pairs naming the
            keywords that may form phrases (only the word is used).
        cleaned_words: Lemmatized, stop-word-free words of the text.
        sort: When true, sort the result by ascending score.

    Returns:
        A list of ``[phrase, score]`` pairs.  A phrase that occurs more than
        once appears once, with the score of its last occurrence.
    """
    dictionary = set(cleaned_words)
    bagOfWords = {entry[0] for entry in filteredWordScores}
    # Lemmatize every token once instead of on each look-ahead.
    lemmas = [stemmer.lemmatize(token) for token in tokens]
    token_total = len(tokens)

    phrases = dict()
    parts = []
    totalScore = 0
    wordCount = 0

    def flush():
        """Store the phrase collected so far (if any) and start a new one."""
        nonlocal parts, totalScore, wordCount
        if wordCount > 0:
            phrases[" ".join(parts)] = totalScore / wordCount
            parts = []
            totalScore = 0
            wordCount = 0

    i = 0
    while i < token_total:
        word = lemmas[i]
        if (word in stops or tokens[i] in stops) and parts:
            # Look ahead to the next word that survived cleaning.
            j = i + 1
            while j < token_total and lemmas[j] not in dictionary:
                j += 1
            if j < token_total and lemmas[j] in bagOfWords:
                # Bridge the gap; the keyword at j is added on the next pass.
                parts.extend(tokens[i:j])
                i = j - 1
            else:
                flush()
        elif word in bagOfWords:
            totalScore += scores[word]
            wordCount += 1
            parts.append(tokens[i])
        else:
            flush()
        i += 1
    # A phrase still open when the tokens run out would otherwise be lost.
    flush()

    phraseScores = list(map(list, phrases.items()))
    if sort:
        phraseScores = sorted(phraseScores, key=lambda x: x[1])
    return phraseScores