In [1]:
import os
import numpy as np
import gensim

# Indexer

**Preprocessor**

In [2]:
import Stemmer
import os
import re
from functools import partial
from nltk.stem import WordNetLemmatizer

In [3]:
class Preprocessor:
    """Configurable pipeline that turns a raw string into a list of tokens.

    The enabled stages are collected in ``self.steps`` when the object is
    built; ``process`` then applies them one after another.
    """

    def __init__(self, enable_case_folding=True, enable_remove_stop_words=True,
                enable_stemmer=False, enable_lemmatizer=True, min_length=2):
        """Assemble the pipeline.

        Args:
            enable_case_folding: case-fold every token.
            enable_remove_stop_words: drop tokens found in ``self.stop_words``.
            enable_stemmer: stem tokens with the PyStemmer English stemmer.
            enable_lemmatizer: lemmatize tokens with NLTK's WordNetLemmatizer.
            min_length: drop tokens shorter than this; a falsy value disables
                the length filter altogether.
        """
        # A token boundary is any whitespace character or one of the listed
        # punctuation characters.
        self.SPLIT_WORDS_PATTERN = re.compile(r'\s|\.|\:|\?|\(|\)|\[|\]|\{|\}|\<|\>|\'|\!|\"|\-|,|;|\$|\*|\%|#')

        # Splitting always comes first: it converts the string into tokens.
        pipeline = [self.__split_words]

        if enable_case_folding:
            pipeline.append(self.__case_folding)

        if enable_remove_stop_words:
            self.stop_words = {'a', 'able', 'about', 'across', 'after', 'all',
                               'almost', 'also', 'am', 'among', 'an', 'and',
                               'any', 'are', 'as', 'at', 'be', 'because', 'been',
                               'but', 'by', 'can', 'cannot', 'could', 'dear', 'did',
                               'do', 'does', 'either', 'else', 'ever', 'every', 'for',
                               'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her',
                               'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in',
                               'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
                               'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither',
                               'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or',
                               'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she',
                               'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their',
                               'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to',
                               'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when',
                               'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
                               'would', 'yet', 'you', 'your'}
            pipeline.append(self.__remove_stop_words)

        if enable_stemmer:
            self.stemmer = Stemmer.Stemmer('english')
            pipeline.append(self.__stem)

        if enable_lemmatizer:
            self.lemmatizer = WordNetLemmatizer()
            pipeline.append(self.__lemmatize)

        if min_length:
            def drop_short_tokens(tokens):
                # Binds the configured threshold to the generic length filter.
                return self.__remove_short_words(tokens, min_length)
            pipeline.append(drop_short_tokens)

        self.steps = pipeline

    def process(self, words):
        """Run every configured stage over ``words`` (a string) and return the token list."""
        tokens = words
        for stage in self.steps:
            tokens = list(stage(tokens))
        return tokens

    def __split_words(self, words):
        """Split the raw string on the delimiter pattern, discarding empty pieces."""
        return [piece for piece in self.SPLIT_WORDS_PATTERN.split(words) if piece != '']

    def __case_folding(self, words):
        """Case-fold each token."""
        return [token.casefold() for token in words]

    def __remove_stop_words(self, words):
        """Keep only the tokens that are not stop words."""
        return [token for token in words if token not in self.stop_words]

    def __stem(self, words):
        """Stem each token."""
        return [self.stemmer.stemWord(token) for token in words]

    def __lemmatize(self, words):
        """Lemmatize each token."""
        return [self.lemmatizer.lemmatize(token) for token in words]

    def __remove_short_words(self, words, min_length):
        """Keep only the tokens of at least ``min_length`` characters."""
        return [token for token in words if len(token) >= min_length]

**Indexer**

In [4]:
class Indexer:
    """In-memory inverted index over one field of a corpus' documents.

    ``self.dictionary`` maps each term to its posting list, a dict of
    ``{docID: term frequency}``. ``index`` also writes the index to
    ``<output_file_path>/<field>.index``.
    """

    # Position of each indexable field inside a document tuple
    # (documents are accessed as ``document[position]``).
    FIELD_POSITIONS = {
        'title': 2,
        'content': 3,
        'authors': 4,
        'keywords': 5,
        'publication': 7,
    }

    def __init__(self, preprocessor=None, field='content',
                 output_file_path='.\\index'):
        """Create an empty index.

        Args:
            preprocessor: object with a ``process(text) -> list of tokens``
                method; when ``None`` text is split on whitespace.
            field: one of the keys of ``FIELD_POSITIONS``.
            output_file_path: directory the index file is written to. The
                default keeps its historical Windows-style spelling.
        """
        self.preprocessor = preprocessor
        self.output_file_path = output_file_path
        self.dictionary = {}
        self.field_name = field
        self.document_num = 0
        # None marks an unsupported field; index() reports it explicitly
        # instead of failing with an AttributeError.
        self.field = self.FIELD_POSITIONS.get(field)

    def index(self, corpus):
        """Add every document of ``corpus`` to the index and write it to disk.

        Postings are added to whatever ``self.dictionary`` already holds, so
        indexing the same corpus twice doubles its term frequencies.

        Raises:
            ValueError: if the indexer was created with an unsupported field.
        """
        if self.field is None:
            raise ValueError(
                "unsupported field {!r}; expected one of {}".format(
                    self.field_name, sorted(self.FIELD_POSITIONS)))

        self.document_num = len(corpus.documents)
        for document in corpus.documents:
            field_value = document[self.field]
            # List-valued fields are flattened into one string before tokenizing.
            if isinstance(field_value, list):
                field_value = ' '.join(field_value)
            self.__add_into_dictionary(document.docID,
                                       self.__generate_tokens(field_value))
        self.__flush_index_entry()

    def get_posting_list(self, word):
        """Return the ``{docID: tf}`` posting list of ``word`` ({} if unknown)."""
        return self.dictionary.get(word, {})

    def get_term_DF(self, word):
        """Return the number of documents containing ``word`` (0 if unknown)."""
        return len(self.dictionary.get(word, {}))

    def get_doc_TF(self, docID, word):
        """Return how often ``word`` occurs in document ``docID`` (0 if never)."""
        return self.get_posting_list(word).get(docID, 0)

    def generate_tokens(self, words_stream):
        """Tokenize ``words_stream`` with the configured preprocessor.

        Without a preprocessor the text is simply split on whitespace.
        """
        if self.preprocessor is not None:
            return self.preprocessor.process(words_stream)
        return words_stream.split()

    def __generate_tokens(self, words_stream):
        """Private alias of ``generate_tokens`` kept for internal callers."""
        return self.generate_tokens(words_stream)

    def __add_into_dictionary(self, docID, words):
        """Count one occurrence in ``docID`` for every token in ``words``."""
        for word in words:
            posting_list = self.dictionary.setdefault(word, {})
            posting_list[docID] = posting_list.get(docID, 0) + 1

    def __flush_index_entry(self):
        """Write the whole index to ``<output_file_path>/<field>.index``."""
        os.makedirs(self.output_file_path, exist_ok=True)
        index_file_name = os.path.join(self.output_file_path,
                                       self.field_name + ".index")
        # The context manager closes the file even if a write fails.
        with open(index_file_name, 'w', encoding='utf-8') as index_file:
            for term, posting_list in self.dictionary.items():
                self.__write_index_entry(index_file, term, posting_list)

    def __write_index_entry(self, file, term, posting_list):
        """Write one line: ``term<TAB>df<TAB>docID|tf,docID|tf,...``."""
        posting = ['{}|{}'.format(doc_id, tf)
                   for doc_id, tf in posting_list.items()]
        file.write('{}\t{}\t{}\n'.format(term, len(posting_list), ','.join(posting)))

**Corpus**

In [5]:
class Corpus:
    """Collection of documents produced by an extractor.

    The extractor must offer ``extract(path)`` returning a list of documents
    for one file.
    """

    def __init__(self, extractor):
        self.extractor = extractor
        self.documents = []

    def build(self, documents_path):
        """Replace ``self.documents`` with the documents found at ``documents_path``.

        If the path is a directory, every file below it (recursively) is
        extracted; otherwise the path is extracted as a single file.
        """
        documents = []
        if os.path.isdir(documents_path):
            # os.walk yields (directory, sub-directory names, file names);
            # each file name has to be joined with its directory.
            for root, _, filenames in os.walk(documents_path):
                for filename in sorted(filenames):
                    documents += self.extractor.extract(os.path.join(root, filename))
        else:
            documents += self.extractor.extract(documents_path)

        self.documents = documents
    

**OHSUMED_Extractor**

In [6]:
from collections import defaultdict, namedtuple, Counter

In [7]:
class OHSUMED_Extractor():
    """Parser for tagged record files (``.I``, ``.U``, ``.S``, ``.M``, ``.T``,
    ``.P``, ``.W``, ``.A``) as handled by ``extract``."""

    # Record type returned by extract(). The field name 'keyswords' is a
    # historical misspelling that is kept so existing attribute access works.
    Document = namedtuple('Document', ['docID', 'sID', 'title',  'content',
                                       'authors', 'keyswords', 'pType', 'publication'])

    def __init__(self):
        pass

    def extract(self, documents_file):
        """Parse ``documents_file`` and return a list of ``Document`` tuples.

        A tag is identified by the second character of its line. ``.I`` carries
        its value on the same line; every other known tag has its value on the
        following line. A record is emitted when its ``.A`` (authors) tag is
        read, and only if the record had a ``.W`` (content) entry. Parsing
        stops at end of file or at the first line shorter than two characters.

        Raises:
            ValueError: if an ``.I`` or ``.U`` value is not an integer.
        """
        documents = []

        docID = 0
        sID = 0
        title = ''
        content = ''
        authors = []
        keywords = []
        pType = ''
        publication = ''
        have_content = False

        # The context manager closes the file even when parsing raises.
        with open(documents_file, 'r', encoding='utf-8') as file:
            while True:
                line = file.readline()
                if len(line) < 2:
                    break
                tag = line[1]
                if tag == 'I':
                    # A new record starts: clear the previous record's fields so
                    # that a missing tag cannot inherit stale values.
                    docID = 0
                    title = ''
                    content = ''
                    authors = []
                    keywords = []
                    pType = ''
                    publication = ''
                    have_content = False
                    sID = int(line[3:])
                elif tag == 'U':
                    docID = int(file.readline().strip())
                elif tag == 'S':
                    publication = [file.readline().strip()]
                elif tag == 'M':
                    value = file.readline().strip().strip('.')
                    # Entries are ';'-separated; anything after '/' is dropped.
                    keywords = [word.split('/')[0].strip() for word in value.split(';')]
                elif tag == 'T':
                    title = file.readline().strip()
                elif tag == 'P':
                    pType = file.readline().strip()
                elif tag == 'W':
                    have_content = True
                    content = file.readline().strip()
                elif tag == 'A':
                    value = file.readline().strip().strip('.')
                    authors = [author.strip() for author in value.split(';')]
                    if have_content:
                        documents.append(self.Document(docID, sID, title, content,
                                                       authors, keywords, pType, publication))
                    have_content = False

        return documents

# Model

**Boolean**

In [8]:
class BooleanModel:
    """Ranks documents by how many distinct query terms they contain.

    Term frequencies are ignored: a term either occurs in a document or not.
    """

    def __init__(self, indexer):
        self.indexer = indexer

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, score)`` pairs, best score first.

        With ``normalized`` the match count is divided by the square roots of
        the number of distinct query terms and of distinct indexed terms of
        the document.
        """
        query_terms = self.__vectorize(query)
        if normalized:
            ranking = self.__documents_normalized_score(query_terms)
        else:
            ranking = self.__documents_score(query_terms)
        return ranking[:topK]

    def __vectorize(self, content):
        """Tokenize ``content`` and map each distinct token to 1."""
        return dict.fromkeys(self.indexer.generate_tokens(content), 1)

    def __score_tf(self, posting_list1, posting_list2):
        """Count the terms of ``posting_list1`` that also occur in ``posting_list2``."""
        return sum(1 for term in posting_list1 if term in posting_list2)

    def __score_normalized_tf(self, posting_list1, posting_list2):
        """Shared-term count divided by the square roots of both list sizes."""
        size1 = len(posting_list1)
        size2 = len(posting_list2)
        shared = 0
        for term in posting_list1:
            if term in posting_list2:
                shared += 1
        return shared / (math.sqrt(size1) * math.sqrt(size2))

    def __documents_score(self, query_posting_list):
        """Rank documents by the number of query terms they contain."""
        matches = Counter()
        for term in query_posting_list:
            for doc_id in self.indexer.dictionary.get(term, { }):
                matches[doc_id] += 1
        # sorted() is stable, so ties keep the order in which documents were met.
        return sorted(matches.items(), key=lambda pair: pair[1], reverse=True)

    def __documents_normalized_score(self, query_posting_list):
        """Rank documents by the match count normalized by document and query size."""
        index = self.indexer.dictionary

        # Number of distinct indexed terms per document, over the whole index.
        terms_per_document = Counter()
        for postings in index.values():
            for doc_id in postings:
                terms_per_document[doc_id] += 1

        matches = Counter()
        for term in query_posting_list:
            if term in index:
                for doc_id in index[term]:
                    matches[doc_id] += 1

        query_norm = math.sqrt(len(query_posting_list))
        for doc_id in matches:
            document_norm = math.sqrt(terms_per_document[doc_id])
            matches[doc_id] /= (document_norm * query_norm)

        return sorted(matches.items(), key=lambda pair: pair[1], reverse=True)
    

**VSM**

In [9]:
from collections import Counter
import math

In [10]:
class TFModel:
    """Vector-space model that weights terms by their raw term frequency."""

    def __init__(self, corpus, indexer):
        self.indexer = indexer
        self.corpus = corpus

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, score)`` pairs, best score first.

        Query terms that are not in the index match no document; they are
        ignored for matching instead of raising ``KeyError``.
        """
        query_posting_list = self.__vectorize(query)
        if normalized:
            document_scores = self.__documents_normalized_score(query_posting_list)
        else:
            document_scores = self.__documents_score(query_posting_list)

        return document_scores[ : topK]

    def similarity(self, content1, content2, normalized=False):
        """Return the dot product (or cosine, if ``normalized``) of two texts.

        The normalized similarity is 0.0 when either text has no tokens.
        """
        posting_list1 = self.__vectorize(content1)
        posting_list2 = self.__vectorize(content2)

        # Iterate over the shorter vector and look terms up in the longer one.
        if len(posting_list2) < len(posting_list1):
            posting_list1, posting_list2 = posting_list2, posting_list1
        if normalized:
            return self.__score_normalized_tf(posting_list1, posting_list2)
        return self.__score_tf(posting_list1, posting_list2)

    def __vectorize(self, content):
        """Tokenize ``content`` and return ``{term: term frequency}``."""
        return dict(Counter(self.indexer.generate_tokens(content)))

    @staticmethod
    def __rank(document_scores):
        """Return ``(docID, score)`` pairs sorted by descending score."""
        return sorted(document_scores.items(), key=lambda ds: ds[1], reverse=True)

    def __score_tf(self, posting_list1, posting_list2):
        """Dot product of two term-frequency vectors."""
        return sum(tf1 * posting_list2[term]
                   for term, tf1 in posting_list1.items()
                   if term in posting_list2)

    def __score_normalized_tf(self, posting_list1, posting_list2):
        """Cosine of two term-frequency vectors (0.0 if either is empty)."""
        norm1 = sum(tf * tf for tf in posting_list1.values())
        norm2 = sum(tf * tf for tf in posting_list2.values())
        if norm1 == 0 or norm2 == 0:
            return 0.0
        score = self.__score_tf(posting_list1, posting_list2)
        return score / (math.sqrt(norm1) * math.sqrt(norm2))

    def __documents_score(self, query_posting_list):
        """Rank documents by the dot product with the query vector."""
        document_scores = Counter()
        for term, query_tf in query_posting_list.items():
            for docID, tf in self.indexer.dictionary.get(term, {}).items():
                document_scores[docID] += tf * query_tf
        return self.__rank(document_scores)

    def __documents_normalized_score(self, query_posting_list):
        """Rank documents by a normalized dot product with the query.

        NOTE: the document norm is accumulated only over the terms shared
        with the query, so this is not a full cosine; the ``_2`` variant
        below uses the norm over all indexed terms.
        """
        document_scores = Counter()
        document_norms = Counter()
        query_norm = 0

        for term, query_tf in query_posting_list.items():
            query_norm += query_tf * query_tf
            # .get: a query term missing from the index has no postings.
            for docID, tf in self.indexer.dictionary.get(term, {}).items():
                document_norms[docID] += tf * tf
                document_scores[docID] += tf * query_tf

        query_norm = math.sqrt(query_norm)
        for docID in document_scores:
            document_norm = math.sqrt(document_norms[docID])
            document_scores[docID] /= (document_norm * query_norm)

        return self.__rank(document_scores)

    def __documents_normalized_score_2(self, query_posting_list):
        """Rank documents by the cosine with the query, using full document norms.

        Scans the whole index on every call to compute the norms.
        """
        document_scores = Counter()
        document_norms = Counter()
        query_norm = 0

        for documents in self.indexer.dictionary.values():
            for docID, tf in documents.items():
                document_norms[docID] += tf * tf

        for term, query_tf in query_posting_list.items():
            query_norm += query_tf * query_tf
            for docID, tf in self.indexer.dictionary.get(term, {}).items():
                document_scores[docID] += tf * query_tf

        query_norm = math.sqrt(query_norm)
        for docID in document_scores:
            document_norm = math.sqrt(document_norms[docID])
            document_scores[docID] /= (document_norm * query_norm)

        return self.__rank(document_scores)
        

**TF_IDFModel**

In [11]:
class TF_IDFModel:
    """Vector-space model with a selectable TF-IDF weighting scheme.

    Terms whose document frequency is 0 (absent from the index) have no
    defined weight; they are skipped everywhere instead of dividing by zero.
    """

    def __init__(self, corpus, indexer, args):
        """Select the weighting scheme.

        Args:
            corpus: stored as ``self.corpus``.
            indexer: provides ``dictionary``, ``document_num``,
                ``get_term_DF`` and ``generate_tokens``.
            args: ``('add', alpha)``, ``('mul', a, b)``, or ``'log'`` /
                ``('log',)``.

        Raises:
            ValueError: if the scheme is unknown or its parameters are missing.
        """
        self.corpus = corpus
        self.indexer = indexer

        # Accept the scheme either as a bare string or as (name, *params).
        if isinstance(args, str):
            scheme, params = args, ()
        else:
            scheme, params = args[0], tuple(args[1:])

        if scheme == 'add' and len(params) >= 1:
            self.__TF_IDF = partial(self.__TF_IDF_add, alpha=params[0])
        elif scheme == 'mul' and len(params) >= 2:
            self.__TF_IDF = partial(self.__TF_IDF_mul, a=params[0], b=params[1])
        elif scheme == 'log':
            self.__TF_IDF = self.__TF_IDF_log
        else:
            raise ValueError(
                "args must be ('add', alpha), ('mul', a, b) or 'log'; got {!r}".format(args))

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, score)`` pairs, best score first."""
        query_posting_list = self.__vectorize(query)
        if normalized:
            document_scores = self.__documents_normalized_score(query_posting_list)
        else:
            document_scores = self.__documents_score(query_posting_list)

        return document_scores[ : topK]

    def similarity(self, content1, content2, normalized=False):
        """Return the TF-IDF dot product (or cosine, if ``normalized``) of two texts."""
        posting_list1 = self.__vectorize(content1)
        posting_list2 = self.__vectorize(content2)

        # Iterate over the shorter vector and look terms up in the longer one.
        if len(posting_list2) < len(posting_list1):
            posting_list1, posting_list2 = posting_list2, posting_list1
        if normalized:
            return self.__score_normalized_tf_idf(posting_list1, posting_list2)
        return self.__score_tf_idf(posting_list1, posting_list2)

    def __TF_IDF_add(self, tf, df, alpha):
        """Additive scheme: ``alpha * (N / df) + (1 - alpha) * tf``."""
        idf = self.indexer.document_num / df
        return idf * alpha + (1 - alpha) * tf

    def __TF_IDF_mul(self, tf, df, a, b):
        """Multiplicative scheme: ``(N / df) ** a * tf ** b``."""
        idf = self.indexer.document_num / df
        return math.pow(idf, a) * math.pow(tf, b)

    def __TF_IDF_log(self, tf, df):
        """Logarithmic scheme: ``log(1 + tf) * log(N / df)``."""
        idf = math.log(self.indexer.document_num / df)
        return math.log(1 + tf) * idf

    def __vectorize(self, content):
        """Tokenize ``content`` and return ``{term: term frequency}``."""
        return dict(Counter(self.indexer.generate_tokens(content)))

    def __weights(self, posting_list):
        """Return ``{term: TF-IDF weight}`` for the indexed terms of ``posting_list``."""
        weights = {}
        for term, tf in posting_list.items():
            df = self.indexer.get_term_DF(term)
            if df > 0:
                weights[term] = self.__TF_IDF(tf, df)
        return weights

    @staticmethod
    def __rank(document_scores):
        """Return ``(docID, score)`` pairs sorted by descending score."""
        return sorted(document_scores.items(), key=lambda ds: ds[1], reverse=True)

    @staticmethod
    def __normalize(document_scores, document_norms, query_norm):
        """Divide each score by its document norm times the query norm, in place.

        A zero denominator (possible with the log scheme when idf is 0)
        yields a score of 0.0 rather than a ZeroDivisionError.
        """
        query_norm = math.sqrt(query_norm)
        for docID in document_scores:
            denominator = math.sqrt(document_norms[docID]) * query_norm
            document_scores[docID] = (document_scores[docID] / denominator
                                      if denominator else 0.0)

    def __score_tf_idf(self, posting_list1, posting_list2):
        """Dot product of the TF-IDF vectors of two posting lists."""
        weights1 = self.__weights(posting_list1)
        weights2 = self.__weights(posting_list2)
        return sum(weight1 * weights2[term]
                   for term, weight1 in weights1.items()
                   if term in weights2)

    def __score_normalized_tf_idf(self, posting_list1, posting_list2):
        """Cosine of the TF-IDF vectors of two posting lists (0.0 if a norm is 0)."""
        weights1 = self.__weights(posting_list1)
        weights2 = self.__weights(posting_list2)
        norm1 = sum(weight * weight for weight in weights1.values())
        norm2 = sum(weight * weight for weight in weights2.values())
        if norm1 == 0 or norm2 == 0:
            return 0.0
        score = sum(weight1 * weights2[term]
                    for term, weight1 in weights1.items()
                    if term in weights2)
        return score / (math.sqrt(norm1) * math.sqrt(norm2))

    def __documents_score(self, query_posting_list):
        """Rank documents by the TF-IDF dot product with the query."""
        document_scores = Counter()
        for term, query_tf in query_posting_list.items():
            df = self.indexer.get_term_DF(term)
            if df == 0:
                continue
            query_tf_idf = self.__TF_IDF(query_tf, df)
            for docID, tf in self.indexer.dictionary.get(term, {}).items():
                document_scores[docID] += query_tf_idf * self.__TF_IDF(tf, df)
        return self.__rank(document_scores)

    def __documents_normalized_score(self, query_posting_list):
        """Rank documents by a normalized TF-IDF dot product with the query.

        NOTE: the document norm is accumulated only over the terms shared
        with the query; the ``_2`` variant uses all indexed terms.
        """
        document_scores = Counter()
        document_norms = Counter()
        query_norm = 0

        for term, query_tf in query_posting_list.items():
            df = self.indexer.get_term_DF(term)
            if df == 0:
                continue
            query_tf_idf = self.__TF_IDF(query_tf, df)
            query_norm += query_tf_idf * query_tf_idf
            for docID, tf in self.indexer.dictionary.get(term, {}).items():
                tf_idf = self.__TF_IDF(tf, df)
                document_norms[docID] += tf_idf * tf_idf
                document_scores[docID] += query_tf_idf * tf_idf

        self.__normalize(document_scores, document_norms, query_norm)
        return self.__rank(document_scores)

    def __documents_normalized_score_2(self, query_posting_list):
        """Rank documents by the TF-IDF cosine, using full document norms.

        Scans the whole index on every call to compute the norms.
        """
        document_scores = Counter()
        document_norms = Counter()
        query_norm = 0

        for term, documents in self.indexer.dictionary.items():
            df = self.indexer.get_term_DF(term)
            if df == 0:
                continue
            for docID, tf in documents.items():
                tf_idf = self.__TF_IDF(tf, df)
                document_norms[docID] += tf_idf * tf_idf

        for term, query_tf in query_posting_list.items():
            df = self.indexer.get_term_DF(term)
            if df == 0:
                continue
            query_tf_idf = self.__TF_IDF(query_tf, df)
            query_norm += query_tf_idf * query_tf_idf
            for docID, tf in self.indexer.dictionary.get(term, {}).items():
                document_scores[docID] += query_tf_idf * self.__TF_IDF(tf, df)

        self.__normalize(document_scores, document_norms, query_norm)
        return self.__rank(document_scores)
        

**LSIModel**

In [12]:
import os, sys, pickle
import numpy
from gensim import corpora, models, similarities

In [13]:
class LSIModel:
    """Latent semantic indexing model built with gensim and cached on disk.

    The dictionary, bag-of-words corpus, LSI model, similarity index and the
    docID <-> row-number maps are stored below ``output_filename`` and reused
    on later runs unless regeneration is requested.
    """

    def __init__(self, corpus, preprocessor, output_filename='.\\lsi', num_topics=50):
        """Set up the file names and create the output directory.

        Args:
            corpus: object whose ``documents`` have ``docID`` and ``content``.
            preprocessor: object with ``process(text) -> list of tokens``.
            output_filename: directory for all cached artefacts. The default
                keeps its historical Windows-style spelling.
            num_topics: number of LSI topics.
        """
        self.num_topics = num_topics
        self.origin_corpus = corpus
        self.docs = []
        self.preprocessor = preprocessor
        dict_suffix = 'ohsu'
        corpus_suffix = 'ohsu'
        self.output_filename = output_filename
        # os.path.join keeps every artefact inside the output directory on
        # any platform (the paths used to mix '\\' and '/').
        self.dict_filename = os.path.join(output_filename, '%s.dict' % dict_suffix)
        self.corpus_filename = os.path.join(output_filename, '%s.mm' % corpus_suffix)
        self.lsi_filename = os.path.join(output_filename, '%s_%s.lsi' % (corpus_suffix, num_topics))
        self.index_filename = os.path.join(output_filename, '%s_%s.lsi.index' % (corpus_suffix, num_topics))
        self.doc2id_filename = os.path.join(output_filename, "%s.doc2id.pickle" % corpus_suffix)
        self.id2doc_filename = os.path.join(output_filename, "%s.id2doc.pickle" % corpus_suffix)
        self._create_directories()

    def _create_directories(self):
        """Create the output directory (and missing parents) if needed."""
        os.makedirs(self.output_filename, exist_ok=True)

    def _create_docs_dict(self, docs):
        """Build and persist the docID <-> row-number maps for ``docs``."""
        self.doc2id = dict(zip(docs, range(len(docs))))
        self.id2doc = dict(zip(range(len(docs)), docs))
        with open(self.doc2id_filename, "wb") as handle:
            pickle.dump(self.doc2id, handle)
        with open(self.id2doc_filename, "wb") as handle:
            pickle.dump(self.id2doc, handle)

    def _load_docs_dict(self):
        """Load the docID <-> row-number maps written by ``_create_docs_dict``."""
        # NOTE: pickle.load executes code from the file; only load files this
        # class wrote itself.
        with open(self.doc2id_filename, 'rb') as handle:
            self.doc2id = pickle.load(handle)
        with open(self.id2doc_filename, 'rb') as handle:
            self.id2doc = pickle.load(handle)

    def _generate_dictionary(self):
        """Build the gensim dictionary from the corpus and save it."""
        print("generating dictionary...")
        documents = [self.preprocessor.process(document.content)
                     for document in self.origin_corpus.documents]
        self.dictionary = corpora.Dictionary(documents)
        self.dictionary.save(self.dict_filename)

    def _load_dictionary(self, regenerate=False):
        """Load the saved dictionary, or build it if missing or ``regenerate``."""
        if not os.path.exists(self.dict_filename) or regenerate is True:
            self._generate_dictionary()
        else:
            self.dictionary = corpora.Dictionary.load(self.dict_filename)

    def _generate_corpus(self):
        """Vectorize every document, persist the corpus and the docID maps."""
        print("generating corpus...")
        self.corpus = []
        # Start from an empty docID list so regenerating does not duplicate ids.
        self.docs = []
        count = 0
        for vector in self._vectorize_corpus(self.origin_corpus, self.dictionary):
            self.corpus.append(vector)
            count += 1
            if count % 10000 == 0:
                print("%d vectors processed" % count)
        self._create_docs_dict(self.docs)
        corpora.MmCorpus.serialize(self.corpus_filename, self.corpus)

    def _vectorize_corpus(self, corpus, dictionary):
        """Yield the bag-of-words vector of each document of ``corpus``.

        Appends each document's ``docID`` to ``self.docs`` as a side effect.
        """
        for document in corpus.documents:
            tokens = self.preprocessor.process(document.content)
            self.docs.append(document.docID)
            yield dictionary.doc2bow(tokens)

    def _vectorize(self, content):
        """Return the LSI vector of a free-text ``content``."""
        tokens = self.preprocessor.process(content)
        bow = self.dictionary.doc2bow(tokens)
        return self.lsi[bow]

    def _load_corpus(self, regenerate=False):
        """Load the saved corpus, or build it if anything it needs is missing."""
        artefacts = (self.corpus_filename, self.doc2id_filename, self.id2doc_filename)
        # The docID maps are written together with the corpus; rebuild when
        # any of them is absent so load() cannot fail on a half-written cache.
        if regenerate is True or not all(os.path.exists(path) for path in artefacts):
            self._generate_corpus()
        else:
            self.corpus = corpora.MmCorpus(self.corpus_filename)

    def _generate_lsi_model(self, regenerate=False):
        """Build whichever of the LSI model / similarity index is missing."""
        print("generating lsi models...")
        if not os.path.exists(self.lsi_filename) or regenerate is True:
            self.lsi = models.LsiModel(self.corpus, id2word=self.dictionary, num_topics=self.num_topics)
            self.lsi.save(self.lsi_filename)
            self.index = similarities.MatrixSimilarity(self.lsi[self.corpus])
            self.index.save(self.index_filename)
        elif not os.path.exists(self.index_filename):
            self.lsi = models.LsiModel.load(self.lsi_filename)
            self.index = similarities.MatrixSimilarity(self.lsi[self.corpus])
            self.index.save(self.index_filename)
        else:
            # Both artefacts exist: load them so the method never returns
            # without self.lsi / self.index being set.
            self.lsi = models.LsiModel.load(self.lsi_filename)
            self.index = similarities.MatrixSimilarity.load(self.index_filename)

    def _load_lsi_model(self, regenerate=False):
        """Load the saved LSI model and index, or build them."""
        if os.path.exists(self.lsi_filename) and os.path.exists(self.index_filename) and regenerate is False:
            self.lsi = models.LsiModel.load(self.lsi_filename)
            self.index = similarities.MatrixSimilarity.load(self.index_filename)
        else:
            self._generate_lsi_model(regenerate)

    def load(self, regenerate=False):
        """Load (or build) dictionary, corpus, LSI model, index and docID maps."""
        self._load_dictionary(regenerate)
        self._load_corpus(regenerate)
        self._load_lsi_model(regenerate)
        self._load_docs_dict()

    def _get_vector(self, doc):
        """Return the bag-of-words vector of document ``doc``, or None if unknown."""
        vec_bow = None
        try:
            vec_bow = self.corpus[self.doc2id[doc]]
        except KeyError:
            print("Document '%s' does not exist. Have you used the proper string cleaner?" % doc)
        return vec_bow

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, similarity)`` pairs for a text query.

        ``normalized`` is accepted for signature parity with the other models
        and is not used.
        """
        query_vector = self._vectorize(query)
        query_sims = self.index[query_vector]
        query_sims = sorted(enumerate(query_sims), key=lambda item: -item[1])[:topK]
        return [(self.id2doc[docid], weight) for docid, weight in query_sims]

    def get_similars(self, doc, num_sim=20):
        """Return the ``num_sim`` documents most similar to document ``doc``.

        The top-ranked hit is skipped (presumably the document itself).
        """
        vec_bow = self._get_vector(doc)
        if vec_bow is None:
            return []
        vec_lsi = self.lsi[vec_bow]
        sims = self.index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])[1:num_sim+1]
        return [(self.id2doc[docid], weight) for docid, weight in sims]

    @staticmethod
    def _sparse_cosine(vec1, vec2):
        """Cosine of two sparse ``[(index, value), ...]`` vectors (0.0 if a norm is 0)."""
        weights1 = dict(vec1)
        weights2 = dict(vec2)
        dot = sum(value * weights2.get(index, 0.0) for index, value in weights1.items())
        norm1 = math.sqrt(sum(value * value for value in weights1.values()))
        norm2 = math.sqrt(sum(value * value for value in weights2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)

    def get_pairwise_similarity(self, doc1, doc2):
        """Return the cosine similarity of two documents in LSI space.

        Returns None if either document is unknown. The vectors are matched
        by topic index, so sparse results of different length compare
        correctly.
        """
        vec_bow1 = self._get_vector(doc1)
        vec_bow2 = self._get_vector(doc2)
        if vec_bow1 is None or vec_bow2 is None:
            return None
        return self._sparse_cosine(self.lsi[vec_bow1], self.lsi[vec_bow2])
        

**TS_SSModel**

In [14]:
import math
import numpy as np

In [15]:
class TS_SS:
    """Vector similarity measures: cosine, Euclidean, triangle area (TS),
    sector area (SS) and their product (TS-SS, via ``__call__``).

    ``Theta`` is expressed in radians throughout; ``Sector`` converts it to
    degrees for its ``/ 360`` fraction of a circle.
    """

    def Cosine(self, vec1: np.ndarray, vec2: np.ndarray):
        """Cosine of the angle between the vectors (0.0 if either has zero length)."""
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return np.dot(vec1, vec2.T) / denominator

    def VectorSize(self, vec: np.ndarray):
        """Euclidean length of ``vec``."""
        return np.linalg.norm(vec)

    def Euclidean(self, vec1: np.ndarray, vec2: np.ndarray):
        """Euclidean distance between the vectors."""
        return np.linalg.norm(vec1 - vec2)

    def Theta(self, vec1: np.ndarray, vec2: np.ndarray):
        """Angle between the vectors plus a 10-degree offset, in radians."""
        # Rounding can push the cosine marginally outside [-1, 1], for which
        # arccos returns NaN; clip it back into the valid domain.
        cosine = np.clip(self.Cosine(vec1, vec2), -1.0, 1.0)
        return np.arccos(cosine) + np.radians(10)

    def Triangle(self, vec1: np.ndarray, vec2: np.ndarray):
        """Area of the triangle spanned by the vectors with angle ``Theta``."""
        # Theta is already in radians, so it is passed to sin() unchanged.
        theta = self.Theta(vec1, vec2)
        return (self.VectorSize(vec1) * self.VectorSize(vec2) * np.sin(theta)) / 2

    def Magnitude_Difference(self, vec1: np.ndarray, vec2: np.ndarray):
        """Absolute difference of the vector lengths."""
        return abs(self.VectorSize(vec1) - self.VectorSize(vec2))

    def Sector(self, vec1: np.ndarray, vec2: np.ndarray):
        """Area of the circular sector of radius ``ED + MD`` and angle ``Theta``."""
        ED = self.Euclidean(vec1, vec2)
        MD = self.Magnitude_Difference(vec1, vec2)
        # The fraction of the circle is degrees / 360.
        theta_degrees = np.degrees(self.Theta(vec1, vec2))
        return math.pi * (ED + MD)**2 * theta_degrees/360

    def __call__(self, vec1: np.ndarray, vec2: np.ndarray):
        """TS-SS measure: triangle area times sector area."""
        return self.Triangle(vec1, vec2) * self.Sector(vec1, vec2)

In [16]:
class TS_SSModel:
    """Rank indexed documents against a free-text query with a TS_SS measure.

    Documents and queries are represented as sparse ``{term: weight}`` bags
    whose weights are ``log(1 + tf) * log(document_num / df)``.  For every
    document the query bag and the document bag are expanded to dense vectors
    over the union of their terms and compared with the selected measure.

    Parameters
    ----------
    corpus : stored as ``origin_corpus``; not otherwise used by this class.
    indexer : object exposing ``dictionary`` (term -> {docID: tf}) and
        ``document_num``.
    preprocessor : object whose ``process(text)`` yields the query tokens.
    measure : one of ``'Cosine'``, ``'ED'``, ``'TS'``, ``'SS'``, ``'TS-SS'``.

    Raises
    ------
    ValueError
        If ``measure`` is not one of the supported names.
    """

    # Names accepted for ``measure``; checked before anything else is built.
    _MEASURES = ('Cosine', 'ED', 'TS', 'SS', 'TS-SS')

    def __init__(self, corpus, indexer, preprocessor, measure='Cosine'):
        if measure not in self._MEASURES:
            # Previously an unknown name left ``self.SM`` unset and only
            # failed later, inside ``search``.
            raise ValueError("unknown measure %r; expected one of %s"
                             % (measure, ', '.join(self._MEASURES)))
        self.origin_corpus = corpus
        self.indexer = indexer
        self.preprocessor = preprocessor
        self.measure_type = measure
        model = TS_SS()
        self.SM = {
            'Cosine': model.Cosine,
            'ED': model.Euclidean,
            'TS': model.Triangle,
            'SS': model.Sector,
            'TS-SS': model,
        }[measure]
        print("building corpus...")
        self.__build_corpus()

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, score)`` pairs, best match first.

        ``normalized`` is accepted but not used by this model.  An empty list
        is returned when none of the query terms occur in the index, because
        an all-zero query vector cannot be compared meaningfully.
        """
        query_bow = self.__get_bow(query)
        if not query_bow:
            return []
        return self.__document_scores(query_bow)[:topK]

    def __build_corpus(self):
        """Invert the index into one ``{term: weight}`` bag per document."""
        self.doc_bows = {}
        for term, posting_list in self.indexer.dictionary.items():
            df = len(posting_list)
            for docID, tf in posting_list.items():
                self.doc_bows.setdefault(docID, {})[term] = self.__TF_IDF(tf, df)

    def __get_bow(self, text):
        """Return the weighted bag of the indexed terms found in ``text``."""
        counts = {}
        for token in self.preprocessor.process(text):
            counts[token] = counts.get(token, 0) + 1

        bow = {}
        for term, tf in counts.items():
            posting_list = self.indexer.dictionary.get(term)
            if not posting_list:
                # Term never seen while indexing: it has no document
                # frequency, so it cannot be weighted and is ignored.
                continue
            bow[term] = self.__TF_IDF(tf, len(posting_list))
        return bow

    def __vectorize(self, query_bow, doc_bow):
        """Expand both bags to aligned dense vectors over their term union."""
        terms = set(query_bow) | set(doc_bow)
        # Iterating the same set twice yields the same order, which keeps the
        # two vectors aligned component by component.
        query_vector = np.array([query_bow.get(term, 0) for term in terms])
        doc_vector = np.array([doc_bow.get(term, 0) for term in terms])
        return query_vector, doc_vector

    def __TF_IDF(self, tf, df):
        """Return ``log(1 + tf) * log(document_num / df)``."""
        idf = math.log(self.indexer.document_num / df)
        return math.log(1 + tf) * idf

    def __document_scores(self, query_bow):
        """Score every document and sort according to the measure's direction."""
        document_scores = [
            (docID, self.SM(*self.__vectorize(query_bow, doc_bow)))
            for docID, doc_bow in self.doc_bows.items()
        ]
        # Cosine is a similarity (higher is better); every other measure is
        # treated as a distance (lower is better).
        document_scores.sort(key=lambda ds: ds[1],
                             reverse=(self.measure_type == 'Cosine'))
        return document_scores
        

**WMDModel**

In [17]:
import os
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import euclidean_distances
from pyemd import emd

In [18]:
class WMDModel:
    """Rank documents by Word Mover's Distance to a query.

    Texts are turned into normalised bag-of-words vectors over the indexed
    terms that also exist in a pre-trained Word2Vec model; the earth mover's
    distance between two such vectors, using pairwise Euclidean distances of
    the word embeddings as ground distance, is the score (lower is closer).

    Parameters
    ----------
    corpus : object whose ``documents`` iterates items with ``docID`` and
        ``content`` attributes.
    indexer : object exposing ``dictionary`` (its keys are the candidate
        vocabulary).
    preprocessor : stored on the instance; not used by this class.
    model_path : optional path of the Word2Vec model to load.  Defaults to
        ``DEFAULT_MODEL_PATH``.
    """

    # Location the model was originally loaded from; kept as the default so
    # existing three-argument calls behave exactly as before.
    DEFAULT_MODEL_PATH = "C:\\Users\\dell\\Desktop\\DocumentSimilarity\\Word_Mover_Distance-master\\data\\word_model.mod"

    def __init__(self, corpus, indexer, preprocessor, model_path=None):
        self.corpus = corpus
        self.indexer = indexer
        self.preprocessor = preprocessor
        self.model_path = self.DEFAULT_MODEL_PATH if model_path is None else model_path
        self.model = Word2Vec.load(self.model_path)

    def build(self):
        """Build the vocabulary, the document vectors and the word distances."""
        self.__build_vocabulary()
        self.__vectorize_corpus()
        self.__calculate_word_distance()

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, distance)`` pairs, closest first.

        ``normalized`` is accepted but not used by this model.  An empty list
        is returned when the query contains no vocabulary word, since there is
        no mass to transport.
        """
        query_vector = self._vectorize(query)
        if not query_vector.any():
            return []
        return self.__documents_score(query_vector)[:topK]

    @staticmethod
    def _normalize_counts(counts):
        """Return ``counts`` as float64 scaled to sum to 1.

        An all-zero vector is returned unchanged instead of being divided by
        zero (which would fill it with NaN).
        """
        weights = counts.astype(np.double)
        total = weights.sum()
        if total > 0:
            weights /= total
        return weights

    def _vectorize(self, content):
        """Return the normalised bag-of-words vector of ``content``."""
        row = self.vectorizer.transform([content])[0]
        return self._normalize_counts(row.toarray().ravel())

    def __build_vocabulary(self):
        """Keep the indexed terms that the Word2Vec model has a vector for."""
        print("building vocabulary...")
        self.vocabulary = [word for word, _ in self.indexer.dictionary.items()
                           if word in self.model.wv.vocab]

    def __vectorize_corpus(self):
        """Vectorise every document and remember the matching docIDs."""
        print("vectorizing documents...")
        self.vectorizer = CountVectorizer(vocabulary=self.vocabulary)
        self.docIDs = []
        docs = []
        for document in self.corpus.documents:
            self.docIDs.append(document.docID)
            docs.append(document.content)
        self.corpus_vector = [self._normalize_counts(row.toarray().ravel())
                              for row in self.vectorizer.transform(docs)]

    def __calculate_word_distance(self):
        """Precompute the scaled pairwise distance of all vocabulary words."""
        # Every feature was already checked against ``model.wv.vocab`` when
        # the vocabulary was built.  Looking each one up unconditionally keeps
        # row i of the matrix aligned with feature i of the vectorizer.
        W = np.array([self.model.wv[w] for w in self.vectorizer.get_feature_names()])
        self.words_distance = euclidean_distances(W).astype(np.double)
        largest = self.words_distance.max()
        if largest > 0:
            self.words_distance /= largest

    def __documents_score(self, query_vector):
        """Return ``(docID, distance)`` for every scorable document, ascending."""
        # NOTE(review): ``from pyemd import emd`` appears to bind the function
        # itself, in which case ``emd.emd`` does not exist.  Accept either
        # binding so the call works whether ``emd`` is a module or a function.
        compute_emd = getattr(emd, 'emd', emd)
        query_support = np.flatnonzero(query_vector)  # same for every document
        documents_score = []
        for docID, doc_vector in zip(self.docIDs, self.corpus_vector):
            doc_support = np.flatnonzero(doc_vector)
            if doc_support.size == 0:
                # Document without any vocabulary word: no distribution to
                # compare against, so it is left out of the ranking.
                continue
            support = np.union1d(query_support, doc_support)
            # Restrict the problem to the words that carry mass on either side.
            D = np.ascontiguousarray(self.words_distance[:, support][support])
            score = compute_emd(query_vector[support], doc_vector[support], D)
            documents_score.append((docID, score))

        documents_score.sort(key=lambda ds: ds[1], reverse=False)
        return documents_score
        

**LDAModel**

In [19]:
from sklearn.neighbors import LSHForest

In [20]:
class LDAModel:
    """Rank documents by nearest-neighbour search over LDA topic vectors.

    Every document is tokenised, converted to a bag of words and mapped to
    its LDA topic distribution; the distributions are indexed in an
    ``LSHForest`` and a query is answered by looking up its nearest
    neighbours (smaller distance is closer).

    Parameters
    ----------
    corpus : object whose ``documents`` iterates items with ``docID`` and
        ``content`` attributes.
    preprocesser : object whose ``process(text)`` returns a token list.  The
        spelling of the parameter name is kept so existing keyword callers
        continue to work.
    num_topics : number of LDA topics.
    """

    def __init__(self, corpus, preprocesser, num_topics=20):
        self.origin_corpus = corpus
        # Use the argument that was passed in.  The body used to read the
        # differently spelled name ``preprocessor`` and therefore silently
        # picked up a notebook-level global instead.
        self.preprocessor = preprocesser
        self.num_topics = num_topics
        self.docID = []
        self.dictionary = None
        self.corpus = None

    def build(self):
        """Tokenise the corpus, train the LDA model and index the documents."""
        print("build corpus...")
        self.__build_corpus()
        print("build model...")
        self.__build_model()
        print("vectorize documents...")
        self.__vectorize_corpus()

    def search(self, query, topK=20, normalized=False):
        """Return up to ``topK`` ``(docID, distance)`` pairs, closest first.

        ``normalized`` is accepted but not used by this model.
        """
        query_vector = self.__vectorize(query)
        return self.__document_scores(query_vector, topK)

    def __topic_vector(self, bow):
        """Return the probability of every topic for a bag of words."""
        # minimum_probability=0.0 asks for all topics, which keeps every
        # vector at the same length.
        return [probability for _topic, probability
                in self.model.get_document_topics(bow, minimum_probability=0.0)]

    def __vectorize(self, text):
        """Return the topic vector of a raw text."""
        tokens = self.preprocessor.process(text)
        return self.__topic_vector(self.dictionary.doc2bow(tokens))

    def __build_corpus(self):
        """Tokenise every document and build the dictionary / BoW corpus."""
        # Start from empty lists so that calling build() again does not append
        # a second copy of every docID.
        self.docID = []
        self.texts = []
        for document in self.origin_corpus.documents:
            self.docID.append(document.docID)
            self.texts.append(self.preprocessor.process(document.content))
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

    def __build_model(self):
        """Train the LDA model on the BoW corpus."""
        self.model = models.ldamodel.LdaModel(self.corpus,
                                              id2word=self.dictionary,
                                              num_topics=self.num_topics)

    def __vectorize_corpus(self):
        """Compute every document's topic vector and fit the LSH forest."""
        self.lsh = LSHForest(n_estimators=200,
                             n_neighbors=self.num_topics)
        # self.corpus already holds doc2bow(text) for every text, in order.
        self.vectorized_docs = [self.__topic_vector(bow) for bow in self.corpus]
        self.lsh.fit(self.vectorized_docs)

    def __document_scores(self, query_vector, topK=None):
        """Return the nearest documents as ``(docID, distance)``, ascending.

        ``topK`` limits the number of neighbours; ``None`` falls back to
        ``num_topics``, the count the forest was configured with.
        """
        wanted = self.num_topics if topK is None else topK
        # Never ask for more neighbours than there are indexed documents.
        wanted = min(wanted, len(self.docID))
        if wanted <= 0:
            return []
        distances, indices = self.lsh.kneighbors([query_vector], n_neighbors=wanted)
        document_scores = [(self.docID[index], distance)
                           for distance, index in zip(distances[0], indices[0])]
        document_scores.sort(key=lambda ds: ds[1])
        return document_scores

In [21]:
class Corpus_2:
    """Document collection that can be looked up by ``docID``."""

    def __init__(self, extractor):
        """Remember the extractor that will supply the documents."""
        self.extractor = extractor

    def build(self, path):
        """Read ``path`` through the extractor and index documents by docID.

        ``self.documents`` is replaced by a fresh dict; when several extracted
        documents share a docID the last one wins.
        """
        by_id = {}
        self.documents = by_id
        by_id.update((doc.docID, doc) for doc in self.extractor.extract(path))

In [22]:
# Location of the OHSUMED collection file.  Set the OHSUMED_FILE environment
# variable to run the notebook on another machine; the fallback is the path
# the notebook was originally written against.
document_file = os.environ.get(
    'OHSUMED_FILE',
    'G:\\dataset\\corpus\\OHSUMED\\ohsu-trec\\trec9-train\\ohsumed.87')

In [23]:
# Load the collection once; every indexer below is built from this corpus.
extractor = OHSUMED_Extractor()
corpus = Corpus(extractor)
corpus.build(document_file)
# A single preprocessor instance is shared by all indexers (and, further
# down, by the ranking models).
preprocessor = Preprocessor()
# One index per document field.  `indexer` (content) is the one the ranking
# models use; the other four only back the per-field boolean models.
print('indexing content...')
indexer = Indexer(preprocessor, 'content')
indexer.index(corpus)
print('indexing authors...')
indexer_authors = Indexer(preprocessor, 'authors')
indexer_authors.index(corpus)
print('indexing title...')
indexer_title = Indexer(preprocessor, 'title')
indexer_title.index(corpus)
print('indexing keywords...')
indexer_keywords = Indexer(preprocessor, 'keywords')
indexer_keywords.index(corpus)
print('indexing publication...')
indexer_publication = Indexer(preprocessor, 'publication')
indexer_publication.index(corpus)

indexing content...
indexing authors...
indexing title...
indexing keywords...
indexing publication...


In [24]:
# Second view of the same file: Corpus_2 stores the documents in a dict keyed
# by docID, which is what search() uses to turn a hit back into a document.
corpus_2 = Corpus_2(extractor)
corpus_2.build(document_file)

In [25]:
# One boolean model per field index; query_document() picks among them by
# field name.
boolean_model = BooleanModel(indexer)
boolean_model_authors = BooleanModel(indexer_authors)
boolean_model_title = BooleanModel(indexer_title)
boolean_model_keywords = BooleanModel(indexer_keywords)
boolean_model_publication = BooleanModel(indexer_publication)

In [26]:
# Ranking models over the content field.  query_document() refers to these
# names, so all of them must exist before it is called.
tf_model = TFModel(corpus, indexer)
args = 'log'
tf_idf_model = TF_IDFModel(corpus, indexer, args)
# TS_SSModel builds its per-document vectors inside the constructor.
ts_ss_model = TS_SSModel(corpus, indexer, preprocessor, measure='TS-SS')
# NOTE(review): presumably load() restores a previously saved model from the
# ".\\lsi" location instead of retraining - confirm against LSIModel.
lsi_model = LSIModel(corpus, preprocessor, ".\\lsi")
lsi_model.load()
# LDA and WMD are constructed first and then trained/indexed by build().
lda_model = LDAModel(corpus, preprocessor)
lda_model.build()
wmd_model = WMDModel(corpus, indexer, preprocessor)
wmd_model.build()

building corpus...
build corpus...
build model...
vectorize documents...




building vocabulary...
vectorizing documents...




In [27]:
def search(model, query, topK=20):
    """Query ``model`` and attach the stored document to every hit.

    ``model.search`` is called with ``topK=topK`` and ``normalized=True`` and
    must yield ``(docID, similarity)`` pairs.  Each pair is extended with the
    entry found under that docID in the notebook-level ``corpus_2.documents``.

    Returns a list of ``(docID, similarity, document)`` tuples in the order
    produced by the model.
    """
    hits = model.search(query, topK=topK, normalized=True)
    return [(doc_id, score, corpus_2.documents[doc_id]) for doc_id, score in hits]

In [39]:
def show(document_tuples, query, field='content'):
    """Print a numbered, ANSI-coloured listing of search hits.

    ``document_tuples`` holds ``(docID, similarity, document)`` triples.  For
    each one a header line with the rank, docID and score is printed, followed
    by title, authors, keywords, abstract and publication.  The text of the
    section named by ``field`` is passed through ``mark`` together with
    ``query`` before printing; any other ``field`` value prints everything
    unmarked.
    """
    markable = ('content', 'title', 'keywords', 'authors', 'publication')
    for rank, (docID, similarity, document) in enumerate(document_tuples, start=1):
        header = ('] docID: \033[34m%d\t\033[0msimilarity/distance: \033[32m%.5f\033[0m'
                  % (int(docID), float(similarity)))
        print('[' + str(rank) + header)
        sections = {
            'content': document.content,
            'title': document.title,
            # The attribute really is read as "keyswords" here.
            'keywords': '; '.join(document.keyswords),
            'authors': '; '.join(document.authors),
            'publication': ' '.join(document.publication),
        }
        if field in markable:
            sections[field] = mark(sections[field], query)
        print('    Title: \033[4m%s\033[0m\n' % sections['title'])
        print('    Authors: ' + sections['authors'])
        print('    Keywords: ' + sections['keywords'])
        print('    Abstract: ' + sections['content'])
        print('    Publication: ' + sections['publication'])

In [29]:
def is_similarity(word1, word2):
    """Tell whether two words are made of nearly the same characters.

    The words are compared as character sets; the result is ``True`` when the
    Jaccard coefficient (shared characters / all characters) is strictly
    greater than 0.8.
    """
    chars1, chars2 = set(word1), set(word2)
    shared = chars1 & chars2
    combined = chars1 | chars2
    return len(shared) / len(combined) > 0.8

In [30]:
def mark(content, query):
    """Highlight the words of ``content`` that resemble a word of ``query``.

    Both texts are split on whitespace.  A content word is wrapped in the
    bold-red ANSI sequence when ``is_similarity`` accepts it against at least
    one query word.  Every word, highlighted or not, is followed by a single
    space in the returned string.
    """
    pieces = []
    for token in content.split():
        if any(is_similarity(token, candidate) for candidate in query.split()):
            pieces.append("\033[1;31m" + token + "\033[0m ")
        else:
            pieces.append(token + " ")
    return ''.join(pieces)

In [31]:
def query_document(query, field='content', model_type='boolean', topK=20):
    """Search one field of the collection and print the hits.

    Parameters
    ----------
    query : the query text.
    field : ``'content'``, ``'authors'``, ``'keywords'``, ``'title'`` or
        ``'publication'``.
    model_type : only consulted when ``field`` is ``'content'``; one of
        ``'boolean'``, ``'tf'``, ``'tf-idf'``, ``'ts-ss'``, ``'lsi'``,
        ``'lda'`` or ``'wmd'``.  All other fields always use their boolean
        model.
    topK : maximum number of hits requested from the model.

    Raises
    ------
    ValueError
        If ``field`` or (for the content field) ``model_type`` is not one of
        the supported names.  Previously this surfaced as an
        ``UnboundLocalError`` on ``model``.
    """
    if field == 'content':
        if model_type == 'boolean':
            model = boolean_model
        elif model_type == 'tf':
            model = tf_model
        elif model_type == 'tf-idf':
            model = tf_idf_model
        elif model_type == 'ts-ss':
            model = ts_ss_model
        elif model_type == 'lsi':
            model = lsi_model
        elif model_type == 'lda':
            model = lda_model
        elif model_type == 'wmd':
            model = wmd_model
        else:
            raise ValueError("unsupported model_type %r for field 'content'" % (model_type,))
    elif field == 'authors':
        model = boolean_model_authors
    elif field == 'keywords':
        model = boolean_model_keywords
    elif field == 'title':
        model = boolean_model_title
    elif field == 'publication':
        model = boolean_model_publication
    else:
        raise ValueError("unsupported field %r" % (field,))

    show(search(model, query, topK), query, field)

In [42]:
# Free-text query used by the search call in the next cell.
query = " Although certain gold [Au(I)] compounds have been used effectively in the treatment of rheumatoid arthritis for some years, the molecular basis for such therapeutic action has been unclear. One possible mechanism of the action of Au(I) compounds is that they"

In [49]:
# Search the content field with the WMD model and print the hits.
query_document(query, field='content', model_type='wmd')

[1] docID: [34m87177966	[0msimilarity/distance: [32m0.15969[0m
    Title: [4mAntiarthritic gold compounds effectively quench electronically excited singlet oxygen.[0m

    Authors: Corey EJ; Mehrotra MM; Khan AU
    Keywords: Arthritis, Rheumatoid; Auranofin; Chemistry, Physical; Human; Kinetics; Lipid Peroxides; Oxygen; Support, Non-U.S. Gov't; Support, U.S. Gov't, P.H.S
    Abstract: [1;31mAlthough[0m [1;31mcertain[0m [1;31mgold[0m [1;31m[Au(I)][0m [1;31mcompounds[0m [1;31mhave[0m [1;31mbeen[0m [1;31mused[0m [1;31meffectively[0m [1;31min[0m [1;31mthe[0m [1;31mtreatment[0m [1;31mof[0m [1;31mrheumatoid[0m [1;31marthritis[0m [1;31mfor[0m [1;31msome[0m [1;31myears,[0m [1;31mthe[0m [1;31mmolecular[0m [1;31mbasis[0m [1;31mfor[0m [1;31msuch[0m [1;31mtherapeutic[0m [1;31maction[0m [1;31mhas[0m [1;31mbeen[0m [1;31munclear.[0m [1;31mOne[0m [1;31mpossible[0m [1;31mmechanism[0m [1;31mof[0m [1;31mthe[0m [1;31maction[0m [1;