### Process Stop Words

In [1]:
# Comma-separated stop words; some entries carry stray spaces around the
# commas, which the strip() below removes.
string = """a , at, are, for, of, I , is, there, then, many, do, to, and, by, the, not, have, with, while"""

# Split on commas and trim the whitespace around every entry.
stop_words = list(map(str.strip, string.split(',')))

# Wrap each entry in "---" so any leftover whitespace would be visible.
for stop_word in stop_words:
    print("---" + stop_word + "---")

---a---
---at---
---are---
---for---
---of---
---I---
---is---
---there---
---then---
---many---
---do---
---to---
---and---
---by---
---the---
---not---
---have---
---with---
---while---


### Split Corpus into Lines and Check for Whitespace

In [2]:
# Raw corpus text: documents are separated by ";" and each document starts
# with "<title>: ". Kept under its own name so `corpus` is only ever a list.
raw_corpus = """
d1: for English model retrieval have a relevance model while vector space
model retrieval do not;
d2: The R-precision measure is relevant to average precision measure.;
d3: The most efficient retrieval models are language model and vector space
model.;
d4: The English language is the most efficient language.;
d5: Retrieval efficiency is measured by the average precision of the
retrieval model.
"""

# Hand-written stem table: lowercase word form -> stem.
stems = {
    "models":"model",
    "r-precision":"precis",
    "precision":"precis",
    "precise":"precis",
    "efficient":"effic",
    "efficiency":"effic",
    #"recall":"retrieval",
    "relevant":"relevan",
    "relevance":"relevan",
    "measured":"measure",
}

# Trim every fragment BEFORE testing it for emptiness, so that a fragment made
# only of whitespace (e.g. what follows a trailing ";") is dropped instead of
# becoming an empty document.
corpus = [doc for doc in (fragment.strip() for fragment in raw_corpus.split(";")) if doc]

# Check no blank lines
# I often surround things with "---" to check for whitespace
for c in corpus:
    print("---" + c + "---")

corpus_length = len(corpus)

---d1: for English model retrieval have a relevance model while vector space
model retrieval do not---
---d2: The R-precision measure is relevant to average precision measure.---
---d3: The most efficient retrieval models are language model and vector space
model.---
---d4: The English language is the most efficient language.---
---d5: Retrieval efficiency is measured by the average precision of the
retrieval model.---


# Process Text

In [3]:
from collections import namedtuple

# Lightweight record for one parsed document: its title (e.g. "d1") and the
# keywords extracted from its text.
Document = namedtuple('Document', 'title keywords')

In [4]:
def stem(word):
    """
        Return the stem of a word.

        Trailing punctuation is removed with remove_punctuation() and the
        cleaned word is looked up in the notebook-level `stems` table.
        A word without an entry in the table is returned as cleaned.

        The lookup is case-sensitive and the table keys are lowercase, so
        callers are expected to pass lowercase words.
    """
    word = remove_punctuation(word)
    # Look the word up in the shared `stems` table defined next to the corpus,
    # so there is a single table to maintain and no dict is built per call.
    return stems.get(word, word)

In [5]:
def process_text(sentence):
    '''
        Turn a sentence into its set of keywords by:
            - making lowercase
            - splitting on whitespace
            - removing punctuation from end of word
            - removing stop words
            - stemming what is left
            - removing empty strings
            - removing duplicate words with set
    '''
    # The text is lowercased, so the stop words must be compared in lowercase
    # too; otherwise an entry such as "I" could never match.
    lowered_stop_words = set(s.lower() for s in stop_words)

    keywords = set()
    for token in sentence.lower().split():
        # Strip punctuation BEFORE the stop word test so that a stop word
        # followed by punctuation (e.g. "the.") is still filtered out.
        word = remove_punctuation(token)

        # Skip tokens that were punctuation only, and stop words.
        if not word or word in lowered_stop_words:
            continue

        keywords.add(stem(word))

    # stem() may still reduce a word to nothing; never report that as a keyword.
    keywords.discard("")
    return keywords
    
def remove_punctuation(word):
    '''
        Strip every trailing non-alphabetic character from a word.

        Only the end of the word is touched, so inner characters such as the
        hyphen in "r-precision" are kept. An empty string is returned
        unchanged, and a word made only of non-alphabetic characters becomes
        the empty string.

        Assumption: no numbers (trailing digits are stripped as well).
    '''
    end = len(word)

    # Walk back from the end until an alphabetic character is found; the
    # `end > 0` guard makes the empty string safe.
    while end > 0 and not word[end - 1].isalpha():
        end -= 1

    return word[:end]

In [6]:
words_list = []
words_set = set()
documents = []

for c in corpus:
    doc, text = c.split(": ")
    keywords = process_text(text)
    
    current_document = Document(title = doc.strip(), keywords=keywords)
    
    documents.append(current_document)
    print "title", current_document.title, "keywords", current_document.keywords
    
    for kw in keywords:
        if not kw in words_set:
            words_list.append(kw)
            words_set.add(kw)
            
print words_list



title d1 keywords set(['space', 'relevan', 'vector', 'english', 'model', 'retrieval'])
title d2 keywords set(['precis', 'relevan', 'average', 'measure'])
title d3 keywords set(['language', 'space', 'most', 'vector', 'model', 'effic', 'retrieval'])
title d4 keywords set(['most', 'effic', 'language', 'english'])
title d5 keywords set(['average', 'precis', 'measure', 'model', 'effic', 'retrieval'])
['space', 'relevan', 'vector', 'english', 'model', 'retrieval', 'precis', 'average', 'measure', 'language', 'most', 'effic']


In [7]:
index = {}


for word in words_list:
    vector = []
    
    for d in documents:
        vector.append(1) if word in d.keywords else vector.append(0)
        index[word] = vector
        
print corpus
        
    
for key,value in index.iteritems():
    print key,value

['d1: for English model retrieval have a relevance model while vector space\nmodel retrieval do not', 'd2: The R-precision measure is relevant to average precision measure.', 'd3: The most efficient retrieval models are language model and vector space\nmodel.', 'd4: The English language is the most efficient language.', 'd5: Retrieval efficiency is measured by the average precision of the\nretrieval model.']
language [0, 0, 1, 1, 0]
space [1, 0, 1, 0, 0]
average [0, 1, 0, 0, 1]
measure [0, 1, 0, 0, 1]
precis [0, 1, 0, 0, 1]
most [0, 0, 1, 1, 0]
relevan [1, 1, 0, 0, 0]
vector [1, 0, 1, 0, 0]
english [1, 0, 0, 1, 0]
model [1, 0, 1, 0, 1]
effic [0, 0, 1, 1, 1]
retrieval [1, 0, 1, 0, 1]


# Weighted Index

Each term is weighted by its corpus frequency: (number of documents the term appears in) / (total number of documents). Documents that do not contain the term keep a weight of 0.

In [8]:
import math 
    
def term_weights(l, total_documents=None):
    """
        Turn a term's 0/1 document vector into a weighted vector.

        l is a vector of 1's and 0's indicating if a term is in a document,
        so its sum is the number of documents the term appears in. That sum
        divided by the total number of documents is the term's weight.

        total_documents is the number of documents to divide by; when it is
        left as None the notebook-level `corpus_length` is used.

        Returns a list as long as l: the weight where the term is present and
        0 where it is not.
    """
    if total_documents is None:
        total_documents = corpus_length

    total = sum(l)
    # The division stays inside the conditional so that a vector without any
    # hit never divides, even when total_documents is 0.
    return [total * 1.0 / total_documents if item else 0 for item in l]
    
weighted_index = {key:term_weights(value) for key,value in index.iteritems()}
print weighted_index

{'language': [0, 0, 0.4, 0.4, 0], 'space': [0.4, 0, 0.4, 0, 0], 'average': [0, 0.4, 0, 0, 0.4], 'english': [0.4, 0, 0, 0.4, 0], 'precis': [0, 0.4, 0, 0, 0.4], 'most': [0, 0, 0.4, 0.4, 0], 'relevan': [0.4, 0.4, 0, 0, 0], 'vector': [0.4, 0, 0.4, 0, 0], 'measure': [0, 0.4, 0, 0, 0.4], 'model': [0.6, 0, 0.6, 0, 0.6], 'effic': [0, 0, 0.6, 0.6, 0.6], 'retrieval': [0.6, 0, 0.6, 0, 0.6]}


In [9]:
def q(keyword):
    """
        Look up a query term in the index.

        The keyword is lowercased (the index only holds lowercase keywords)
        and stemmed before the lookup. Returns the term's 0/1 document vector,
        or a vector of zeros when the term is empty or not in the index.
    """
    no_match = [0] * corpus_length

    keyword = keyword.lower()
    # An empty term cannot match anything; answer directly instead of
    # handing an empty string to the stemmer.
    if not keyword:
        return no_match

    return index.get(stem(keyword), no_match)

def AND(*args):
    """
        Position-wise logical AND of any number of term vectors.

        Every argument is a list of 1's and 0's marking the documents a term
        appears in. zip(*args) groups the values found at the same position of
        every vector; a position is True only when all of its values are
        truthy. The result is as long as the shortest vector.
    """
    return list(map(all, zip(*args)))

def OR(*args):
    """
        Position-wise logical OR of any number of term vectors.

        Every argument is a list of 1's and 0's marking the documents a term
        appears in. zip(*args) groups the values found at the same position of
        every vector; a position is True when at least one of its values is
        truthy. The result is as long as the shortest vector.
    """
    return list(map(any, zip(*args)))

# Retrieval or Relevant

In [10]:
answer = OR(q('relevant'), q('retrieval') )

for term in ('relevant','retrieval'):
    print term, q(term)
    
print "Answer:",[y[0] for x,y in zip(answer, documents) if x]

relevant [1, 1, 0, 0, 0]
retrieval [1, 0, 1, 0, 1]
Answer: ['d1', 'd2', 'd3', 'd5']


# Efficient and Model and Efficient and Retrieval

In [11]:
answer = AND (q('efficient'), q('model'), q('efficient'), q('retrieval'))

for term in ('efficient', 'model','efficient','retrieval'):
    print term, q(term)
print "Answer:",[y[0] for x,y in zip(answer, documents) if x]

efficient [0, 0, 1, 1, 1]
model [1, 0, 1, 0, 1]
efficient [0, 0, 1, 1, 1]
retrieval [1, 0, 1, 0, 1]
Answer: ['d3', 'd5']


# (Precise and Recall) or Average

In [12]:
clause1 = AND(q('precise'), q('recall'))
average = q('average')
answer = OR(clause1, average)

print "precise, recall", q('precise'), q('recall')
print "precise AND recall:", clause1
print "average:", average
print "Answer:",[y[0] for x,y in zip(answer, documents) if x]

precise, recall [0, 1, 0, 0, 1] [0, 0, 0, 0, 0]
precise AND recall: [False, False, False, False, False]
average: [0, 1, 0, 0, 1]
Answer: ['d2', 'd5']
