In [1]:
def remove_all_punctuation_and_numbers(word):
    '''
        If the last digit is not alpha then remove it
    '''

    if not word[-1].isalpha():
        word = word[:-1]

    return word

In [2]:
from __future__ import division # means division can return decimals
from collections import defaultdict # a dict where you can set a default value
from collections import namedtuple # much more efficient than objects
from collections import Counter # creates automatically key:frequency dictionary
import math

corpus = """
d1: for English model retrieval have a relevance model while vector space
model retrieval do not;
d2: The R-precision measure is relevant to average precision measure.;
d3: The most efficient retrieval models are language model and vector space
model.;
d4: The English language is the most efficient language.;
d5: Retrieval efficiency is measured by the average precision of the
retrieval model.
"""

stems = {
    "models":"model",
    "r-precision":"precis",
    "precision":"precis",
    "precise":"precis",
    "efficient":"effic",
    "efficiency":"effic",
    "recall":"retrieval",
    "relevant":"relevan",
    "relevance":"relevan",
    "measured":"measure",
}

string = """a , at, are, for, of, I , is, there, then, many, do, to, and, by, the, not, have, with, while"""

stop_words = [s.strip() for s in string.split(',')]

In [3]:
class ExtendedBooleanModel(object):
    """
        Preprocessing:
        1. Gathering
        2. Stemming
        3. Stopword Removal
        4. Indexing
    
        1a. Read in documents use doc_sep to seperate docs (default ";") done in constructor
        1b. Seperate each doc into name and contentsusing title_sep (default = ":") 
        1c. Break into constituent parts if TF-IDF we need to count occurrences
        
        2a. lowercase
        2b. remove punctuation #NEED DEFAULT PUNCTUATION LIST
        2c. use stemming dictionary (deafault = None) 
        ??  Check for hyphens
        
        3a. Remove stopwords (default in __stopwords__)

        4a. Create index
        
        
        inverse document frequency:
        for each document
        - 
    """
    

    """
        If we are going to have a lot of documents then the overhead of objet creation will slow us down
        'title' is the name of the document
        'keyword' is the set of stemmed words in the document 
    """
    Document = namedtuple('Document',['title', 'keywords'])

    
    def __init__(self, corpus, stopwords, **kwargs):
        # kwargs.get(foo, default) searches for named arguments e.g pnorm=4 but provides a default if not found
        self.pnorm = kwargs.get('pnorm',2)
        self.doc_sep = kwargs.get('doc_sep', ';') # seperates documents in corpus
        self.title_sep = kwargs.get('title_sep', ':') # seperates document title from comments
        self.remove_punctuation = kwargs.get('punctuation', None) # function to remove punctuation
        self.stopwords = stopwords
        self.pnorm = kwargs.get("pnorm", 2)
        self.last_query = None 
        
        """
            We set defaultdict default to 0. Now we can go through each documents keywords and call:
                self.term_weights += 1
            We do not have to worry wether the word was in the dictionary already.
            This is performed as a side effect in create_document_tuples to avoid multiple passes
            An alternative to explore: join the sets in a list and pass them to a Counter object
        """
        self.term_weights = defaultdict(lambda: 0)
        
        # default function to remove punctuation
        if not self.remove_punctuation:
            self.remove_punctuation = remove_all_punctuation_and_numbers
            
        self.stemming_dict = kwargs.get('stemming_dict', None)
        
        # Needs to be split into doc and contents
        self._preprocess(corpus)
        
    def _preprocess(self, corpus):
        
        self.documents = self._read_documents(corpus) # creates the self.documents list of namedtuples
        self.number_of_documents = len(self.documents) # save vale for calcs, if need to add docs may need to change
        
        # in _read_documents::create_document_tuples frequencies of terms
        # were added to self.term weights. Divide this by the number of docs
        # An alternative: add 1/len(docs) but hardly seems worth it and potential loss of accuracy
        self.term_weights = {key: value/self.number_of_documents for key, value in self.term_weights.items()}
        
        self.index = self.build_index() # builds a dictionary of word:list pairs
        print self.index
    
    def _read_documents(self, corpus):
        """
            self.corpus.split(self.doc_sep) creates individual document strings (1a)
            Each document performs:
                - speration of document string into title and contents using self.title_sep
                - transition to lower case
                - removal of punctuation
                - stemming
                - stopword removal
        """
       
        return [self.create_document_tuples(self, doc) for doc in corpus.split(self.doc_sep) if doc]
        
    def stem(self, word):
        word = self.remove_punctuation(word.strip().lower()) 
        
        if word not in self.stopwords:
            return self.stemming_dict.get(word, word)
        else:
            return None
    
         
    
    def build_index(self):
        """
            Builds a dictionary of word:list pairs
            For each word,value create a list
                For each document append value, if word is in document.keywords, or else 0 to the list
                
            So corpus frequency for a word is 0.4 and word appears in docs 1 and 5 should return [0.4,0,0,0,0.4]
        """
        index = {}

        for word, value in self.term_weights.items(): # value = corpus frequency
            vector = []

            for d in self.documents:
                vector.append(value) if word in d.keywords else vector.append(0)
                index[word] = vector
                
        return index
        
            
    def create_document_tuples(self, model, string):
        
        title, contents = string.split(self.title_sep)

        title=title.strip()
        
        # term frequency irrelevant so use set to avoid stemming multiple words
        candidate_keywords = set(contents.split()) 
        keywords = filter(None,(self.stem(word) for word in candidate_keywords))
        
        print title,  "- KEYWORDS:", keywords
        
        for kw in keywords:
            self.term_weights[kw] += 1
            
            
        doc_tuple = ExtendedBooleanModel.Document(title = title, keywords = keywords)

        return doc_tuple
    
        
    
    def OR(self, *args):
        
        """
            Evaluates an OR query in the Extended Boolean Model
            'args' is of variable length and will either be a word from the query or the list of weights associated
            with that query i.e [w1, w2, ... wn] where weight wn is the corpus frequency for a word in document n
            
            Any words in args are converted to the appropriate list of weights using @apply_weights
            
            Now we have 'terms' which is a list of list where the nth entry of each list refers to document n. We can
            join the nth entries with the zip command - using *, the splat operator, since we have a nested list - 
            see Zip and Splat below. 
            
            This forms an unnamed intermediate data structure on which we call apply_pnorm on each and this converts
            our lists into single entries which have to be raised to 1/pnorm before being returned.                    
        """
        
        def apply_weights(word):
            """
                This takes a word from the query and stems it and looks up its weight in the index
            """
            print "In OR.apply_weight with term:", word, "-->", self.index[self.stem(word)]
            return self.index[self.stem(word)]
            #return [item**pnorm for item in weights]
            
        def apply_pnorm(weights):
            """
                Thia takes in a list of term weights that are raised to the power of pnorm
                This is then summed and divided by length of the list
                All this is raised to the power of 1/pnorm i.e the inverse root
            """
            
            print "\nIn OR.apply_pnorm with p-norm =", pnorm
            print weights, "--> (1-w)^p", [x**pnorm for x in weights], "--> sum =", sum(x**pnorm for x in weights), 
            print "sum/n =" , sum(x**pnorm for x in weights)/len(weights)
            print "returned value", (sum(x**pnorm for x in weights)/len(weights))**(1/pnorm)
        
            return (sum(x**pnorm for x in weights)/len(weights))**(1/pnorm)
        print "OR function with arguments:", ",".join(str(a) for a in args )
        
        pnorm = self.pnorm 

        terms = (apply_weights(a) if not isinstance(a, list) else a for a in args)
        weights = [apply_pnorm(x) for x in zip(*terms)]
        
        print "weights after pnorm", [round(w, 4) for w in weights]
        print "\n=========================================\n"
        return weights
        
        
    def AND(self, *args):
        
        """
            Evaluates an AND query in the Extended Boolean Model
            'args' is of variable length and will either be a word from the query or the list of weights associated
            with that query i.e [w1, w2, ... wn] where weight wn is the corpus frequency for a word in document n
            
            Any words in args are converted to the appropriate list of weights using @apply_weights
            
            Now we have 'terms' which is a list of list where the nth entry of each list refers to document n. We can
            join the nth entries with the zip command - using *, the splat operator, since we have a nested list - 
            see Zip and Splat below. 
            
            This forms an unnamed intermediate data structure on which we call @apply_pnorm on each and this converts
            our lists into single entries which have to be raised to 1/pnorm before being returned.                    
        """
        
        
        
        def apply_weights(word):
            """
                This takes a word from the query and stems it and looks up its weight in the index
            """
            print "In AND.apply_weight with term:", word, "-->", self.index[self.stem(word)]
            return self.index[self.stem(word)]
            
        def apply_pnorm(weights):
            """
                Thia takes in a list of term weights that are then subtracted from one
                and the result is raised to the power of pnorm
                This is then summed and divided by length of the list
                All this is raised to the power of 1/pnorm i.e the inverse root
            """
            
            
            print "\nIn AND.apply_pnorm with p-norm =", pnorm
            print weights, "--> (1-w)^p", [(1-x)**pnorm for x in weights], "--> sum =", sum((1-x)**pnorm for x in weights), 
            print "sum/n =" , sum((1-x)**pnorm for x in weights)/len(weights), "-->(sum/n)**1/p =",
            print (sum((1-x)**pnorm for x in weights)/len(weights))**(1/pnorm)
            print "returned value", 1- (sum((1-x)**pnorm for x in weights)/len(weights))**(1/pnorm)
            
            
            return 1- (sum((1-x)**pnorm for x in weights)/len(weights))**(1/pnorm)
        
        pnorm = self.pnorm
        print "AND function with arguments:", ",".join(str(a) for a in args)
        
        terms = [apply_weights(a) if not isinstance(a, list) else a for a in args ]
        weights = [apply_pnorm(x) for x in zip(*terms)]
        print "\n1 - weights after pnorm", [round(w, 4) for w in weights]
        print "\n========================================="
        return weights
                   
        
ebm = ExtendedBooleanModel(corpus, stop_words, stemming_dict=stems) 

d1 - KEYWORDS: ['space', 'vector', 'english', 'relevan', 'model', 'retrieval']
d2 - KEYWORDS: ['relevan', 'precis', 'precis', 'measure', 'measure', 'average']
d3 - KEYWORDS: ['model', 'language', 'space', 'effic', 'most', 'vector', 'model', 'retrieval', 'model']
d4 - KEYWORDS: ['language', 'effic', 'language', 'most', 'english']
d5 - KEYWORDS: ['precis', 'effic', 'model', 'average', 'retrieval', 'measure', 'retrieval']
{'language': [0, 0, 0.6, 0.6, 0], 'space': [0.4, 0, 0.4, 0, 0], 'average': [0, 0.4, 0, 0, 0.4], 'measure': [0, 0.6, 0, 0, 0.6], 'precis': [0, 0.6, 0, 0, 0.6], 'most': [0, 0, 0.4, 0.4, 0], 'relevan': [0.4, 0.4, 0, 0, 0], 'vector': [0.4, 0, 0.4, 0, 0], 'english': [0.4, 0, 0, 0.4, 0], 'model': [1.0, 0, 1.0, 0, 1.0], 'effic': [0, 0, 0.6, 0.6, 0.6], 'retrieval': [0.8, 0, 0.8, 0, 0.8]}


In [4]:
OR = ebm.OR
AND = ebm.AND

### Q1: relevant retrieval 



In [5]:
weights = OR('relevant', 'retrieval')

for document, weight in sorted(zip(ebm.documents, weights), key= lambda x: x[1], reverse = True):
    print document.title, "has a weight of", weight

OR function with arguments: relevant,retrieval
In OR.apply_weight with term: relevant --> [0.4, 0.4, 0, 0, 0]
In OR.apply_weight with term: retrieval --> [0.8, 0, 0.8, 0, 0.8]

In OR.apply_pnorm with p-norm = 2
(0.4, 0.8) --> (1-w)^p [0.16000000000000003, 0.6400000000000001] --> sum = 0.8 sum/n = 0.4
returned value 0.632455532034

In OR.apply_pnorm with p-norm = 2
(0.4, 0) --> (1-w)^p [0.16000000000000003, 0] --> sum = 0.16 sum/n = 0.08
returned value 0.282842712475

In OR.apply_pnorm with p-norm = 2
(0, 0.8) --> (1-w)^p [0, 0.6400000000000001] --> sum = 0.64 sum/n = 0.32
returned value 0.565685424949

In OR.apply_pnorm with p-norm = 2
(0, 0) --> (1-w)^p [0, 0] --> sum = 0 sum/n = 0.0
returned value 0.0

In OR.apply_pnorm with p-norm = 2
(0, 0.8) --> (1-w)^p [0, 0.6400000000000001] --> sum = 0.64 sum/n = 0.32
returned value 0.565685424949
weights after pnorm [0.6325, 0.2828, 0.5657, 0.0, 0.5657]


d1 has a weight of 0.632455532034
d3 has a weight of 0.565685424949
d5 has a weight of 0.

### Q2: efficient model efficient retrieval


In [6]:
weights = AND("efficient", "model", "efficient", "retrieval")

for document, weight in sorted(zip(ebm.documents, weights), key= lambda x: x[1], reverse = True):
    print document.title, "has a weight of", weight

AND function with arguments: efficient,model,efficient,retrieval
In AND.apply_weight with term: efficient --> [0, 0, 0.6, 0.6, 0.6]
In AND.apply_weight with term: model --> [1.0, 0, 1.0, 0, 1.0]
In AND.apply_weight with term: efficient --> [0, 0, 0.6, 0.6, 0.6]
In AND.apply_weight with term: retrieval --> [0.8, 0, 0.8, 0, 0.8]

In AND.apply_pnorm with p-norm = 2
(0, 1.0, 0, 0.8) --> (1-w)^p [1, 0.0, 1, 0.03999999999999998] --> sum = 2.04 sum/n = 0.51 -->(sum/n)**1/p = 0.714142842854
returned value 0.285857157146

In AND.apply_pnorm with p-norm = 2
(0, 0, 0, 0) --> (1-w)^p [1, 1, 1, 1] --> sum = 4 sum/n = 1.0 -->(sum/n)**1/p = 1.0
returned value 0.0

In AND.apply_pnorm with p-norm = 2
(0.6, 1.0, 0.6, 0.8) --> (1-w)^p [0.16000000000000003, 0.0, 0.16000000000000003, 0.03999999999999998] --> sum = 0.36 sum/n = 0.09 -->(sum/n)**1/p = 0.3
returned value 0.7

In AND.apply_pnorm with p-norm = 2
(0.6, 0, 0.6, 0) --> (1-w)^p [0.16000000000000003, 1, 0.16000000000000003, 1] --> sum = 2.32 sum/n =

### Q3: precise precision with average recall

This will be represented by the following code:

```python
OR(AND("precise","recall"), "average")`
```

It will be processed by depth first search. If a function is reached it will continue down the tree until it reaches a node whose children are all words. As here the AND node has.
<img src="https://chart.googleapis.com/chart?cht=gv:dot&amp;chl=digraph{ OR [style=filled,fillcolor=forestgreen]; AND[style=filled,fillcolor=red]; OR->AND[color=red] ;  AND->precise; AND->recall; OR->average}&amp;chs=250x250" alt="neato chart">

In [7]:
weights = OR(AND("precise","recall"), "average")

for document, weight in sorted(zip(ebm.documents, weights), key= lambda x: x[1], reverse = True):
    print document.title, "has a weight of", weight

AND function with arguments: precise,recall
In AND.apply_weight with term: precise --> [0, 0.6, 0, 0, 0.6]
In AND.apply_weight with term: recall --> [0.8, 0, 0.8, 0, 0.8]

In AND.apply_pnorm with p-norm = 2
(0, 0.8) --> (1-w)^p [1, 0.03999999999999998] --> sum = 1.04 sum/n = 0.52 -->(sum/n)**1/p = 0.721110255093
returned value 0.278889744907

In AND.apply_pnorm with p-norm = 2
(0.6, 0) --> (1-w)^p [0.16000000000000003, 1] --> sum = 1.16 sum/n = 0.58 -->(sum/n)**1/p = 0.761577310586
returned value 0.238422689414

In AND.apply_pnorm with p-norm = 2
(0, 0.8) --> (1-w)^p [1, 0.03999999999999998] --> sum = 1.04 sum/n = 0.52 -->(sum/n)**1/p = 0.721110255093
returned value 0.278889744907

In AND.apply_pnorm with p-norm = 2
(0, 0) --> (1-w)^p [1, 1] --> sum = 2 sum/n = 1.0 -->(sum/n)**1/p = 1.0
returned value 0.0

In AND.apply_pnorm with p-norm = 2
(0.6, 0.8) --> (1-w)^p [0.16000000000000003, 0.03999999999999998] --> sum = 0.2 sum/n = 0.1 -->(sum/n)**1/p = 0.316227766017
returned value 0.68377

# Calculate P-Norms for Q3

In [8]:
ebm.pnorm = 1
weights1 = OR('relevant', 'retrieval')
ebm.pnorm = 10
weights2 = OR('relevant', 'retrieval')
ebm.pnorm = 100
weights3 = OR('relevant', 'retrieval')

for document, weight1, weight2, weight3 in \
        sorted(zip(ebm.documents, weights1, weights2, weights3), key= lambda x: x[1], reverse = True):
    print document.title, "has a weights of", weight1, weight2, weight3, "for pnorms of 1, 10 and 100 respectively"

OR function with arguments: relevant,retrieval
In OR.apply_weight with term: relevant --> [0.4, 0.4, 0, 0, 0]
In OR.apply_weight with term: retrieval --> [0.8, 0, 0.8, 0, 0.8]

In OR.apply_pnorm with p-norm = 1
(0.4, 0.8) --> (1-w)^p [0.4, 0.8] --> sum = 1.2 sum/n = 0.6
returned value 0.6

In OR.apply_pnorm with p-norm = 1
(0.4, 0) --> (1-w)^p [0.4, 0] --> sum = 0.4 sum/n = 0.2
returned value 0.2

In OR.apply_pnorm with p-norm = 1
(0, 0.8) --> (1-w)^p [0, 0.8] --> sum = 0.8 sum/n = 0.4
returned value 0.4

In OR.apply_pnorm with p-norm = 1
(0, 0) --> (1-w)^p [0, 0] --> sum = 0 sum/n = 0.0
returned value 0.0

In OR.apply_pnorm with p-norm = 1
(0, 0.8) --> (1-w)^p [0, 0.8] --> sum = 0.8 sum/n = 0.4
returned value 0.4
weights after pnorm [0.6, 0.2, 0.4, 0.0, 0.4]


OR function with arguments: relevant,retrieval
In OR.apply_weight with term: relevant --> [0.4, 0.4, 0, 0, 0]
In OR.apply_weight with term: retrieval --> [0.8, 0, 0.8, 0, 0.8]

In OR.apply_pnorm with p-norm = 10
(0.4, 0.8) --> (

In [9]:
# Checking the behaviour

for i in range(5):
    p = 4**i 
    print (0.1**(p))**(1/p), ( (0.4**p + 0.8**p)/2)**(1/p), ( (0.2**p + 0.4**p + 0.6**p)/3)**(1/p)

0.1 0.6 0.4
0.1 0.68299059407 0.478141237146
0.1 0.766083355147 0.560237874087
0.1 0.791382410555 0.589788405722
0.1 0.797836844868 0.597430644527


In [10]:
for i in range(5):
    p = 4**i 
    print (0.1**(p))**(1/p), ( (0.4**p + 0.8**p)/2)**(1/p), ( (0.2**p + 0.4**p + 0.6**p)/2)**(1/p)

0.1 0.6 0.6
0.1 0.68299059407 0.529150262213
0.1 0.766083355147 0.574616601241
0.1 0.791382410555 0.593536807916
0.1 0.797836844868 0.598377633651


### Zip and Splat

In [11]:
matrix = [[1,2,3],[4,5,6]]

print matrix, "as list of columns"
print zip (*matrix), "as list of rows"
print zip(*zip (*matrix)), "back to list of columns"

[[1, 2, 3], [4, 5, 6]] as list of columns
[(1, 4), (2, 5), (3, 6)] as list of rows
[(1, 2, 3), (4, 5, 6)] back to list of columns
