In [1]:
from ipy_table import * # to display calculations

def remove_all_punctuation_and_numbers(word):
    '''
        If the last digit is not alpha then remove it
    '''

    if not word[-1].isalpha():
        word = word[:-1]

    return word

In [2]:
from __future__ import division # means division can return decimals
from collections import defaultdict # a dict where you can set a default value
from collections import namedtuple # much more efficient than objects
from collections import Counter # creates automatically key:frequency dictionary
import math

corpus = """
d1: for English model retrieval have a relevance model while vector space
model retrieval do not;
d2: The R-precision measure is relevant to average precision measure.;
d3: The most efficient retrieval models are language model and vector space
model.;
d4: The English language is the most efficient language.;
d5: Retrieval efficiency is measured by the average precision of the
retrieval model.
"""

stems = {
    "models":"model",
    "r-precision":"precis",
    "precision":"precis",
    "precise":"precis",
    "efficient":"effic",
    "efficiency":"effic",
    #"recall":"retrieval",
    "relevant":"relevan",
    "relevance":"relevan",
    "measured":"measure",
}

string = """a , at, are, for, of, I , is, there, then, many, do, to, and, by, the, not, have, with, while"""

stop_words = [s.strip() for s in string.split(',')]



class VectorSpaceModel(object):
    """
        Preprocessing:
        1. Gathering
        2. Stemming
        3. Stopword Removal
        4. Indexing
    
        1a. Read in documents use doc_sep to seperate docs (default ";") done in constructor
        1b. Seperate each doc into name and contentsusing title_sep (default = ":") 
        1c. Break into constituent parts if TF-IDF we need to count occurrences
        
        2a. lowercase
        2b. remove punctuation #NEED DEFAULT PUNCTUATION LIST
        2c. use stemming dictionary (deafault = None) 
        ??  Check for hyphens
        
        3a. Remove stopwords (default in __stopwords__)

        4a. Create index
        
        
        inverse document frequency:
        for each document
        - 
    """
    

    """
        If we are going to have a lot of documents then the overhead of objet creation will slow us down
        'title' is the name of the document
        'string' is the original string
        keywords' are the words in a document
    """
    Document = namedtuple('Document',['title', 'string', 'keywords'])

    
    def __init__(self, model_corpus, stopwords, **kwargs):
        # kwargs.get(foo, default) searches for named arguments e.g doc-sep='\n' but provides a default if not found
        self.doc_sep = kwargs.get('doc_sep', ';') # seperates documents in corpus
        self.title_sep = kwargs.get('title_sep', ':') # seperates document title from comments
        self.remove_punctuation = kwargs.get('punctuation', None) # default function to remove punctuation
        self.stopwords = stopwords
      
        # dictionary to store how many documents a word was in
        self.document_frequencies = defaultdict(lambda: 0)   
        
        # default function to remove punctuation
        if not self.remove_punctuation:
            self.remove_punctuation = remove_all_punctuation_and_numbers
            
        self.stemming_dict = kwargs.get('stemming_dict', None)
        
        self.number_of_documents = 0
        self.idf_squared = {} # key = document frequency : value = idf**2 to speed up calculations
        self.ipy_table = [] # for displaying calculations
        self.ipy_table_headers = [] # to know which rows to style in output
        
        
        # Needs to be split into doc and contents
        self._preprocess(model_corpus)
        
        
        
    def _preprocess(self, model_corpus):
        self._read_documents(model_corpus)
        self.tf_idf()
    
    def _read_documents(self, model_corpus):
        """
            self.corpus.split(self.doc_sep) creates individual document strings (1a)
            Each document performs:
                - speration of document string into title and contents using self.title_sep
                - transition to lower case
                - removal of punctuation
                - stemming
                - stopword removal
        """
       
        self.documents = [self.create_document_tuples(self, doc) for doc in model_corpus.split(self.doc_sep) if doc]
        
    def stem(self, word):
        word = self.remove_punctuation(word.strip().lower()) # remove punctuation and change to lower case
        if word not in self.stopwords:
            return self.stemming_dict.get(word, word) # find a stem else return the word as is
        else:
            return None # this is a stopword so we ignore it
    
    def inverse_document_frequency(self, doc_frequency):
        if doc_frequency:
            return math.log10(self.number_of_documents/doc_frequency)
        else: return 0 # to avoid divison by zero
        
    def tf_idf(self):

        tf_idf = defaultdict(lambda: [])
        set_of_df = set(self.document_frequencies.values()) 
        self.number_of_documents = len(self.documents)

        # cache the values of idf^2 for a given document frequency
        # assumes document idf = query idf
        self.idf_squared = {df : (self.inverse_document_frequency(df)**2) for df in set_of_df}
        self.idf_squared.update({0:0})
        # Get the lengths of vectors
        self.document_lengths = {document.title: self.get_vector_length(document) for document in self.documents}
         
    
    def get_vector_length(self,document):
        
        """
            Vector length is math.sqrt(sum(tf_idf**2))
            tf_idf**2 = tf*idf*tf*idf = tf**2 * idf**2
            
            idf_squared already calculated
            tf is frequency divide by most frequent word
            
        """
        tf_dict = document.keywords
    
        total = 0 # not used in fact
        
        header_row = [document.title,'tf', 'df','idf', 'tfidf','tfidf**2']
        
        header_row_number = len(self.ipy_table)
        self.ipy_table_headers.append(header_row_number)
        self.ipy_table.append(header_row)
         
        
        
        
        
        for key, tf in tf_dict.items():
            
            """
                Only used for display purposes
            """
            
            #print "KEY, DF", key, self.document_frequencies[key]
            #print "IDF SQUARED", self.idf_squared [self.document_frequencies[key]]
            
            idf2 = self.idf_squared [self.document_frequencies[key]]
            
            idf = self.inverse_document_frequency(self.document_frequencies [key])
            tfbyidf = (idf * tf)**2
            total += tfbyidf
            #print key, "-> tf=", round(tf,2), "df=", self.document_frequencies [key], "idf", round(idf,2), "tfidf=", 
            #print round(math.log10(self.number_of_documents/self.document_frequencies [key]) * tf_dict[key],2),
            #print "tfidf**2=", round(tfbyidf,5)
            row = [key, round(tf,2), self.document_frequencies [key],
                   self.inverse_document_frequency(self.document_frequencies [key]), 
                   round(idf,2)*round(tf,2), tfbyidf]
            #print len(row), row
            #self.document_lengths_table.append(row)
            self.ipy_table.append(row)

    
        
        # Actual calculation
        query_length_squared = sum(self.idf_squared[self.document_frequencies [kw]] * (tf_dict [kw]**2) for kw in tf_dict.keys())
        

        vector_length = math.sqrt(query_length_squared)
        
        self.ipy_table.append(["", "", "", "", "Total", query_length_squared ])
        self.ipy_table.append(["", "", "", "", "Length", vector_length ])
        
        make_table(self.ipy_table, interactive=False)
        
        for row_number in self.ipy_table_headers:
            set_row_style(row_number,color="blue")
            
        return vector_length
        
    
    def query(self, string):
          
        self.ipy_table = [] # New table so reset
        self.ipy_table_headers = []

        
        keywords = filter (None, (self.stem(word) for word in string.split())) # Stemming
        
        counter = Counter(keywords) # Count frequencies
        most_frequent = counter.most_common(1)[0][1] # returns list of tuples hence long access line
        query_tf_dict = {key: value/most_frequent for key, value in counter.items()}
        
        # get_vector_length expects a document so turn query into document
        query = self.Document(title="query", string=string, keywords=query_tf_dict)
        query_length = self.get_vector_length(query)
        
        
        """
            Dot product:
            
            sum of
            for key value in tf_dict:
                
                (query tf * term idf) * (document tf * term idf)
                = query tf * document tf * term idf squared 
        """
        
        query_table = []
        row_headers = []
        
        for document in self.documents:
            
            self.ipy_table_headers.append(len(self.ipy_table)+len(query_table))
            
            query_table.append( [document.title + ".q", "dtf", "didf", "qtf","qidf", "product"] )
          
            
            for term, tf in query_tf_dict.items():
                qidf = self.inverse_document_frequency(self.document_frequencies[term])
                dtf = document.keywords.get(term, 0)
                didf = self.inverse_document_frequency(self.document_frequencies[term])
                #totalA += tf*qidf*dtf*didf
                
                query_table.append([term, dtf, didf, tf, qidf, tf*qidf*dtf*didf])
                                  
                #print term, "-> qtf=", round(tf,2) , "qidf=", round(qidf,2),
                #print "dtf=", round(dtf,2),
                #print "didf=", round(didf,2),
                #print "q.tfidf=", tf*qidf, "d.tfidf=",dtf*didf, "=>", tf*qidf*dtf*didf
                
                                                                    
            dot_product = sum(document.keywords.get(key, 0) * query_tf * 
                                      self.idf_squared[self.document_frequencies[key]] \
                                      for key, query_tf in query_tf_dict.items())
      
            
            document_length = self.document_lengths[document.title]
            
            cos = dot_product/(document_length * query_length)
            
            # Add cos calculations to table for display
            query_table.append(["","","","", "d.q", dot_product])
            query_table.append(["","","","", "|d|*|q|", document_length*query_length])
            query_table.append(["","","","", "cos", cos])
            
            #print document.title, "cos=", cos, "||d||=", document_length, "||q||=",query_length,  "d.q=", dot_product, \
            #"||d||*||q||=", document_length*query_length
         
        self.ipy_table.extend(query_table)
        
        make_table(self.ipy_table, interactive=False)
        
        
        
        for row_number in self.ipy_table_headers:
            set_row_style(row_number,color="red")
        
        
            
    def create_document_tuples(self, model, string):
        
        title, contents = string.split(self.title_sep)
        
        string = contents.strip()
        
        title=title.strip()

        tf = {}
        
        contents = filter(None,(self.stem(word) for word in contents.split()))
    
        counter = Counter(contents)
        
        keywords = set( counter.keys())
    
        most_frequent = counter.most_common(1)[0][1]
        
        for kw in counter.keys():
            
            tf[kw] = counter[kw]/most_frequent
            self.document_frequencies[kw] += 1 # increment the document frequencies dictionary
            
            
        doc_tuple = VectorSpaceModel.Document(title = title, string = string, keywords =tf)
        
        
        
        return doc_tuple   

In [3]:
        
vsm = VectorSpaceModel(corpus, stop_words, stemming_dict=stems) 
render()

0,1,2,3,4,5
d1,tf,df,idf,tfidf,tfidf**2
space,0.3300,2,0.3979,0.1320,0.0176
relevan,0.3300,2,0.3979,0.1320,0.0176
vector,0.3300,2,0.3979,0.1320,0.0176
english,0.3300,2,0.3979,0.1320,0.0176
model,1.0000,3,0.2218,0.2200,0.0492
retrieval,0.6700,3,0.2218,0.1474,0.0219
,,,,Total,0.1415
,,,,Length,0.3761
d2,tf,df,idf,tfidf,tfidf**2


### Q1: relevant retrieval 

In [4]:
vsm.query("relevant retrieval")
render()

0,1,2,3,4,5
query,tf,df,idf,tfidf,tfidf**2
relevan,1.0000,2,0.3979,0.4000,0.1584
retrieval,1.0000,3,0.2218,0.2200,0.0492
,,,,Total,0.2076
,,,,Length,0.4556
d1.q,dtf,didf,qtf,qidf,product
relevan,0.3333,0.3979,1.0000,0.3979,0.0528
retrieval,0.6667,0.2218,1.0000,0.2218,0.0328
,,,,d.q,0.0856
,,,,|d|*|q|,0.1714


### Q2: efficient model efficient retrieval

In [5]:
vsm.query("efficient model efficient retrieval")
render()

0,1,2,3,4,5
query,tf,df,idf,tfidf,tfidf**2
model,0.5000,3,0.2218,0.1100,0.0123
effic,1.0000,3,0.2218,0.2200,0.0492
retrieval,0.5000,3,0.2218,0.1100,0.0123
,,,,Total,0.0738
,,,,Length,0.2717
d1.q,dtf,didf,qtf,qidf,product
model,1.0000,0.2218,0.5000,0.2218,0.0246
effic,0,0.2218,1.0000,0.2218,0.0000
retrieval,0.6667,0.2218,0.5000,0.2218,0.0164


### Q3: precise precision with average recall

In [6]:
vsm.query("precise precision with average recall")
render()

0,1,2,3,4,5
query,tf,df,idf,tfidf,tfidf**2
precis,1.0000,2,0.3979,0.4000,0.1584
recall,0.5000,0,0,0.0000,0.0000
average,0.5000,2,0.3979,0.2000,0.0396
,,,,Total,0.1979
,,,,Length,0.4449
d1.q,dtf,didf,qtf,qidf,product
precis,0,0.3979,1.0000,0.3979,0.0000
recall,0,0,0.5000,0,0.0000
average,0,0.3979,0.5000,0.3979,0.0000
