# Process Stop Words

In [1]:
stop_words = """
a about above across adj after again against all almost alone along also although always am among
an and another any anybody anyone anything anywhere apart are around as aside at away be
because been before behind being below besides between beyond both but by can cannot could
deep did do does doing done down downwards during each either else enough etc even ever every
everybody everyone except far few for forth from get gets got had hardly has have having her here
herself him himself his how however i if in indeed instead into inward is it its itself just kept many
maybe might mine more most mostly much must myself near neither next no nobody none nor not
nothing nowhere of off often on only onto or other others ought our ours out outside over own p
per please plus pp quite rather really said seem self selves several shall she should since so some
somebody somewhat still such than that the their theirs them themselves then there therefore
these they this thorough thoroughly those through thus to together too toward towards under until
up upon v very was well were what whatever when whenever where whether which while who
whom whose will with within without would yet young your yourself
""".split()

# Split Corpus into Lines and Check for Whitespace

In [2]:
corpus = """
Doc1= English Language and Written Rules.

Doc2= Semantic Indexing made easy.

Doc3= Information Retrieval is fun!

Doc4= Writing English essays and academic papers for beginners.

Doc5= Retrieving Information from semantically diverse documents.

Doc6= Learn how to Index written textbooks.
"""

corpus = [c for c in corpus.split("\n") if c]

# Check no blank lines
# I often surround things with "---" to check for whitespace
for c in corpus:
    print "---"+c+"---"


---Doc1= English Language and Written Rules.---
---Doc2= Semantic Indexing made easy.---
---Doc3= Information Retrieval is fun!---
---Doc4= Writing English essays and academic papers for beginners.---
---Doc5= Retrieving Information from semantically diverse documents.---
---Doc6= Learn how to Index written textbooks.---


# Process Text

In [3]:
def process_text(sentence):
    '''
        Prepare words to be processed by:
            - making lowercase
            - splitting on space
            - removing stop words
            - removing punctuation from end of word
            - remove empty strings
            - remove unique words with set
            
        Q. Do I want to return a set or list?
    '''
    def remove_punctuation(word):
        '''
            If the last digit is not alpha then remove it
            Assumption: no numbers
        '''
        
        if not word[-1].isalpha():
            word = word[:-1]
            
        return word
    
    sentence = sentence.lower()
    words = [remove_punctuation(w) for w in sentence.split() if w not in stop_words]
    
    # remove empty words
    # use set to remove unique words
    return {w for w in words if w}
    


print "Test process text", process_text("A mad - MAn!") == ['mad', 'man']

Test process text False


In [4]:
from collections import defaultdict

index = defaultdict(set)

for c in corpus:    
    # Will split on "= " but better not to assume spaces will always be there
    document, text = c.split("= ")
    print "Document:", document
    print "Text:", "---"+text+"---"
    for word in process_text(text):
        print word
        index[word].add(document)

Document: Doc1
Text: ---English Language and Written Rules.---
rules
written
language
english
Document: Doc2
Text: ---Semantic Indexing made easy.---
indexing
made
semantic
easy
Document: Doc3
Text: ---Information Retrieval is fun!---
fun
information
retrieval
Document: Doc4
Text: ---Writing English essays and academic papers for beginners.---
papers
writing
beginners
academic
english
essays
Document: Doc5
Text: ---Retrieving Information from semantically diverse documents.---
information
semantically
diverse
documents
retrieving
Document: Doc6
Text: ---Learn how to Index written textbooks.---
textbooks
index
written
learn


In [5]:
print index

defaultdict(<type 'set'>, {'semantic': set(['Doc2']), 'diverse': set(['Doc5']), 'information': set(['Doc3', 'Doc5']), 'documents': set(['Doc5']), 'retrieving': set(['Doc5']), 'writing': set(['Doc4']), 'written': set(['Doc1', 'Doc6']), 'easy': set(['Doc2']), 'papers': set(['Doc4']), 'semantically': set(['Doc5']), 'index': set(['Doc6']), 'rules': set(['Doc1']), 'learn': set(['Doc6']), 'indexing': set(['Doc2']), 'beginners': set(['Doc4']), 'essays': set(['Doc4']), 'made': set(['Doc2']), 'language': set(['Doc1']), 'textbooks': set(['Doc6']), 'academic': set(['Doc4']), 'english': set(['Doc1', 'Doc4']), 'fun': set(['Doc3']), 'retrieval': set(['Doc3'])})


# Perform Boolean Operations

Search for 'English'

In [6]:
index['english']

{'Doc1', 'Doc4'}

Search for 'Index'

In [7]:
index['index']

{'Doc6'}

Search for 'Write AND English'

In [8]:
index['write'] & index ['english']

set()

Search for 'Retrieve AND Information'

In [9]:
index['retrieve'] & index ['information']

set()

Search for 'Fun OR English'

In [10]:
index['fun'] | index ['english']

{'Doc1', 'Doc3', 'Doc4'}

Search for 'English OR Information BUT NOT Write'

In [11]:
print index['english']
print index['information']
print index['write']

{doc for doc in (index['english'] | index['information']) if doc not in index['write'] }

set(['Doc1', 'Doc4'])
set(['Doc3', 'Doc5'])
set([])


{'Doc1', 'Doc3', 'Doc4', 'Doc5'}

Search for 'Index OR Semantic BUT NOT Easy OR Information'

In [12]:
print index['index'] | index['semantic']
print index['easy']  | index['information']

[doc for doc in (index['index'] | index['semantic']) if doc not in (index['easy'] | index['information']) ]

set(['Doc2', 'Doc6'])
set(['Doc2', 'Doc3', 'Doc5'])


['Doc6']