In [2]:
from whoosh.index import create_in
from whoosh.fields import *

import os
import time
# from whoosh.analysis import *
# schema = Schema(pageId=TEXT(stored=True), senId=NUMERIC(stored=True), content=TEXT(analyzer = StemmingAnalyzer()))

# Field layout of the index:
#   doc     - page identifier, indexed as a single term and stored with the hit
#   wiki    - number of the source file the page was read from, stored with the hit
#   content - searchable text; indexed but not stored
schema = Schema(doc=ID(stored=True), wiki=NUMERIC(stored=True), content=TEXT)

# create_in() expects the target directory to exist, so create it on a fresh
# checkout instead of failing.
# NOTE(review): per the Whoosh docs, create_in() on a directory that already
# holds an index starts over with an empty one — confirm that is intended
# before re-running this cell against a finished index.
index_dir = "wiki_index_dir"
os.makedirs(index_dir, exist_ok=True)
ix = create_in(index_dir, schema)

# Input files are addressed as prefix + zero-padded number + suffix.
prefix = "./wiki-pages-text/wiki-"
suffix  = ".txt"

In [3]:
import nltk
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook uses, not just WordNet:
#   wordnet   - backs WordNetLemmatizer
#   stopwords - backs stopwords.words('english')
#   punkt     - backs nltk.word_tokenize
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead of 'punkt'
# — confirm against the installed version.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# Single shared lemmatizer instance used by lemmatize().
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    """Return the WordNet lemma of ``word``.

    The word is first lemmatised as a verb. If that leaves it unchanged,
    the noun lemma is returned instead.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    # Verb lookup made no change: fall back to the noun reading.
    return lemmatizer.lemmatize(word, 'n')

# Tokens removed from every sentence: NLTK's English stop words plus a few
# punctuation marks and the "-lrb-"/"-rrb-" bracket placeholders.
# The backslash in '\\s' is escaped explicitly; the runtime string is still a
# backslash followed by "s", but it no longer triggers Python's
# invalid-escape-sequence warning.
sw = stopwords.words('english') + ['!', ',', '.', '?', '\\s', "\n", "-lrb-", "-rrb-"]

# Frozen copy used for filtering, so each membership test is a hash lookup
# instead of a scan over the whole list.
_sw_lookup = frozenset(sw)

def pre_process(sentence):
    """Turn a sentence into a list of normalised tokens.

    Steps, in order: tokenise with NLTK, lemmatise each token, lower-case it,
    then drop every token that appears in the stop list.

    Args:
        sentence: text to normalise.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    # Lemmatisation deliberately runs on the original casing, as before;
    # lower-casing happens afterwards.
    tokens = [lemmatize(t).lower() for t in nltk.word_tokenize(sentence)]
    return [t for t in tokens if t not in _sw_lookup]

[nltk_data] Downloading package wordnet to /Users/yigewen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Build the index one input file at a time (numbers 001 .. 109).
# Each line is split into three space-separated fields: page id, a second
# field, and the remaining text. Consecutive lines with the same page id are
# merged into a single indexed document.
for i in range(109):
    # time.clock() was removed in Python 3.8; perf_counter() is the supported
    # high-resolution timer.
    start = time.perf_counter()

    wikiNum = str(i + 1).zfill(3)
    filename = prefix + wikiNum + suffix
    print("wiki-" + wikiNum)

    doc = None      # page id currently being accumulated
    content = []    # tokens collected so far for that page

    # Both resources are context-managed: the file is always closed, and the
    # writer commits on success or cancels (releasing the index lock) on error.
    with open(filename, encoding="utf-8") as wiki_file:
        with ix.writer(procs=4, limitmb=256) as writer:
            for line in wiki_file:
                parts = line.split(" ", 2)
                if len(parts) < 3:
                    # Blank or truncated line: nothing to index.
                    continue
                a, b, c = parts
                if b.isalpha():
                    # Lines whose second field is purely alphabetic are skipped.
                    continue

                if doc is not None and doc != a:
                    # Page id changed: flush the finished page, start a new one.
                    # NOTE(review): `wiki` stores the 0-based loop index while
                    # the file name uses i + 1 — confirm which one consumers expect.
                    writer.add_document(doc = doc, wiki = i, content = content)
                    content = []
                doc = a
                content += pre_process(c)

            # Flush the last page of the file; without this it is never indexed.
            if doc is not None:
                writer.add_document(doc = doc, wiki = i, content = content)

    elapsed = (time.perf_counter() - start)
    print("Time used:",elapsed)

wiki-001
Time used: 96.535419
wiki-002
Time used: 96.441498
wiki-003
Time used: 105.038993
wiki-004
Time used: 106.76213999999999
wiki-005
Time used: 119.11498499999999
wiki-006
Time used: 122.85691199999997
wiki-007
Time used: 127.23674899999992
wiki-008
Time used: 125.31959400000005
wiki-009
Time used: 126.13245300000005
wiki-010
Time used: 126.07707600000003
wiki-011
Time used: 125.63484399999993
wiki-012
Time used: 124.84989799999994
wiki-013
Time used: 123.9692030000001
wiki-014
Time used: 129.72792099999992
wiki-015
Time used: 137.61704499999996
wiki-016
Time used: 132.25347399999987
wiki-017
Time used: 127.50029300000006
wiki-018
Time used: 125.95057799999995
wiki-019
Time used: 124.26766199999975
wiki-020
Time used: 126.96140999999989
wiki-021
Time used: 128.54655700000012
wiki-022
Time used: 131.16999699999997
wiki-023
Time used: 130.47530099999994
wiki-024
Time used: 129.87069100000008
wiki-025
Time used: 126.85052300000007
wiki-026
Time used: 125.39572500000031
wiki-027
Time

In [59]:
# Normalise a hand-written claim with the same pre_process() used for the
# indexed text, then join the tokens into a space-separated query string.
manQuery = " ".join(pre_process("Slovenia has a sparse river network."))
print(manQuery)

slovenia sparse river network


In [61]:
from whoosh.qparser import QueryParser
from whoosh import scoring

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer.
start = time.perf_counter()

# Search the "content" field using TF-IDF scoring and print each hit's stored
# fields. The `with` block closes the searcher when done.
with ix.searcher(weighting=scoring.TF_IDF()) as searcher1:
    query = QueryParser("content", ix.schema).parse("slovenia river network")
    results = searcher1.search(query)
    for hit in results:
        print(hit)

elapsed = (time.perf_counter() - start)
print("Time used:",elapsed)

<Hit {'doc': 'Slovenia', 'wiki': 90}>
<Hit {'doc': 'Slovene_Hills', 'wiki': 89}>
<Hit {'doc': 'Dravinja', 'wiki': 30}>
<Hit {'doc': 'Krupa_-LRB-Lahinja-RRB-', 'wiki': 55}>
<Hit {'doc': 'List_of_A2_roads', 'wiki': 58}>
<Hit {'doc': 'Sann', 'wiki': 85}>
<Hit {'doc': 'Unica', 'wiki': 100}>
Time used: 8.155667999999423


In [None]:
from whoosh import scoring

# Same kind of lookup as the TF-IDF cell, but scored with BM25F.
with ix.searcher(weighting=scoring.BM25F()) as searcher2:
    # Parse the raw query text against the "content" field.
    parser = QueryParser("content", ix.schema)
    query = parser.parse("Soviet Cup")

    # Print the stored fields of every returned hit.
    results = searcher2.search(query)
    for result in results:
        print(result)

In [44]:
# Look up the stored fields of one page by its exact `doc` id.
# The searcher is opened in a `with` block so it is closed afterwards instead
# of being left open.
with ix.searcher() as searcher:
    print(list(searcher.documents(doc = 'A.I._Artificial_Intelligence')))

[{'doc': 'A.I._Artificial_Intelligence', 'wiki': 5}]
