In [1]:
import nltk
import math
import string

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#from nltk.stem.snowball import SnowballStemmer
#nltk.download()

from collections import Counter
from operator import itemgetter

Parsing

In [2]:
index_marker = ".I"
header_marker = ".T"
authors_marker = ".A"
meta_marker = ".B"
annotation_marker = ".W"

index_type = "index"
header_type = "header"
authors_type = "authors"
meta_type = "meta"
annotation_type = "annotation"
unknown_type = "unknown"

line_types = {index_marker:"index", header_marker:"header", authors_marker:"authors", meta_marker:"meta", annotation_marker:"annotation"}

def get_line_type(line):
    if line.startswith(index_marker):
        return line_types[index_marker]
    elif line.startswith(header_marker):
        return line_types[header_marker]
    elif line.startswith(authors_marker):
        return line_types[authors_marker]
    elif line.startswith(meta_marker):
        return line_types[meta_marker]
    elif line.startswith(annotation_marker):
        return line_types[annotation_marker]
    else:
        return unknown_type


text_file_name = "data/cran.all.1400"
documents = []
with open(text_file_name, 'r') as file:
    docId = 0
    header = ""
    annotation = ""
    
    is_header = False
    is_annotation = False
    
    for line in file:
        line_type = get_line_type(line)
        if line_type == index_type:
            is_header = False
            is_annotation = False
            
            if (docId):
                document = (docId, header, annotation)
                documents.append(document)
            
            docId = line.split()[1]
            header = ""
            annotation = ""
        elif line_type == header_type:
            is_header = True
            is_annotation = False
        elif line_type == annotation_type:
            is_header = False
            is_annotation = True
        elif line_type == authors_type or line_type == meta_type:
            is_header = False
            is_annotation = False
        elif line_type == unknown_type:
            if is_header:
                header += line
            elif is_annotation:
                annotation += line
                
    document = (docId, header, annotation)
    documents.append(document)
    
    
queries_file_name = "data/cran.qry"
queries = []
with open(queries_file_name, 'r') as file:
    queryId = 0
    text = ""
    
    is_text = False
    
    for line in file:
        line_type = get_line_type(line)
        if line_type == index_type:
            is_text = False
            
            if (queryId):
                query = (queryId, text)
                queries.append(query)
            
            queryId = line.split()[1]
            text = ""
        elif line_type == annotation_type:
            is_annotation = True
        elif line_type == unknown_type:
            text += line
    
    query = (queryId, text)
    queries.append(query)

Normalize and build index

In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuation = [c for c in string.punctuation]


def normalize_text(text):
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word not in punctuation]
    return words
    

def build_index(documents, index_by):
    docs = dict()
    index = dict()
    
    average_doc_lenght = 0
    
    for document in documents:
        text = document[index_by]
        normalized_text = normalize_text(text)
        
        doc_lenght = len(normalized_text)
        average_doc_lenght += doc_lenght
        
        docId = document[0]
        docs[docId] = doc_lenght
            
        words_in_doc = Counter(normalized_text)
        for word, count in words_in_doc.items():
            if word not in index:
                index[word] = []
            index[word].append((docId, count / doc_lenght))
            
    average_doc_lenght /= len(documents)
    
    return (index, docs, average_doc_lenght)


# Index by header
header_index, header_docs, header_avg_doc_lenght = build_index(documents, 1)

# Index by annotation
annotation_index, annotation_docs, annotation_avg_doc_lenght = build_index(documents, 2)

Search

In [41]:
def search(query, index, docs_index, doc_lenght_avg):
    search_terms = normalize_text(query)
    
    hits = dict()
    for search_term in search_terms:
        if search_term in index:
            hits[search_term] = index[search_term]
            
    found_docs = dict()
    for search_term in hits:
        doc_tf = hits[search_term]
        for docId, tf in doc_tf:
            if docId not in found_docs:
                found_docs[docId] = []
            hit = (search_term, tf)
            found_docs[docId].append(hit)
    
    N = len(docs_index)
    k1 = 1.2
    b = 0.75
    result = []
    for docId in found_docs:
        rsv = 0
        Ld = docs_index[docId]
        for search_term in search_terms:
            Nt = len(index.get(search_term, []))
            tf = 0
            terms_found_in_doc = found_docs[docId]
            for search_term_in_doc, found_tf in terms_found_in_doc:
                if (search_term_in_doc == search_term):
                    tf = found_tf
            if (docId == "875"):
                print(search_term, Nt, tf)
            rsv += math.log(1 + (N - Nt + 0.5) / (Nt + 0.5)) * tf * (k1 + 1) / (k1 * ((1 - b) + b * Ld / doc_lenght_avg) + tf)
        result.append((docId, rsv))
        
    result.sort(key=itemgetter(1), reverse=True)
    return result[:10]
    
    
#search("what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .", header_index, header_docs, header_avg_doc_lenght)

Searching

In [42]:
output_file_name = "data/header_search_results"

with open(output_file_name, 'w') as file:
    for idx, query in enumerate(queries):
        search_result = search(query[1], header_index, header_docs, header_avg_doc_lenght)
        break
        
        for docId, _ in search_result:
            file.write("{} {}\n".format(idx + 1, docId))

similarity 8 0
law 9 0
must 0 0
obeyed 0 0
constructing 0 0
aeroelastic 4 0.3333333333333333
model 32 0.3333333333333333
heated 7 0
high 53 0
speed 110 0
aircraft 26 0
