In [1]:
import re
import os
import sys
import math
import pickle
from peewee import *
from tqdm import tqdm
from collections import Counter
from nltk.tokenize import RegexpTokenizer

# Directory that rename() walks to find the raw book files.
PATH = r"/root/course/cs332/gutenberg/book"
# Directory holding the extracted book bodies, one file per book named by its
# sequential id ("1", "2", ...).
NEW_PATH = r"/root/course/cs332/gutenberg/contents"
# Directory holding the pickled partial indexes ("<n>.txt") written while indexing.
CHUNK_PATH = r"/root/course/cs332/gutenberg/chunks"
# Directory where the final "inverted_index" and "file_len" pickles are stored.
BASE_PATH = r"/root/course/cs332/gutenberg/"

# Merged inverted index: term -> {doc_id: term frequency}.
token_dic = dict()
# In-memory partial index with the same shape, flushed to CHUNK_PATH when large.
chunk_token = dict()
# 1-based number of the chunk file the current partial index will be written to.
chunk_count = 1

# Number of documents N used in the idf computation (N / document frequency).
# NOTE(review): hard-coded; presumably the number of books extracted by rename() - confirm.
file_count = 2661
# doc_id -> sum of squared tf-idf weights of the document (its squared vector length).
file_length = dict()

# The connection settings below are placeholders and must be filled in before running.
gutenberg_db = PostgresqlDatabase('<data base name>', user='<user name>', password='<user password>',
                           host='<host name>', port='<port number>')

class BaseModel(Model):
    """Base model whose ``Meta`` points at the ``gutenberg_db`` PostgreSQL database.

    The models in this notebook derive from it instead of repeating the
    database setting.
    """
    class Meta:
        # Database handle defined at module level.
        database = gutenberg_db

class Book(BaseModel):
    """One extracted book, as saved by ``rewrite_file``."""
    # Text captured after the "Author: " header of the source file.
    author = CharField()
    # Text captured after the "Title: " header, plus the following line when
    # that line contains letters.
    title = TextField()
    # Raw bytes of the extracted book body (read back from the written file in "rb" mode).
    context = BlobField()

In [4]:
'''
========================
    DATA PREPARATION
========================

In this stage, the content of each book is extracted and stored in the database.
'''

# Create the database table for the Book model before any rows are saved.
Book.create_table()

def rewrite_file(doc, new_doc, doc_idx) -> bool:
    """Extract the body of one book file, write it to ``new_doc`` and save it as a ``Book``.

    The source file is read once, read-only, as ISO-8859-1. Its ``Title:`` and
    ``Author:`` headers are parsed, the lines between the ``*** START OF`` and
    ``*** END OF`` marker lines are written to ``new_doc``, and the bytes of
    that new file are stored in a ``Book`` row.

    Args:
        doc: Path of the raw book file.
        new_doc: Path the extracted body is written to.
        doc_idx: Sequential id chosen by the caller; not used by this function.

    Returns:
        True when the book was written and saved. False when the title, the
        author, or either marker line is missing; in that case nothing is
        written and nothing is saved.
    """
    # The source is only ever read, so plain "r" is enough ("r+" would demand
    # write permission on the raw corpus). One read serves every step below.
    with open(doc, "r", encoding="ISO-8859-1") as _file:
        _lines = _file.readlines()
    _content = "".join(_lines)

    _pattern = re.search(r"Title: (.*)\n(.*)", _content)
    if _pattern is None:
        return False
    _title = _pattern.group(1)
    # The line after "Title:" is appended when it contains letters
    # (presumably a wrapped title - TODO confirm).
    if any(c.isalpha() for c in _pattern.group(2)):
        _title += " " + _pattern.group(2).strip()

    _pattern = re.search(r"Author: (.*)", _content)
    if _pattern is None:
        return False
    _author = _pattern.group(1)

    begin, end = -1, -1
    text_enum = enumerate(_lines)
    for i, line in text_enum:
        if re.search(r"(^\*\*\*START OF (.*)$|^\*\*\* START OF (.*)$)", line) is not None:
            # Skip ten lines past the START marker
            # (presumably boilerplate that follows it - TODO confirm).
            begin = i + 10
            break
    # Same iterator on purpose: the END search resumes right after the START line.
    for i, line in text_enum:
        if re.search(r"(^\*\*\*END OF (.*)$|^\*\*\* END OF (.*)$)", line) is not None:
            # Stop three lines before the END marker.
            # NOTE(review): an END marker on line index 2 yields end == -1, which is
            # indistinguishable from "not found" below - confirm this is acceptable.
            end = i - 3
            break
    if begin == -1 or end == -1:
        return False

    with open(new_doc, "w", encoding="ISO-8859-1") as _file:
        _file.writelines(_lines[begin:end])

    # Read the file back in binary so the stored blob is exactly what is on disk.
    with open(new_doc, "rb") as _file:
        data = _file.read()
    book = Book(author=_author, title=_title, context=data)
    book.save()
    return True

    
def rename():
    """Walk ``PATH`` and extract every book into ``NEW_PATH`` under a sequential id.

    Directories and files are visited in sorted order. Each file is passed to
    ``rewrite_file`` together with the target path ``NEW_PATH/<i>``; ``i`` only
    advances when the extraction succeeds, so the written files are numbered
    1, 2, 3, ... without gaps.

    Prints the final counters. Both start at 1, so ``i`` is one more than the
    number of extracted books and ``j`` is one more than the number of files
    visited.
    """
    # rewrite_file opens its target for writing, so the folder has to exist.
    os.makedirs(NEW_PATH, exist_ok=True)
    i = 1  # id the next successfully extracted book will receive
    j = 1  # one more than the number of files visited so far
    for root, dirs, files in os.walk(PATH, topdown=True):
        dirs.sort()  # sorting in place fixes the order os.walk descends in
        for filename in tqdm(sorted(files)):
            abs_path = os.path.join(root, filename)
            abs_new = os.path.join(NEW_PATH, str(i))
            if rewrite_file(abs_path, abs_new, i):
                i += 1
            j += 1
    print(f"i={i}, j={j}")

# Run the extraction over the whole corpus (the recorded run below took about four hours).
rename()

100%|██████████| 2714/2714 [4:02:23<00:00,  5.36s/it]   
100%|██████████| 1/1 [00:01<00:00,  1.84s/it]
100%|██████████| 1/1 [00:06<00:00,  6.29s/it]
100%|██████████| 1/1 [00:02<00:00,  2.84s/it]
100%|██████████| 1/1 [00:06<00:00,  6.28s/it]
100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
100%|██████████| 1/1 [00:03<00:00,  3.22s/it]
100%|██████████| 1/1 [00:15<00:00, 15.48s/it]
100%|██████████| 1/1 [00:04<00:00,  4.56s/it]
100%|██████████| 1/1 [00:07<00:00,  7.17s/it]
100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
100%|██████████| 1/1 [00:08<00:00,  8.17s/it]
100%|██████████| 1/1 [00:08<00:00,  8.37s/it]
100%|██████████| 1/1 [00:02<00:00,  2.62s/it]
100%|██████████| 1/1 [00:08<00:00,  8.38s/it]
100%|██████████| 1/1 [00:05<00:00,  5.16s/it]
100%|██████████| 1/1 [00:00<00:00, 1728.18it/s]
100%|██████████| 1/1 [00:00<00:00, 1258.04it/s]
100%|██████████| 1/1 [00:00<00:00, 1339.18it/s]

i=2662, j=2733





In [2]:
'''
===================================
    INVERTED INDEX CONSTRUCTION
===================================

In this stage, the inverted index list is going to be built and stored in a byte file.
'''

def SPIMI_invert(doc, doc_id):
    """Tokenize one document and add its term counts to the in-memory chunk.

    The file is lower-cased and split into runs of ASCII letters. Each token
    increments ``chunk_token[token][doc_id]``. Before a token is added, the
    chunk is pickled to ``CHUNK_PATH/<chunk_count>.txt`` and emptied if it has
    grown past the size threshold, and ``chunk_count`` is advanced. A single
    document may therefore be spread over several chunk files.

    Args:
        doc: Path of the extracted book text (ISO-8859-1).
        doc_id: Id recorded for this document in the postings.
    """
    global chunk_token, chunk_count
    # The text is only read, so open it read-only rather than "r+".
    with open(doc, "r", encoding="ISO-8859-1") as _file:
        _content = _file.read().lower()
    word_tokenizer = RegexpTokenizer('[A-Za-z]+')
    for item in word_tokenizer.tokenize(_content):
        # sys.getsizeof reports the outer dict object only, not the nested
        # postings dicts, so this threshold tracks the number of distinct
        # terms rather than the total memory used.
        if sys.getsizeof(chunk_token) > 1000000:
            chunk_file = os.path.join(CHUNK_PATH, f"{chunk_count}.txt")
            with open(chunk_file, "wb") as _file:
                pickle.dump(chunk_token, _file)
            chunk_token.clear()
            chunk_count += 1
        postings = chunk_token.setdefault(item, dict())
        postings[doc_id] = postings.get(doc_id, 0) + 1

def SPIMI_merge():
    """Merge the chunk files ``1.txt`` .. ``<chunk_count>.txt`` into ``token_dic``.

    Each chunk is a pickled ``term -> {doc_id: count}`` dict. Terms that are
    new to ``token_dic`` are adopted as they are; for terms already present,
    the per-document counts are added together.
    """
    global token_dic
    for i in tqdm(range(chunk_count)):
        chunk_file = os.path.join(CHUNK_PATH, f"{i+1}.txt")
        # NOTE: unpickling can execute arbitrary code; only chunk files written
        # by this notebook should ever be placed in CHUNK_PATH.
        with open(chunk_file, "rb") as _file:
            _dict = pickle.load(_file)
        # Only terms that occur in this chunk can change the merged index, so
        # there is no need to walk the whole accumulated vocabulary per chunk.
        for item, postings in _dict.items():
            merged = token_dic.get(item)
            if merged is None:
                token_dic[item] = postings
                continue
            for doc_id, count in postings.items():
                merged[doc_id] = merged.get(doc_id, 0) + count

def tokenize():
    """Build the inverted index for every file in ``NEW_PATH``.

    Documents are fed to ``SPIMI_invert`` in id order, the last partial chunk
    is flushed to ``CHUNK_PATH``, and ``SPIMI_merge`` then combines all chunk
    files into ``token_dic``.
    """
    global chunk_count, chunk_token
    # Start from a clean slate: without this, re-running the cell would keep
    # the old chunk counter and add the new counts on top of the existing index.
    chunk_count = 1
    chunk_token.clear()
    token_dic.clear()
    os.makedirs(CHUNK_PATH, exist_ok=True)
    i = 1
    for root, dirs, files in os.walk(NEW_PATH, topdown=True):
        dirs.sort()
        for filename in tqdm(sorted(files)):
            # Files are addressed by their sequential id; the listing is only
            # used to know how many there are.
            abs_new = os.path.join(NEW_PATH, str(i))
            SPIMI_invert(abs_new, i)
            i += 1
    # Flush whatever is still in memory as the final chunk.
    chunk_file = os.path.join(CHUNK_PATH, f"{chunk_count}.txt")
    with open(chunk_file, "wb") as _file:
        pickle.dump(chunk_token, _file)
    chunk_token.clear()
    SPIMI_merge()

# Build the inverted index in memory (chunk files + merge into token_dic).
tokenize()

# Persist the merged index as a pickle so later cells can reload it from BASE_PATH.
with open(os.path.join(BASE_PATH, f"inverted_index"), "wb") as _file:
    pickle.dump(token_dic, _file)

100%|██████████| 2661/2661 [04:30<00:00,  9.83it/s]
100%|██████████| 365/365 [15:07<00:00,  2.49s/it]


In [3]:
'''
=============================
    FILE LENGTH RETRIEVAL
=============================

In this stage, the squared length of each document's tf-idf vector is computed and stored in a pickle file.
'''

file_count = 2661
def get_file_len():
    """Fill ``file_length`` with the squared tf-idf vector length of every document.

    For each term and each document containing it, the weight
    ``(1 + log10(tf)) * log10(file_count / df)`` is computed, where ``df`` is
    the number of documents in the term's postings. ``file_length[doc_id]``
    ends up as the sum of the squares of these weights; callers take the
    square root when they need the actual length.
    """
    global file_length
    # Reset first so that running the cell again does not add a second copy
    # of every weight on top of the previous result.
    file_length.clear()
    for item in tqdm(token_dic):
        postings = token_dic[item]
        # The idf factor depends on the term only, so compute it once per term.
        idf = math.log10(file_count / len(postings))
        for doc_id, tf in postings.items():
            weight = (1 + math.log10(tf)) * idf
            file_length[doc_id] = file_length.get(doc_id, 0) + pow(weight, 2)

# Compute the per-document squared vector lengths from the in-memory index ...
get_file_len()
# ... and persist them as a pickle next to the inverted index.
with open(os.path.join(BASE_PATH, f"file_len"), "wb") as _file:
    pickle.dump(file_length, _file)

100%|██████████| 518339/518339 [00:20<00:00, 25332.38it/s] 


In [2]:
'''
===========================
    INVERTED INDEX TEST
===========================

This stage tests the constructed inverted index.
'''

def cosine_score(_input) -> dict:
    """Score every document that shares a term with the query.

    Query and document terms are both weighted with
    ``(1 + log10(tf)) * log10(file_count / df)``. A document's score is the
    dot product of the two weight vectors divided by the document's vector
    length (the square root of ``file_length[doc_id]``). The query vector is
    not normalised, which does not change the ranking.

    Args:
        _input: List of query tokens; repeated tokens raise the query tf.

    Returns:
        Dict mapping doc_id to score for every document containing at least
        one query term. Query terms missing from ``token_dic`` are ignored.
    """
    score = dict()
    for item, query_tf in Counter(_input).items():
        postings = token_dic.get(item)
        if not postings:
            continue
        idf = math.log10(file_count / len(postings))
        # The query weight does not depend on the document, so compute it once.
        W_tq = (1 + math.log10(query_tf)) * idf
        for doc_id, tf in postings.items():
            W_td = (1 + math.log10(tf)) * idf
            score[doc_id] = score.get(doc_id, 0) + W_tq * W_td
    for doc_id in score:
        squared_len = file_length[doc_id]
        # A zero-length vector means every weight of the document is zero, so
        # its score is already zero; skip the division instead of raising.
        if squared_len > 0:
            score[doc_id] /= math.sqrt(squared_len)
    return score


# Rebind token_dic and file_length from the pickles stored under BASE_PATH.
# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook wrote itself.
with open(os.path.join(BASE_PATH, f"inverted_index"), "rb") as _file:
    token_dic = pickle.loads(_file.read())

with open(os.path.join(BASE_PATH, f"file_len"), "rb") as _file:
    file_length = pickle.loads(_file.read())

# Interactive query loop: read a line, print the ids of the ten best-scoring
# documents (best first), then a blank line. Typing "exitexit" as the first
# word ends the session.
while True:
    _input = input().lower().split()
    if not _input:
        # A blank line has no first token to inspect; just prompt again.
        continue
    if _input[0] == "exitexit":
        exit()
    result = cosine_score(_input)
    ranked = sorted(result.items(), key=lambda x: x[1], reverse=True)
    for doc_id, _doc_score in ranked[:10]:
        print(doc_id)
    print("")

hello world
1244
1325
756
556
678
205
900
2234
2397
1100



KeyboardInterrupt: Interrupted by user