In [1]:
import time
import random
import string
import os
import re

from parse_file import parse_file
from partial_index import partial_index
from WARC_html_information import WARC_html_information
from html.parser import HTMLParser
from index import Index

def get_tmp_dir_name():
    random.seed()
    return ''.join(random.choice(string.ascii_letters) for x in range(20))

def processing(_content: str, offset: int) -> partial_index:
    html_pro = WARC_html_information()
    html_pro.offset = offset
    html_pro.feed(_content)
    return html_pro.index

def build_index(_parse: parse_file):
    count = 0
    start_time_build = time.time()
    try:
        tmp_dir_name = "tmp/" + get_tmp_dir_name()
        print("create temp index directory: ", tmp_dir_name)
        os.mkdir(tmp_dir_name)
        
        while True:
            count += 1
            line = _parse.read_file()
            if line is not None:
                find_begin = re.compile("Content-Length: (\d+)\n\n").search(line.content)
                html = line.content[find_begin.span()[1]:]
                result = processing(html, find_begin.span()[1])
                result.dump(tmp_dir_name + "/" + str(count))
        
                if count % 1000 == 0:
                    print("waiting ", int(count/1000))
            else:
                break
                
        print("analysis document:")
        end_time = time.time()
        print(end_time - start_time_build, "s")
        print("Average: ", (end_time - start_time_build) * 1000 / count, "ms")
        print("DPS: ", count / (end_time - start_time_build), "ps")
        
        print("---------------------------------------------------------------------------------------")
        print("build index")
        start_time_index = time.time()
        idx = Index()
        for i in range(1, count):
            if count % 500 == 0:
                gc.collect()
            idx.read_partial_index(i, partial_index.read(tmp_dir_name + "/" + str(i)))
            os.remove(tmp_dir_name + "/" + str(i))
        os.rmdir(tmp_dir_name)
        end_time_index = time.time()
        print(end_time_index - start_time_index, "s")
        print("Average: ", (end_time_index - start_time_index) * 1000 / count, "ms")
        print("DPS: ", count / (end_time_index - start_time_index), "ps")
    except KeyboardInterrupt:
        print("stop")
        return count
    return count, idx

print("please enter the filename which you want to read:")
filename = input()

print("please enter the function you want to use: (stemming, stopword, case_folding, None)")
print("use space to seperate each other")
print("if you enter the None, any function will not be run")
instruction = input().split()

WARC_html_information.stemming = False
WARC_html_information.case_folding = False    # convert the word to lower case
WARC_html_information.stopword_remove = False

if "stemming" in instruction:
    WARC_html_information.stemming = True
if "case_folding" in instruction:
    WARC_html_information.case_folding = True
if "stopword" in instruction:
    WARC_html_information.stopword_remove = True
if "None" in instruction:
    WARC_html_information.stemming = False
    WARC_html_information.case_folding = False
    WARC_html_information.stopword_remove = False

file = parse_file(filename)    # read file
file.read_file()
start_time = time.time()
print("---------------------------------------------------------------------------------------")
print("start!!!")
count, index = build_index(file)
index.write_file(filename + "_index")
print("finish!!!")
print("---------------------------------------------------------------------------------------")
print("total time analysis:")
print(time.time() - start_time, "s")
print("Average", (time.time() - start_time) * 1000 / count, "ms")
print("DPS", count / (time.time() - start_time), "ps")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


please enter the filename which you want to read:
11.warc
please enter the function you want to use: (stemming, stopword, case_folding, None)
use space to seperate each other
if you enter the None, any function will not be run
None
---------------------------------------------------------------------------------------
start!!!
create temp index directory:  tmp/eBJhOkmnGsXvjGmINIQH
analysis document:
0.04263639450073242 s
Average:  10.659098625183105 ms
DPS:  93.81656321646257 ps
---------------------------------------------------------------------------------------
build index
0.01140284538269043 s
Average:  2.8507113456726074 ms
DPS:  350.7896376523721 ps
finish!!!
---------------------------------------------------------------------------------------
total time analysis:
0.06881070137023926 s
Average 17.202675342559814 ms
DPS 57.18069439379428 ps
