In [1]:
import re
import json
import csv
from Index import Index 

# Data Extraction

In [2]:
def read(f):
    """Return the full text of ``f`` as a single string.

    Args:
        f: An iterable of strings, typically an open text-mode file, whose
            items are concatenated in the order they are yielded.

    Returns:
        The concatenation of every item yielded by ``f``; ``''`` when ``f``
        yields nothing.
    """
    # One join instead of repeated ``+=`` avoids re-copying the accumulated
    # text for every line read.
    return ''.join(f)

In [3]:
# Load the whole document collection into memory. A forward slash is used in
# the relative path because it is accepted on Windows and POSIX alike, whereas
# a backslash is an ordinary filename character outside Windows.
with open('cisi/CISI.ALL', 'r') as f:
    content = read(f)
# Titles: the text between a ".T" marker line and the next line starting
# with ".A".
titles = [m.group(1) for m in re.finditer(r"\.T[^\.\w]*?\n((.|\n)*?)\n\.A", content)]
# Abstracts: the text between a ".W" marker line and the next line starting
# with ".X".
abstracts = [m.group(1) for m in re.finditer(r"\.W.*?\n((.|\n)*?)\n\.X", content)]

In [4]:
# Load the query file. A forward slash is used in the relative path because it
# is accepted on Windows and POSIX alike, whereas a backslash is an ordinary
# filename character outside Windows.
with open('cisi/CISI.QRY', 'r') as f:
    content = read(f)
# Queries: the text between a ".W" marker line and the next line that starts
# with a dot.
queries = [m.group(1) for m in re.finditer(r"\.W[^\.\w]*?\n((.|\n)*?)\n\.", content)]

In [5]:
# Sanity check: how many titles, abstracts and queries were extracted.
tuple(len(items) for items in (titles, abstracts, queries))

(1460, 1460, 112)

In [6]:
# zip() stops at the shorter input, so a count mismatch would silently drop
# entries and pair titles with the wrong abstracts; fail loudly instead.
if len(titles) != len(abstracts):
    raise ValueError(
        "extracted %d titles but %d abstracts" % (len(titles), len(abstracts))
    )
# One text per document: its title and abstract joined by a single space.
docs = [title + ' ' + abstract for title, abstract in zip(titles, abstracts)]

In [7]:
# Number of documents kept for indexing. In the recorded run the weighting
# stage alone took about a minute and a half for 100 documents, so the cap is
# presumably there to keep the run time manageable -- raise it with care.
MAX_DOCS = 100
docs = docs[:MAX_DOCS]

# Building Dataset

In [8]:
# Build the project's Index object over the selected documents.
index = Index(docs) 
# Run the processing pipeline. The recorded output shows four stages:
# tokenizing, removing empty words, getting frequencies and getting weights.
index.process() 

Tokenizing...


100%|██████████| 100/100 [00:02<00:00, 43.68it/s]


Removing empty words...


100%|██████████| 100/100 [00:00<00:00, 2474.38it/s]


Getting frequencies...


100%|██████████| 100/100 [00:00<00:00, 50087.22it/s]


Getting weights...


100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


## Index and Inverted Index

In [9]:
# Presumably populates `index.index`, which later cells read and export --
# confirm against the Index class.
index.get_index()

In [10]:
# Presumably populates `index.inverted`, which a later cell exports to JSON --
# confirm against the Index class.
index.get_inverted()

In [None]:
# Displays the whole `index.index` structure as the cell output.
# NOTE(review): this dumps every entry; consider previewing only a few.
index.index

In [None]:
# Displays the whole `index.inverted` structure as the cell output.
# NOTE(review): this dumps every entry; consider previewing only a few.
index.inverted

In [13]:
# Save `index.index` as indented UTF-8 JSON, keeping non-ASCII characters
# unescaped.
with open('index.json', 'w', encoding='utf-8') as index_file:
    json.dump(
        index.index,
        index_file,
        ensure_ascii=False,
        indent=4,
    )

In [14]:
# Save `index.inverted` as indented UTF-8 JSON, keeping non-ASCII characters
# unescaped.
with open('inverted.json', 'w', encoding='utf-8') as inverted_file:
    json.dump(
        index.inverted,
        inverted_file,
        ensure_ascii=False,
        indent=4,
    )

## CSV File

In [15]:
# Flatten the index into rows: each entry stored under a key becomes one row
# with that key prepended.
data = [
    [doc_key] + entry
    for doc_key in list(index.index.keys())
    for entry in index.index[doc_key]
]

In [16]:
# Write the header row followed by the rows collected in `data`.
# The encoding is set explicitly so the file matches the UTF-8 JSON exports
# and does not depend on the platform's default encoding; newline='' leaves
# line-ending handling to the csv module.
with open("dataset.csv", "wt", newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp, delimiter=",")
    writer.writerow(['Document', 'Token', 'Frequency', 'Weight'])
    writer.writerows(data)