In [1]:
import csv
import json
import math
import re
import numpy as np

import nltk
import pandas as pd
from Index import Index
from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer, WordNetLemmatizer

from rank_bm25 import BM25Okapi

# Data Extraction

In [2]:
def read(f):
    """Return the full text obtained by concatenating every line yielded by ``f``.

    Args:
        f: An open text file, or any iterable of strings.

    Returns:
        A single string with all lines joined in order; ``''`` when ``f``
        yields nothing.
    """
    # One join instead of repeated ``+=``, which re-copies the growing string
    # on every iteration.
    return ''.join(f)

In [3]:
# Load the whole CISI document file into memory. A forward slash is used as
# the separator because Python accepts it on Windows and POSIX alike, whereas
# a backslash only works as a separator on Windows.
with open('cisi/CISI.ALL', 'r') as f:
    content = read(f)

# Titles: the text captured after a ".T" line, up to the next line starting with ".A".
titles = [m.group(1) for m in re.finditer(r"\.T[^\.\w]*?\n((.|\n)*?)\n\.A", content)]
# Abstracts: the text captured after a ".W" line, up to the next line starting with ".X".
abstracts = [m.group(1) for m in re.finditer(r"\.W.*?\n((.|\n)*?)\n\.X", content)]

In [4]:
# Load the CISI query file. A forward slash is used as the separator because
# Python accepts it on Windows and POSIX alike.
with open('cisi/CISI.QRY', 'r') as f:
    content = read(f)

# Query text: whatever follows a ".W" line, up to the next line starting with ".".
queries = [m.group(1) for m in re.finditer(r"\.W[^\.\w]*?\n((.|\n)*?)\n\.", content)]

In [5]:
# Key each raw query text by its 0-based position in extraction order.
# NOTE(review): these keys start at 0; confirm how they are mapped to the
# query ids used in the relevance file before comparing against ground truth.
raw_queries = dict(enumerate(queries))

# Forward slash keeps the output path valid on both Windows and POSIX.
with open('DS/raw_queries.json', 'w', encoding='utf-8') as f:
    json.dump(raw_queries, f, ensure_ascii=False, indent=4)

In [6]:
# One text per document: the title and the abstract at the same position,
# separated by a single space. Pairing is positional, so it stops at the
# shorter of the two lists.
docs = [' '.join(pair) for pair in zip(titles, abstracts)]

In [11]:
# Key each merged document text by its 0-based position in extraction order.
# NOTE(review): these keys start at 0; confirm how they are mapped to the
# document ids used in the relevance file before comparing against ground truth.
raw_docs = dict(enumerate(docs))

# Forward slash keeps the output path valid on both Windows and POSIX.
with open('DS/raw_docs.json', 'w', encoding='utf-8') as f:
    json.dump(raw_docs, f, ensure_ascii=False, indent=4)

# Building Dataset

In [7]:
# Build the collection index from the merged title + abstract texts.
# `Index` is imported from the project-local `Index` module; its internals
# are not visible in this notebook.
index = Index(docs) 
# Run the indexing pipeline. Judging by the progress output printed below, it
# presumably tokenizes, removes empty (stop) words, and computes frequencies
# and weights -- confirm against the `Index` implementation.
index.process() 

Tokenizing...


100%|██████████| 1460/1460 [00:08<00:00, 174.22it/s]


Removing empty words...


100%|██████████| 1460/1460 [00:00<00:00, 3090.79it/s]


Getting frequencies...


100%|██████████| 1460/1460 [00:00<00:00, 47886.17it/s]


Combining...


1460it [00:06, 224.35it/s]


Getting weights...


100%|██████████| 1460/1460 [00:01<00:00, 1048.92it/s]


Combining...


1460it [00:07, 198.47it/s]


## Index and Inverted Index

In [28]:
# Persist `index.index` as indented UTF-8 JSON (non-ASCII characters are
# written as-is). Forward slash keeps the path valid on Windows and POSIX.
with open('DS/index.json', 'w', encoding='utf-8') as f:
    json.dump(index.index, f, ensure_ascii=False, indent=4)

In [29]:
# Persist `index.inverted` as indented UTF-8 JSON (non-ASCII characters are
# written as-is). Forward slash keeps the path valid on Windows and POSIX.
# NOTE(review): presumably this is the token -> documents view of the index;
# confirm against the `Index` implementation.
with open('DS/inverted.json', 'w', encoding='utf-8') as f:
    json.dump(index.inverted, f, ensure_ascii=False, indent=4)

## CSV File

In [30]:
# Flatten the nested index into rows of [document, token, entry[0], entry[1]];
# the two entry fields are labelled Frequency and Weight when written to CSV.
data = []
for doc, postings in index.index.items():
    for token, entry in postings.items():
        first, second = entry[0], entry[1]
        data.append([doc, token, first, second])

In [31]:
# Write one CSV row per (document, token) pair, preceded by a header row.
# UTF-8 is set explicitly, matching the JSON exports, instead of relying on
# the platform default encoding. Forward slash keeps the path portable.
with open("DS/dataset.csv", "wt", newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp, delimiter=",")
    writer.writerow(['Document', 'Token', 'Frequency', 'Weight'])
    writer.writerows(data)

## Queries

In [32]:
def filter(token):
    """Normalize a single token: Porter-stem it, then WordNet-lemmatize the stem.

    Note that this name shadows the built-in ``filter`` for the rest of the
    notebook; it is kept because ``tokenize`` calls it under this name.

    Args:
        token: The token string to normalize.

    Returns:
        The lemmatized form of the stemmed token.
    """
    stem = PorterStemmer().stem(token)
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(stem)

In [33]:
def tokenize(docs, regex=r'(?:[A-Za-z]\.)+|\d+(?:\.\d+)?%?|\w+(?:\-\w+)*'):
    """Split each text into normalized tokens with English stop words removed.

    Every text is tokenized with ``regex``, each token is passed through
    ``filter`` (stem + lemmatize), tokens found in NLTK's English stop-word
    list are dropped, and the survivors are lower-cased.

    Args:
        docs: Iterable of text strings.
        regex: Pattern handed to ``nltk.RegexpTokenizer``. The default is a
            raw string so that ``\\.``, ``\\d``, ``\\w`` and ``\\-`` are not
            treated as (invalid) string escape sequences; the pattern itself
            is unchanged.

    Returns:
        A list with one list of tokens per input text, in input order.
    """
    # Separate local name: the ``regex`` parameter keeps holding the pattern.
    tokenizer = nltk.RegexpTokenizer(regex)
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # NOTE(review): stop words are matched against the *normalized* token, so
    # a stop word whose stem is spelled differently is presumably not removed.
    # The order is left as-is so query processing stays in step with whatever
    # `Index.process` does -- confirm before changing it.
    tokens_lists = []
    for txt in docs:
        normalized = [filter(t) for t in tokenizer.tokenize(txt)]
        tokens_lists.append([t.lower() for t in normalized if t not in stop_words])
    return tokens_lists

In [34]:
# Tokenize every raw query, then reduce each token list to its sorted unique
# tokens (np.unique both de-duplicates and sorts).
# Caution: `queries` is rebound from raw strings to token lists here, so this
# cell is not meant to be executed twice in a row.
queries = [list(np.unique(tokens)) for tokens in tokenize(queries)]

In [35]:
# Re-key the per-query token lists by their 0-based position.
queries = dict(enumerate(queries))

In [36]:
# Persist the processed queries as indented UTF-8 JSON.
# Forward slash keeps the path valid on both Windows and POSIX.
with open('DS/queries.json', 'w', encoding='utf-8') as f:
    json.dump(queries, f, ensure_ascii=False, indent=4)

## Ground truth

In [37]:
# Read the relevance judgements inside a context manager so the file handle
# is closed deterministically. Forward slashes keep both paths portable.
with open('cisi/CISI.REL', 'r') as f:
    # Keep only the first two whitespace-separated fields of every line;
    # they are written out under the 'Query' and 'Relevent document' headers.
    truth = [line.split()[:2] for line in f]

with open("DS/ground_truth.csv", "wt", newline='') as fp:
    writer = csv.writer(fp, delimiter=",")
    # The header text is kept verbatim (including its spelling) because
    # readers of this CSV may select the column by name.
    writer.writerow(['Query', 'Relevent document'])
    writer.writerows(truth)