In [1]:
# Display only the value of the last expression in each cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [6]:
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd


In [7]:
# Fetch the NLTK data packages this notebook uses: the English stopword list,
# the Punkt tokenizer models, and WordNet. Each package is requested once.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Wen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [8]:
# Open the passage collection as UTF-8 text; the handle `f` is consumed by the following cell.
f = open("dataset/passage_collection_new.txt", 'r', encoding = 'utf-8') 

In [9]:
# Read the whole collection into memory. Using the open handle as a context
# manager closes it as soon as the read finishes (or fails), so the file
# descriptor is not left open for the lifetime of the kernel.
with f:
    document = f.read()

In [10]:
# Column labels are supplied explicitly via `names`, so the first line of the
# TSV is read as data rather than as a header.
header_list = ["qid", "pid", "query", "passage"]
candidate_passages_top1000 = pd.read_csv(
    "dataset/candidate_passages_top1000.tsv",
    sep='\t',
    names=header_list,
)

In [11]:
# Column labels are supplied explicitly via `names`, so the first line of the
# TSV is read as data rather than as a header.
header_list = ["qid", "query"]
test_queries = pd.read_csv(
    "dataset/test-queries.tsv",
    sep='\t',
    names=header_list,
)

# Preprocessing

## Lowercase

In [12]:
# Lowercase the entire collection text before tokenizing.
# NOTE: rebinds `document`, so the original-case text is no longer available after this cell.
document = document.lower()

## Tokenizing

In [13]:

# Tokenize the lowercased collection with NLTK's word tokenizer.
tokens = nltk.word_tokenize(document)


## Remove Punctuation

In [14]:
def remove_punctuation(tokens):
    """Strip punctuation from every token and discard tokens left empty.

    Any character that is neither a regex word character nor whitespace is
    deleted from each token. Tokens that become the empty string (for
    example a lone comma) are dropped; all others are kept in order.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list of the cleaned, non-empty tokens.
    """
    stripped = (re.sub(r'[^\w\s]', '', raw) for raw in tokens)
    return [cleaned for cleaned in stripped if cleaned != '']

In [15]:
# Apply punctuation stripping to the collection tokens (rebinds `tokens`).
tokens = remove_punctuation(tokens)

## Remove Stopwords

In [16]:
# Stopword sets keyed by language, filled on first use. Building the set
# inside remove_stopwords() on every call would reload the NLTK stopword list
# each time, and preprocessing() calls remove_stopwords() once per passage.
_STOPWORD_SETS = {}


def remove_stopwords(tokens):
    """Return the tokens that are not English stopwords, in original order.

    Tokens are compared as-is against NLTK's English stopword list, so the
    caller is responsible for lowercasing beforehand.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list containing only the non-stopword tokens.
    """
    stopword_set = _STOPWORD_SETS.get('english')
    if stopword_set is None:
        # First call: load the list once and keep it for later calls.
        stopword_set = _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    return [token for token in tokens if token not in stopword_set]


In [17]:

# Drop English stopwords from the collection tokens (rebinds `tokens`).
tokens = remove_stopwords(tokens)

## Lemmatization

In [18]:
def lemmatize_verbs(tokens):
    """Lemmatize every token as a verb using WordNet.

    Only the verb part of speech (pos='v') is applied; no noun or adjective
    pass is run.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list, in input order, with one lemmatized string per token.
    """
    as_verb = WordNetLemmatizer().lemmatize
    return [as_verb(token, pos='v') for token in tokens]

In [19]:

# Verb-lemmatize the collection tokens (rebinds `tokens`).
tokens = lemmatize_verbs(tokens)

## Remove numbers

In [20]:
def remove_numbers(tokens):
    """Drop every token that consists solely of digit characters.

    Numeric tokens are removed outright; they are not converted to a textual
    representation. A token is removed when ``str.isdigit()`` is true for it,
    so mixed tokens such as "2nd" and the empty string are kept.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list of the non-numeric tokens, in their original order.
    """
    return [token for token in tokens if not token.isdigit()]


In [21]:
# Drop all-digit tokens from the collection tokens (rebinds `tokens`).
tokens = remove_numbers(tokens)

In [22]:
def preprocessing(passage):
    """Turn one raw passage string into its list of normalized tokens.

    The text is lowercased and tokenized with nltk.word_tokenize, then passed
    through the cleaning stages in this fixed order: punctuation removal,
    stopword removal, verb lemmatization, number removal.

    Args:
        passage: the passage text as a string.

    Returns:
        The list of tokens remaining after all stages.
    """
    tokens = nltk.word_tokenize(passage.lower())
    # Order matters: each stage consumes the previous stage's output.
    for stage in (remove_punctuation, remove_stopwords, lemmatize_verbs, remove_numbers):
        tokens = stage(tokens)
    return tokens

# Inverted Index

## Extracting Pid and Passages

In [23]:
# Display the candidate table (qid, pid, query, passage) to inspect what was loaded.
candidate_passages_top1000

Unnamed: 0,qid,pid,query,passage
0,494835,7130104,"sensibilities, definition",This is the definition of RNA along with examp...
1,1128373,7130104,iur definition,This is the definition of RNA along with examp...
2,131843,7130104,definition of a sigmet,This is the definition of RNA along with examp...
3,20455,7130335,ar glasses definition,Best Answer: The AR designation comes from the...
4,719381,7130335,what is ar balance,Best Answer: The AR designation comes from the...
...,...,...,...,...
189872,1056204,79980,who was the first steam boat operator,Other operators with special formats accept mo...
189873,1132213,7998257,how long to hold bow in yoga,You may be surprised that to learn that yoga t...
189874,324211,7998651,how much money a united airline get as a capta...,Find cheap airline tickets & deals on flights ...
189875,1116341,7998709,closed ended mortgage definition,"What is a wrap-around mortgage, and who is it ..."


In [24]:
# Map each passage id to its preprocessed token list. The same pid can appear
# on several rows (once per query it is a candidate for), so the membership
# check comes first and each distinct passage is preprocessed only once.
passage_dict = {}
for pid, passage_text in zip(candidate_passages_top1000['pid'],
                             candidate_passages_top1000['passage']):
    if pid not in passage_dict:
        passage_dict[pid] = preprocessing(passage_text)

    

In [25]:
# Number of distinct passage ids collected.
len(passage_dict)

182469

In [26]:
# Postings map: term -> list of (pid, term frequency) tuples.
inverted_index = {}

# term_frequency = frequency_dict[term].frequency if term in frequency_dict else 0

In [27]:
# Total number of tokens left in the collection after preprocessing.
len(tokens)

5889367

In [28]:
# Collapse the token stream to its distinct terms (the vocabulary) and show its size.
tokens_no_dup = set(tokens)
len(tokens_no_dup)

172464

In [29]:
# Spot-check a single passage: print each distinct term it contains, then the
# number of times 'rna' occurs in it (if it occurs at all).
passage = passage_dict[7130104]
freqDist = nltk.FreqDist(passage)

for term in freqDist:
    print(term)

if 'rna' in freqDist:
    print(freqDist['rna'])

rna
definition
along
examples
type
molecules
5


In [48]:
# def get_vocabulary(data):
#     tokens = []
#     for token_list in data.values():
#         tokens = tokens + token_list
# #     print("tokens:", tokens)
#     fdist = nltk.FreqDist(tokens)
# #     print("fdist:", fdist.items())
#     return list(fdist.keys())

In [None]:
# check = get_vocabulary(passage_dict)

In [None]:
import time

start = time.perf_counter()

# Build the index in a single pass over the passages: each passage's term
# frequencies are counted once, and every term that belongs to the collection
# vocabulary (tokens_no_dup) gets a (pid, frequency) posting appended.
# Starting from an empty dict keeps the cell safe to re-run without
# duplicating postings.
inverted_index = {}
for pid, passage in passage_dict.items():
    for token, frequency in nltk.FreqDist(passage).items():
        if token in tokens_no_dup:
            inverted_index.setdefault(token, []).append((pid, frequency))

end = time.perf_counter()
print(end - start)

In [32]:
# Number of distinct terms that have at least one posting in the index.
len(inverted_index)

15424