In [None]:
import os
import re
from nltk.tokenize import sent_tokenize , word_tokenize
import nltk
from nltk.corpus import stopwords
#from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
import math
#Fetch the NLTK data packages used below: 'punkt' (tokenizer models) and
#'stopwords' (backs stopwords.words() a few lines down)
nltk.download('punkt')
nltk.download('stopwords')
import operator
import pickle
#English stop words as a set.
#NOTE(review): in the code visible here it is only referenced from a
#commented-out filter inside tokenize_text - confirm whether it is still needed
Stopwords = set(stopwords.words('english'))
#Single Porter stemmer instance shared by stem_text()
ps = PorterStemmer()

#Global variables
files_path = 'english-corpora/' #Directory in which corpus files are present
files = os.listdir(files_path)  #names of the entries in that directory (os.listdir gives no ordering guarantee)

#posting list: key=token, value=list of doc_ids of the documents containing the token
boolean_posting_list = {}
#posting list: key=token, value=list of (doc_id, frequency of the token in that document)
tf_posting_list = {}
#list indexed by doc_id; each entry is a dict {token: frequency of the token in that document}
document_tf_list = []
#dict: key=token, value=number of documents that contain the token (document frequency), used in the idf calculation
token_document_frequency = {}
#stores all unique tokens
vocabulary = set()
#key=filename,value = index(0,1,2)
filename_index_dict = {}
#key=index,value=filename
filename_index_inv_dict = {}
#list indexed by doc_id: number of tokens in each doc, counting repeated tokens
doc_len = []
#key=token,value=list of (doc_id,tfidf value)
tf_idf_list = {}


#boolean_posting_list has the form {token1:[doc1,doc2,doc5], token2:[doc1,doc10]}
def boolean_retrieval_system(token_list:[], doc_id: int):
    """Record doc_id in the boolean posting list of every distinct token.

    token_list: tokens of one document (duplicates are collapsed first, so
                each token receives doc_id once per call).
    doc_id:     integer id of the document the tokens came from.

    Mutates the module-level boolean_posting_list; returns None.
    """
    for term in set(token_list):
        #setdefault creates the posting list the first time a token is seen
        boolean_posting_list.setdefault(term, []).append(doc_id)
        
#tf_posting_list has the form {token1:[(doc1,cnt1),(doc2,cnt2)], token2:[(doc1,cnt1)]}
#where cnt1 is the number of times token1 appears in doc1
def tf_retrieval_system(token_list:[], doc_id: int):
    """Fold one document's term frequencies into the global tf structures.

    token_list: tokens of one document, duplicates included.
    doc_id:     integer id of the document; it is used as an index into
                document_tf_list right after one new entry is appended, so
                the code relies on ids being handed out as 0, 1, 2, ...

    Updates tf_posting_list, token_document_frequency and document_tf_list;
    returns None.
    """
    #term -> number of occurrences inside this document
    counts = {}
    for term in token_list:
        counts[term] = counts.get(term, 0) + 1

    #every call adds one per-document dict, even for an empty token list
    document_tf_list.append({})
    for term, occurrences in counts.items():
        tf_posting_list.setdefault(term, []).append((doc_id, occurrences))
        #one more document contains this term
        token_document_frequency[term] = token_document_frequency.get(term, 0) + 1
        document_tf_list[doc_id][term] = occurrences
#Characters that are turned into a single space: backslash, dot, space,
#hyphen, comma, colon, slash, semicolon, square brackets, parentheses,
#equals sign and underscore.
#Written as a raw string so the regex escapes reach `re` unchanged instead of
#being (invalid) string-literal escapes; compiled once at import time.
_SEPARATOR_PATTERN = re.compile(r'[\\. \-,:/;\[\]()=_]')


def clean_text(data: str)->str:
    """Strip punctuation, quotes and non-ASCII characters from raw text.

    data: raw document (or query) text.

    Returns the text with every separator character replaced by a space,
    single and double quotes removed, and all non-ASCII characters dropped.
    Characters are replaced one-for-one, so runs of separators become runs
    of spaces.
    """
    data = _SEPARATOR_PATTERN.sub(' ', data)
    #quotes are deleted rather than replaced, so "don't" becomes "dont"
    data = data.replace('"', "").replace("'", "")
    #encode with errors='ignore' discards anything outside ASCII
    return data.encode('ascii', errors='ignore').decode()
    

def tokenize_text(data: str):
    """Split text into lower-cased word tokens.

    data: text to tokenize (the pipeline passes the output of clean_text).

    Returns a list of lower-cased tokens. Single-character tokens are
    dropped except for 'a' and 'i'; stop words are kept.
    """
    token_list = []
    for word in word_tokenize(data):   #word tokenizer lib
        #lower-case BEFORE the whitelist test so that "A" and the pronoun "I"
        #are kept just like "a" and "i" (the test used to be case-sensitive
        #and silently dropped the capitalised forms)
        word = word.lower()
        if len(word) > 1 or word in ('a', 'i'):
            token_list.append(word)
    return token_list


def stem_text(token_data: []):
    """Return the Porter stem of every token, keeping the input order.

    token_data: iterable of tokens.

    Returns a new list with one stem per input token.
    """
    return [ps.stem(word) for word in token_data]
        

def insert_unique_tokens(tokens: []):
    """Add every token to the global vocabulary set.

    tokens: iterable of tokens from one document.

    Returns the (mutated) module-level vocabulary set itself, not a copy.
    """
    vocabulary.update(tokens)
    return vocabulary

def preprocess(data: str):
    """Run the full text pipeline: clean, then tokenize, then stem.

    data: raw text of a document.

    Returns the list of stemmed tokens produced by stem_text.
    """
    return stem_text(tokenize_text(clean_text(data)))
    
#Build every index structure in a single pass over the corpus directory.
#doc_cnt is the id handed to the next document and, once the loop is done,
#the total number of indexed documents.
doc_cnt = 0
for file in files:
    file_path = os.path.join(files_path, file)
    if not os.path.isfile(file_path):
        #os.listdir also returns sub-directories, which open() cannot read
        continue

    #the with-block closes the handle even when reading/decoding fails
    with open(file_path, 'r', encoding="utf8") as f:
        data = f.read().replace('\n', ' ')

    filename_index_dict[file] = doc_cnt            #indexing filename with numbers.Eg.C0001->0,C0002->1...
    filename_index_inv_dict[doc_cnt] = file

    tokens = preprocess(data)
    insert_unique_tokens(tokens)                  #create vocabulary
    boolean_retrieval_system(tokens, doc_cnt)     #create boolean posting list
    tf_retrieval_system(tokens, doc_cnt)          #create term frequency posting list
    doc_len.append(len(tokens))                   #stores no of tokens
    doc_cnt+=1
    if(doc_cnt%100==0):
        print(doc_cnt, file)                      #progress output every 100 documents
    
        
#Calculates idf
def calc_idf_tfidfalgo(token: str)->int:
    """Return the inverse document frequency of token.

    token: a (stemmed) token.

    Returns log(doc_cnt / (df + 1)) as a float, where df is the number of
    documents containing the token, or the int 0 when the token is not in
    token_document_frequency. Because of the +1, the value is 0 or negative
    for tokens that occur in (almost) every document.
    """
    if token not in token_document_frequency:
        return 0
    return math.log(doc_cnt/(token_document_frequency[token]+1))


    
#calculates the tf-idf weight from a raw term frequency and an idf value
def calc_tfidf(freq:int, idf: int)->int:
    """Return the log-scaled tf-idf weight (1 + ln(freq)) * idf.

    freq: number of times the token occurs in the document.
    idf:  inverse document frequency of the token.

    Returns 0 when freq is 0, otherwise the weight as computed above.
    """
    if freq != 0:
        return (1+math.log(freq))*idf
    return 0
    
    
#One accumulator per document, indexed by doc_id. vectorize_tfidf() adds
#score*score to it for every tf-idf weight, so after that call each entry is
#the SQUARED L2 norm of the document's tf-idf vector (no square root is taken here).
normalized_doc_len = [0]*doc_cnt
#converts the token-major tf-idf postings into document vectors
#Eg. result[doc1] = {token1: tfidf1, token2: tfidf2}
def vectorize_tfidf():
    """Turn tf_idf_list (token -> [(doc_id, score)]) into per-document vectors.

    Returns a list of length doc_cnt whose entry doc_id is a dict
    {token: tf-idf score}. Documents that have no postings get an empty dict.

    Side effect: adds score*score to normalized_doc_len[doc_id] for every
    posting, i.e. accumulates the squared L2 norm of each vector. The
    accumulator is not reset here, so calling this twice doubles the values.
    """
    #start every document with its own empty vector so that documents
    #without any token are represented by {} instead of None
    tfidf_doc_score = [{} for _ in range(doc_cnt)]

    for token, postings in tf_idf_list.items():
        for doc_id, score in postings:
            tfidf_doc_score[doc_id][token] = score
            normalized_doc_len[doc_id] += (score*score)

    return tfidf_doc_score
        
    


#calculates tf-idf
def tfidf_system():
    """Fill tf_idf_list with a tf-idf weight for every (token, document) pair.

    For each token in tf_posting_list the idf is computed once and combined
    with each per-document frequency, giving
    tf_idf_list[token] = [(doc_id, tfidf), ...] in posting-list order.

    Mutates the module-level tf_idf_list; returns None.
    """
    for term, postings in tf_posting_list.items():
        #idf depends only on the term, so compute it once per posting list
        term_idf = calc_idf_tfidfalgo(term)
        tf_idf_list[term] = [
            (doc_id, calc_tfidf(freq, term_idf)) for doc_id, freq in postings
        ]

#Order matters: tfidf_system() fills tf_idf_list, which vectorize_tfidf() reads
tfidf_system()

#Per-document tf-idf vectors, as returned by vectorize_tfidf()
tfidf_doc_vector_list =  vectorize_tfidf()  

#Store the structures built above as pickle dumps for faster query processing:
#it avoids rebuilding the global variables every time a query is run.
#NOTE: the files keep their .txt names but contain binary pickle data.
#NOTE(review): pickle files must only ever be loaded from a trusted location.
index_dumps = {
    "vocabulary.txt": vocabulary,
    "boolean_posting_list.txt": boolean_posting_list,
    "token_document_frequency.txt": token_document_frequency,
    "tfidf_doc_vector_list.txt": tfidf_doc_vector_list,
    "normalized_doc_len.txt": normalized_doc_len,
    "doc_len.txt": doc_len,
    "filename_index_inv_dict.txt": filename_index_inv_dict,
    "document_tf_list.txt": document_tf_list,
}

for dump_path, dump_object in index_dumps.items():
    #the with-block closes each file even if pickling raises part-way through
    with open(dump_path, "wb") as dump_file:
        pickle.dump(dump_object, dump_file)

[nltk_data] Downloading package punkt to /home/anirudh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anirudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


100 S00789.txt
200 T00740.txt
300 C00106.txt
400 L00071.txt
500 C00585.txt
600 S00988.txt
700 T00612.txt
800 D00813.txt
900 C00061.txt
1000 S00920.txt
1100 P04035.txt
1200 S00511.txt
1300 P05034.txt
1400 P_1375.txt
1500 C00981.txt
1600 P_569.txt
1700 M00189.txt
1800 C00273.txt
1900 D00369.txt
2000 L00438.txt
2100 P_1174.txt
2200 R00170.txt
2300 M00089.txt
2400 C00154.txt
2500 T00760.txt
2600 P_2079.txt
2700 L00303.txt
2800 P_824.txt
2900 P_23.txt
3000 R00124.txt
3100 D00657.txt
3200 D00697.txt
3300 M00150.txt
3400 D00515.txt
3500 P_562.txt
3600 C00977.txt
3700 C00616.txt
3800 C00624.txt
3900 D00736.txt
4000 T00682.txt
4100 C00847.txt
4200 C00803.txt
4300 S00728.txt
4400 C00770.txt
4500 L00308.txt
4600 P_2063.txt
4700 P_1602.txt
4800 P_1298.txt
4900 P_1780.txt
5000 C00539.txt
5100 P_1400.txt
5200 D00301.txt
5300 R00414.txt
5400 D00478.txt
5500 D00959.txt
5600 R00152.txt
5700 P_157.txt
5800 P03098.txt
5900 T00903.txt
6000 L00080.txt
6100 L00496.txt
6200 L00400.txt
6300 P_1589.txt
6400 C0