# In [1]:
import os
from pprint import pprint
from bs4 import BeautifulSoup
from nltk import WordNetLemmatizer
from nltk import regexp_tokenize
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk import WordPunctTokenizer
import pandas as pd 
import numpy as np
import math


def rename_documents(dataPath):
    """Rename every file in the leaf directories under dataPath to a running number.

    dataPath : "20_newsgroups/"  (relative to the current working directory;
               an absolute path is accepted as well)

    Only directories without sub-directories are processed. The counter
    starts at 1 and keeps increasing across directories, so after the call
    every processed file has a distinct numeric name.

    Returns None; the renaming on disk is the only effect.
    """
    import uuid  # local import: only needed to build the temporary names

    # os.path.join keeps the old "cwd + '/' + dataPath" result for relative
    # paths and additionally copes with an absolute dataPath.
    root = os.path.join(os.getcwd(), dataPath)

    total = 1
    # Snapshot the tree first so the renames below cannot influence the walk.
    for dirPath, dirName, fileNames in list(os.walk(root)):
        if dirName:
            continue  # not a leaf directory

        # Phase 1: move every file to a unique temporary name. Renaming
        # straight to str(total) could land on a not-yet-processed file that
        # already carries that name (e.g. on a second run), and os.rename
        # silently replaces an existing file on POSIX systems.
        tag = '.renaming-' + uuid.uuid4().hex + '-'
        staged = []
        for position, fileName in enumerate(fileNames):
            temp_loc = os.path.join(dirPath, tag + str(position))
            os.rename(os.path.join(dirPath, fileName), temp_loc)
            staged.append(temp_loc)

        # Phase 2: hand out the final running numbers.
        for temp_loc in staged:
            os.rename(temp_loc, os.path.join(dirPath, str(total)))
            total += 1

    return


def collect_documents(dataPath):
    """Load the raw content of every file in the leaf directories of dataPath.

    dataPath : "20_newsgroups/"  (relative to the current working directory)

    Returns a dict mapping each file name to a one-element list holding
    str() of the file's bytes, i.e. the textual "b'...'" representation
    with escapes such as a literal backslash-n. Files that share a name in
    different directories overwrite each other in the result.
    """
    root = os.getcwd() + '/' + dataPath
    collected = {}

    for folder, subfolders, names in os.walk(root):
        # Only directories that contain no further directories hold documents.
        if subfolders:
            continue
        for name in names:
            with open(folder + '/' + name, 'rb') as handle:
                raw = handle.read()
            collected[name] = [str(raw)]

    return collected

def tokenizeDocument(docs, qt = False):
    """
        Turn the raw text of one document (or one query) into index terms.

        Args:
            docs (str) : raw text. For documents read by collect_documents
                         this is str() of a bytes object ("b'...'"), in
                         which line breaks and tabs appear as a literal
                         backslash followed by n / t.
            qt (bool)  : True when the text is a query. Kept so existing
                         callers keep working; it no longer changes the
                         result (see the note before the return).
        return:
            final tokens (list) : terms for index

        0. Escaped and real whitespace replaced by a space
        1. Tokenize (WordPunctTokenizer)
        2. Convert to lowercase
        3. Stop words removed
        4. Lemmatization (WordNet); no stemming is applied
        5. Only words of two or more characters that start with a letter,
           and tokens that start with a digit. Front 0's removed.
    """

    stop_ws = set(stopwords.words('english'))
    # One lemmatizer for the whole call instead of one per token.
    lemmatizer = WordNetLemmatizer()

    # '\\n', '\\t' and '\\r' are the two-character escapes produced by
    # str(bytes); left in place they would glue an 'n'/'t'/'r' onto the
    # following word. A real tab is replaced as before.
    for whitespace in ('\\n', '\\t', '\\r', '\t'):
        docs = docs.replace(whitespace, ' ')

    # Tokenization
    tokens = WordPunctTokenizer().tokenize(docs)

    # lowercase
    tokens_lowercase = [w.lower() for w in tokens]

    # Remove Stop words
    tokens_stop = [w for w in tokens_lowercase if w not in stop_ws]

    # Lemmatization
    updated_tokens = [lemmatizer.lemmatize(w) for w in tokens_stop]

    final_tokens = []
    for updated_token in updated_tokens:
        if updated_token[0].isalpha() and len(updated_token) > 1:
            final_tokens.append(updated_token)
        elif updated_token.isdecimal():
            # isdecimal (not isnumeric): int() only accepts decimal digits,
            # so tokens such as '½' or '²' must not reach it.
            final_tokens.append(str(int(updated_token)))
        elif updated_token[0].isdigit():
            final_tokens.append(updated_token.lstrip('0'))

    # NOTE: the leading 'b' of a "b'...'" document is a one-letter token and
    # has already been discarded by the filter above. The former
    # final_tokens[1:] for documents therefore removed a genuine first term,
    # so nothing is sliced off any more.
    return final_tokens

#### Inverted Index ####

def makeIndex(token_docid):
    """Build an inverted index from (term, docid) pairs.

    Args:
        token_docid : iterable of indexable pairs, element[0] being the
                      term and element[1] the document id.
    return:
        dict mapping term -> [df, {docid: tf}], where df is the number of
        distinct documents containing the term and tf the number of times
        the term was seen for that document.
    """
    index = {}

    for pair in token_docid:
        word, doc = pair[0], pair[1]
        entry = index.get(word)

        if entry is None:
            # First sighting of the term: one document, one occurrence.
            index[word] = [1, {doc: 1}]
            continue

        tf_by_doc = entry[1]
        if doc in tf_by_doc:
            tf_by_doc[doc] += 1
        else:
            # Known term, new document: start its tf and bump the df.
            tf_by_doc[doc] = 1
            entry[0] += 1

    return index



def queryTermsOR(qts, inverted_index):
    """Return the documents that contain at least one of the query terms.

    Args:
        qts            : iterable of query terms.
        inverted_index : dict mapping term -> [df, {docid: tf}].
    return:
        list of [term, doc_id] entries in order of discovery. Each document
        is listed once, paired with the first query term that matched it.
        Terms missing from the index are ignored.
    """
    docs_found = []
    # The result holds [term, doc_id] lists, so a membership test of a bare
    # doc_id against it can never succeed; remember the ids separately.
    seen_doc_ids = set()

    for qt in qts:
        if qt not in inverted_index:
            continue
        for doc_id in inverted_index[qt][1]:
            if doc_id not in seen_doc_ids:
                seen_doc_ids.add(doc_id)
                docs_found.append([qt, doc_id])
    return docs_found




