### 1. Imports

In [16]:
import re , json , os , pickle , math
from collections import Counter
from Operator import *
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk.tokenize import word_tokenize


### 2. Dataset and base directory paths

In [2]:
# Base directory: the parent of the directory the notebook is run from
# (os.path.abspath("") resolves to the current working directory).
DIR_PATH = os.path.dirname(os.path.abspath(""))

# Top-level project folders, each resolved against the base directory.
DATASET_PATH, CODE_PATH, PICKLES_PATH = (
    os.path.join(DIR_PATH, folder) for folder in ("DataSet", "Code", "Pickles")
)

# Stop-word files and corpus folders, all located inside the dataset folder.
(
    STOPWORDBBC_PATH,
    STOPWORDTRUMP_PATH,
    DATASET_BBCSPORT,
    DATASET_TRUMPSPEECHS,
) = (
    os.path.join(DATASET_PATH, entry)
    for entry in ("stopwordbbc.txt", "stopwordtrump.txt", "bbcsport", "trumpspeechs")
)

### 3. Stop words and special characters

In [3]:
def _load_stopwords(path):
    """Return the non-empty tokens of the stop-word file at `path`.

    The file is read in full and split with `word_tokenize`; empty tokens
    are dropped. The file handle is closed before returning, even if
    reading or tokenizing raises.
    """
    with open(path, "r") as infile:
        tokens = word_tokenize(infile.read())
    return [token for token in tokens if token]


# Stop words for the Trump speeches corpus and for the BBC sport corpus.
stopwordtrump_list = _load_stopwords(STOPWORDTRUMP_PATH)
stopwordbbc_list = _load_stopwords(STOPWORDBBC_PATH)

# Punctuation / whitespace tokens (the list also contains the empty string).
specialchar_list = ['.',' ',',','[',']','(',')','"',':','?','','-']

### 4. Loading Trump dataset

#### 4.1 Stemming, lemmatization & removing special characters


In [46]:
# Build two parallel token lists per speech: Porter-stemmed tokens
# (stem_speeches) and WordNet-lemmatized tokens (lema_speeches).
# Each speech is stored at the index given by the first number in its
# file name, so files are expected to be numbered 0..n-1.
ps = PorterStemmer()
le = WordNetLemmatizer()

# Only regular files sitting directly in the speeches folder whose name
# contains a number are loaded. Walking the tree recursively would let a
# sub-folder (e.g. a checkpoint directory) reset the result lists, and a
# name without digits cannot be mapped to a document number.
speech_files = [
    name
    for name in sorted(os.listdir(DATASET_TRUMPSPEECHS))
    if os.path.isfile(os.path.join(DATASET_TRUMPSPEECHS, name))
    and re.search(r'\d+', name)
]

stem_speeches = [None] * len(speech_files)
lema_speeches = [None] * len(speech_files)

for speech in speech_files:
    speech_no = int(re.search(r'\d+', speech)[0])

    # `with` closes the file even if reading fails.
    with open(os.path.join(DATASET_TRUMPSPEECHS, speech), "r") as infile:
        content = infile.read()

    # Only the second line of the file is indexed.
    # NOTE(review): presumably line one is a title and line two the speech body - confirm.
    content = content.split('\n')[1]
    # Collapse every run of characters other than letters, digits and
    # apostrophes into a single space, then normalise case.
    content = re.sub(r"[^a-zA-Z0-9\']+", ' ', content)
    content = content.casefold()
    content_list = word_tokenize(content)

    stem_speeches[speech_no] = [ps.stem(word) for word in content_list]
    lema_speeches[speech_no] = [le.lemmatize(word) for word in content_list]

#### 4.2 Generating posting lists

In [49]:
# Build the positional inverted index from the stemmed speeches and
# persist it to "postinglist.pickle".
#
# Resulting shape:
#   all_posting_list[term] -> [{doc_no: [position, ...]}, ...]
# with one single-key dict per document containing the term, appended in
# ascending doc_no order, and positions in ascending order.

# Set lookups keep the stop-word test O(1) per word.
stopword_set = set(stopwordtrump_list)

all_posting_list = {}
for doc_no, speech in enumerate(stem_speeches):
    # One pass over the speech collects every position of every token,
    # instead of rescanning the whole speech once per distinct word.
    positions_by_word = {}
    for index, word in enumerate(speech):
        positions_by_word.setdefault(word, []).append(index)

    for word, word_index in positions_by_word.items():
        # Skip stop words (exact match or lower-cased match).
        if word in stopword_set or word.lower() in stopword_set:
            continue
        all_posting_list.setdefault(word, []).append({doc_no: word_index})

# `with` guarantees the pickle file is flushed and closed even on error.
with open(os.path.join(PICKLES_PATH, "postinglist.pickle"), "wb") as out_file:
    pickle.dump(all_posting_list, out_file)

#### 4.3 Generating Vector Space Model (VSM)

In [58]:
# Build the tf-idf vector space model from the lemmatized speeches and
# persist it together with the document frequencies.
#
#   VSM[doc_no][term] = tf(term, doc) * log10(N / df(term))
#   doc_frq[term]     = number of documents containing the term
# where N is the number of documents and stop words are excluded.

stopword_set = set(stopwordtrump_list)

VSM = []
doc_frq = Counter()
for speech in lema_speeches:
    # Counter(speech) tallies every term in one pass over the speech.
    term_count = {
        word: count
        for word, count in Counter(speech).items()
        if word not in stopword_set
    }
    # Each distinct term of this document adds one to its document frequency.
    doc_frq.update(term_count.keys())
    VSM.append(term_count)

# Second pass: scale raw term frequencies by the inverse document frequency.
doc_total = len(VSM)
for term_weights in VSM:
    for word in term_weights:
        term_weights[word] = term_weights[word] * math.log10(doc_total / doc_frq[word])

# `with` guarantees each pickle file is flushed and closed even on error.
with open(os.path.join(PICKLES_PATH, "vsm.pickle"), "wb") as out_file:
    pickle.dump(VSM, out_file)
with open(os.path.join(PICKLES_PATH, "docfrq.pickle"), "wb") as out_file:
    pickle.dump(doc_frq, out_file)

### Test for VSM

In [89]:
# Rank documents against a free-text query using cosine similarity over
# the pickled tf-idf vectors, and print the matching document numbers
# (best match first) followed by the number of matches.
query = input('Enter Query : ')

# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that were written by this notebook.
with open(os.path.join(PICKLES_PATH, "vsm.pickle"), "rb") as in_file:
    VSM = pickle.load(in_file)

with open(os.path.join(PICKLES_PATH, "docfrq.pickle"), "rb") as in_file:
    doc_frq = pickle.load(in_file)

le = WordNetLemmatizer()

# Same normalisation steps as at indexing time: strip punctuation,
# casefold, tokenize, lemmatize, drop stop words.
query       = re.sub(r'[^a-zA-Z0-9_\s]+', '', query)
query       = query.casefold()
query_list  = word_tokenize(query)
lema_query  = [le.lemmatize(word) for word in query_list]

clean_query = list(set(lema_query) - set(stopwordtrump_list))

# Use the real collection size so query weights always match the weights
# computed at indexing time (which also use len(VSM)).
doc_total = len(VSM)

# tf-idf weight of every query term; terms unknown to the index weigh 0.
query_count = {}
for word in clean_query:
    try:
        query_count[word] = lema_query.count(word) * math.log10(doc_total / doc_frq[word])
    except (KeyError, ZeroDivisionError):
        query_count[word] = 0.0

# The query magnitude does not depend on the document, so compute it once.
mag_y = math.sqrt(sum(weight * weight for weight in query_count.values()))

# Minimum cosine similarity for a document to be reported.
SIM_THRESHOLD = 0.0005

ans = []
if mag_y == 0.0:
    # No query term carries any weight (empty query, only stop words, or
    # only terms absent from the index): similarity is undefined.
    print("I'm sorry")
else:
    for doc_no, doc_vector in enumerate(VSM):
        mag_x = math.sqrt(sum(weight * weight for weight in doc_vector.values()))
        denominator = mag_x * mag_y
        if denominator == 0.0:
            # An all-zero document vector cannot be similar to anything.
            continue

        # Dot product over the query terms; terms missing from the
        # document contribute zero.
        _sum = sum(doc_vector.get(word, 0.0) * query_count[word] for word in clean_query)
        sim = _sum / denominator
        if sim >= SIM_THRESHOLD:
            ans.append((sim, doc_no))

ans  = sorted(ans, reverse=True)
output = [item[1] for item in ans]
print(output)
print('LENGTH : ',len(ans))


Enter Query : muslim
[3, 4, 9, 2, 20, 7, 6]
LENGTH :  7


### Test for Positional Index & Phrasal Query

In [52]:
# Proximity / phrasal query over the positional index.
#
# Expected input: "<term1> <term2> /<n>" where n is a non-negative integer.
# Every pair of occurrences of the two terms that lie exactly n + 1
# positions apart in the same document (in either order) is reported as
# (doc_no, position_of_term1, position_of_term2). "/0" therefore finds
# adjacent occurrences. Only the first two non-stop-word terms are used.
query = input('Enter Query : ')

# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that were written by this notebook.
with open(os.path.join(PICKLES_PATH, "postinglist.pickle"), "rb") as in_file:
    posting_list = pickle.load(in_file)
ps = PorterStemmer()

try:
    query = query.split('/')
    gap   = int(query[1])
    if gap < 0:
        # A negative proximity has no meaning; report it as malformed input.
        raise ValueError("proximity must be non-negative")
    k     = gap + 1
    query = query[0]
    query       = re.sub(r'[^a-zA-Z0-9_\s]+', '', query)
    query       = query.casefold()
    query_list  = word_tokenize(query)
    clean_query = [w for w in query_list if w.lower() not in stopwordtrump_list]
    stem_query  = [ps.stem(word) for word in clean_query]

    postinglist_1 = posting_list[stem_query[0]]
    postinglist_2 = posting_list[stem_query[1]]

    # Positions of the second term per document, as sets for O(1) lookups.
    # The loaded posting lists are only read, never mutated.
    positions_2_by_doc = {}
    for posting in postinglist_2:
        for doc_no, positions in posting.items():
            positions_2_by_doc[doc_no] = set(positions)

    answer = []
    # Postings are stored in ascending document order, so the answer comes
    # out ordered by document, then by position of the first term.
    for posting in postinglist_1:
        for doc_1, postionlist_1 in posting.items():
            candidates = positions_2_by_doc.get(doc_1)
            if not candidates:
                continue
            for position_1 in postionlist_1:
                # The second term may sit k positions before or after.
                for position_2 in (position_1 - k, position_1 + k):
                    if position_2 in candidates:
                        answer.append((doc_1, position_1, position_2))
    print(answer)

except (KeyError, ValueError,IndexError):
    # KeyError: a term is not in the index; ValueError: the proximity is
    # not a valid number; IndexError: missing '/' or fewer than two terms.
    print("Wrong format")

Enter Query : Hillary Clinton /0
[(1, 2046, 2047), (2, 1404, 1405), (2, 1455, 1456), (3, 14, 15), (3, 1084, 1085), (3, 1116, 1117), (3, 1184, 1185), (3, 1214, 1215), (3, 1350, 1351), (3, 1397, 1398), (3, 1896, 1897), (3, 1927, 1928), (3, 1956, 1957), (3, 2070, 2071), (3, 2125, 2126), (3, 2420, 2421), (3, 2755, 2756), (3, 2850, 2851), (3, 3039, 3040), (4, 148, 149), (4, 354, 355), (4, 571, 572), (4, 704, 705), (4, 1028, 1029), (4, 1030, 1029), (4, 1030, 1031), (4, 1076, 1077), (4, 1119, 1120), (4, 1137, 1138), (4, 1187, 1188), (4, 1245, 1246), (4, 1441, 1442), (4, 1476, 1477), (4, 1504, 1505), (4, 1540, 1541), (4, 1623, 1624), (4, 1707, 1708), (4, 1727, 1728), (4, 1828, 1829), (4, 1869, 1870), (4, 1929, 1930), (4, 2025, 2026), (4, 2044, 2045), (4, 2159, 2160), (4, 2198, 2199), (4, 2275, 2276), (4, 2305, 2306), (4, 2328, 2329), (4, 2399, 2400), (4, 2441, 2442), (4, 2558, 2559), (4, 2669, 2670), (4, 2695, 2696), (4, 2810, 2811), (4, 2895, 2896), (4, 3086, 3087), (4, 3269, 3270), (5, 490, 

### Test for Inverted Index & Boolean Query

In [52]:
# Boolean query (NOT / AND / OR, case-insensitive) over the inverted index.
# The query is converted to postfix with GetPostfix and evaluated with a
# stack; the sorted result is printed.
query = input('Enter Query : ')

# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that were written by this notebook.
with open(os.path.join(PICKLES_PATH, "postinglist.pickle"), "rb") as in_file:
    posting_list = pickle.load(in_file)


ps = PorterStemmer()
# NOTE(review): this pattern also strips '(' and ')', so parentheses never
# reach GetPostfix even though `operator` lists them - confirm intent.
query       = re.sub(r'[^a-zA-Z0-9_\s]+', '', query)
query       = word_tokenize(query)

operator = ['NOT','OR','AND','(',')']


def _as_doc_list(operand):
    """Resolve a stack entry to a posting list.

    A raw query term (str) is stemmed, looked up in `posting_list` and
    passed through GetPostingList; anything else is taken to be the result
    of an earlier operator and is returned unchanged.

    Raises KeyError when the stemmed term is not in the index.
    """
    if isinstance(operand, str):
        return GetPostingList(posting_list[ps.stem(operand)])
    return operand


postfix_query = GetPostfix(query)
stack = []
try:
    for i in postfix_query:
        token = i.upper()
        if token not in operator:
            # Operand: keep the raw term; it is resolved lazily when used.
            stack.append(i)
        elif token == 'NOT':
            stack.append(NOT(_as_doc_list(stack.pop())))
        elif token == 'AND':
            query_1 = _as_doc_list(stack.pop())
            query_2 = _as_doc_list(stack.pop())
            stack.append(AND(query_1,query_2))
        elif token == 'OR':
            query_1 = _as_doc_list(stack.pop())
            query_2 = _as_doc_list(stack.pop())
            stack.append(OR(query_1,query_2))

    # A query made of a single term leaves that raw term on the stack.
    answer = _as_doc_list(stack.pop())
    answer.sort()
    print(answer)
except (KeyError ,ValueError, IndexError):
    # KeyError: a term is not in the index. IndexError: the stack ran empty,
    # i.e. an operator is missing an operand or the query is empty.
    print('wrong key words')

Enter Query : not hammer
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 22, 23, 26, 28, 29, 30, 31, 32, 37, 38, 41, 44, 47, 48, 52, 55]


### EXTRA CODE 

In [36]:
# Disabled scratch cell: every statement below is commented out and never runs.
# As written it would read "querylist.txt" from DATASET_PATH, drop blank
# lines, and treat the remaining lines as alternating (query, answer) pairs,
# where an answer is either the literal 'None' (stored as an empty list) or a
# comma-separated list of ints (stored sorted).
# NOTE(review): `_query[i]` right after `i = i + 1` would raise IndexError if
# the file holds an odd number of non-blank lines - check before re-enabling.
# in_file = open(os.path.join(DATASET_PATH, "querylist.txt"), "r")
# query_list = []
# _query = in_file.readlines()
# _query = [q.replace('\n','') for q in _query if q.replace('\n','') != '']
# i = 0
# while 1:
#     if i == len(_query):
#         break
#     query = _query[i]
#     i = i + 1
#     ans   =  _query[i]
#     if ans == 'None':
#         ans = []
#     else:
#         ans = [int(a) for a in ans.split(",")]
#         ans.sort()
#     query_list.append((query,ans))
#     i = i + 1
    
# in_file.close()