### 1.Imports

In [1]:
import re , json , os , pickle , math , random ,statistics
from random import randint,seed , shuffle
from collections import Counter
from Operator import *
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk.tokenize import word_tokenize


### 2.Dataset and Base directory path

In [2]:
DIR_PATH = os.path.dirname(os.path.abspath(""))

DATASET_PATH = os.path.join(DIR_PATH, "DataSet")

STOPWORDBBC_PATH = os.path.join(DATASET_PATH, "stopwordbbc.txt")
STOPWORDTRUMP_PATH = os.path.join(DATASET_PATH, "stopwordtrump.txt")

DATASET_BBCSPORT = os.path.join(DATASET_PATH, "bbcsport")
DATASET_TRUMPSPEECHS = os.path.join(DATASET_PATH, "trumpspeechs")


CODE_PATH = os.path.join(DIR_PATH, "Code")
PICKLES_PATH = os.path.join(DIR_PATH, "Pickles")

### 3.Stop words and Special characters

In [3]:
infile  = open(STOPWORDTRUMP_PATH, "r")


stopwordtrump_list    = word_tokenize(infile.read())
stopwordtrump_list    = [i for i in stopwordtrump_list if i]

infile  = open(STOPWORDBBC_PATH, "r")

stopwordbbc_list    = word_tokenize(infile.read())
stopwordbbc_list    = [i for i in stopwordbbc_list if i]

specialchar_list = ['.',' ',',','[',']','(',')','"',':','?','','-']

### 4. Loading Trump dataset

#### 4.1 Stemming , Lemmatization & removing special characters


In [4]:
ps = PorterStemmer()
le = WordNetLemmatizer() 

for r, d, files in os.walk(DATASET_TRUMPSPEECHS):
    stem_speeches = [None for _ in range(len(files))]
    lema_speeches = [None for _ in range(len(files))]
    
    for speech in files:
        lema_speech  = []
        stem_speech  = []
        speech_no    = int(re.search(r'\d+',speech)[0])
        infile       = open(os.path.join(DATASET_TRUMPSPEECHS,speech), "r")
        content      = infile.read()
        content      = content.split('\n')[1]
        content      = re.sub(r"[^a-zA-Z0-9\']+", ' ', content)
        content      = content.casefold()
        content_list = word_tokenize(content)
        for word in content_list:
            stem_speech.append(ps.stem(word))
            lema_speech.append(le.lemmatize(word))
        stem_speeches[speech_no] = stem_speech
        lema_speeches[speech_no] = lema_speech
        infile.close()

#### 4.2 Generting Posting lists

In [5]:
all_posting_list = {}
for doc_no in range(0,len(stem_speeches)):
    speech       = stem_speeches[doc_no]
    clean_speech =  list(set(speech) - set(stopwordtrump_list))
    posting_list = {}
    for word in clean_speech:
        if word.lower() not in stopwordtrump_list:
            word_index = [index for index, value in enumerate(speech) if value == word]
            posting_list[word] = [{doc_no:word_index}]
    for word in posting_list.keys():
        all_posting_list.setdefault(word, []).append(posting_list[word][0])
out_file = open(os.path.join(PICKLES_PATH, "postinglist.pickle"), "wb")
pickle.dump(all_posting_list, out_file)
out_file.close()

#### 4.2 Generting Vector Space Model (VSM)

In [6]:
VSM     = []
doc_frq = {}
for doc_no in range(0,len(lema_speeches)):
    speech       = lema_speeches[doc_no]
    clean_speech =  list(set(speech) - set(stopwordtrump_list))
    
    doc_frq = Counter(doc_frq) + Counter(dict.fromkeys(clean_speech,1))
    
    term_count = {}
    for word in clean_speech:
        term_count.setdefault(word,0) 
        term_count[word] = term_count[word] +  speech.count(word)
    VSM.append(term_count)
for doc_no in range(0,len(VSM)):
    for word in VSM[doc_no]:
        VSM[doc_no][word] = VSM[doc_no][word]*math.log10( len(VSM) / doc_frq[word])
out_file = open(os.path.join(PICKLES_PATH, "vsm.pickle"), "wb")
pickle.dump(VSM, out_file)
out_file.close()
out_file = open(os.path.join(PICKLES_PATH, "docfrq.pickle"), "wb")
pickle.dump(doc_frq, out_file)
out_file.close()

### 5. Loading BBC sports dataset

### K-NN

#### 5.1 Split Dataset

#### 5.2 Stemming  , removing special characters & stop words

#### 5.3 Document frq

In [7]:
ps = PorterStemmer()

percent = 70 / 100

doc_frq        = {}
all_train_doc  = {}
all_test_doc   = {}
actual_output  = {}

for nested_folder_path, subdirs, files in os.walk(DATASET_BBCSPORT):
    if len(subdirs) is 0:

        _class                = os.path.basename(nested_folder_path)

        random.shuffle(files)

        train_files = math.ceil(len(files)*percent)

        test_files  = len(files) - train_files 

        actual_output[_class] = test_files


        for i in range(0,train_files):
            file_path    = files[i]
            file          = open(nested_folder_path +'\\'+ file_path , "r")

            content       = file.read()
            content       = re.sub(r"[^a-zA-Z0-9\']+", ' ', content)
            content       = content.casefold()
            content_list  = word_tokenize(content)
            
            clean_content = list(set(content_list) - set(stopwordbbc_list))
            stem_content  = [ps.stem(word) for word in clean_content] 
            
            doc_frq       = Counter(doc_frq) + Counter(dict.fromkeys(stem_content,1))

            all_train_doc.setdefault(_class, []).append(stem_content)
        
        for i in range(train_files,train_files+test_files):
            file_path      = files[i]
            file           = open(nested_folder_path +'\\'+ file_path , "r")

            content       = file.read()
            content       = re.sub(r"[^a-zA-Z0-9\']+", ' ', content)
            content       = content.casefold()
            content_list  = word_tokenize(content)
            
            clean_content = list(set(content_list) - set(stopwordbbc_list))
            stem_content  = [ps.stem(word) for word in clean_content] 
            
            all_test_doc.setdefault(_class, []).append(stem_content)
            
doc_frq = Counter({k: c for k, c in doc_frq.items() if c >= 3})

#### 5.4 Convert into Vector 

In [8]:
train_vsm = GetVector(all_train_doc ,doc_frq)
test_vsm  = GetVector(all_test_doc ,doc_frq)

out_file = open(os.path.join(PICKLES_PATH, "knntest.pickle"), "wb")
pickle.dump(test_vsm, out_file)
out_file.close()
out_file = open(os.path.join(PICKLES_PATH, "knntrain.pickle"), "wb")
pickle.dump(train_vsm, out_file)
out_file.close()

## Applying KNN

In [19]:
in_file = open(os.path.join(PICKLES_PATH, "knntrain.pickle"), "rb")
train_vsm = pickle.load(in_file)
in_file.close()

in_file = open(os.path.join(PICKLES_PATH, "knntest.pickle"), "rb")
test_vsm = pickle.load(in_file)
in_file.close()

k              = 3
classes        = test_vsm.keys()
predict_output = { i:0  for i in classes} 
for test_vector in test_vsm:
    for x in test_vsm[test_vector]:
        sim_all_doc = [(0,0,None) for k in range(0,k)]
        for train_vector in train_vsm:
            for i in range(0,len(train_vsm[train_vector])):
                y   = train_vsm[train_vector][i]
                sim = CosineSimilarity(x,y)
                if min(sim_all_doc)[0] <= sim:
                    sim_all_doc[sim_all_doc.index(min(sim_all_doc))] = (sim,i,train_vector)

        k_classes = [c for s,i,c in sim_all_doc]
        try:
            prd_class = statistics.mode(k_classes)
        except statistics.StatisticsError:
            prd_class = k_classes[0]

        if prd_class == test_vector:
            predict_output[test_vector] = predict_output[test_vector] + 1
            
## Accuracy 
each_label_accuracy = {}
for c in actual_output:
    try:
        each_label_accuracy[c]  = (predict_output[c] / actual_output[c]) * 100
    except KeyError:
        each_label_accuracy[c] = 0
print('\t\t\t\t\t\tEach Label Accuracy\n',each_label_accuracy)
total_test_doc = sum([len(doc) for doc in all_test_doc.values()])
accuracy       = (sum(predict_output.values())/total_test_doc)*100
print('ACCURACY : ',accuracy,' %')

						Each Label Accuracy
 {'athletics': 96.66666666666667, 'cricket': 91.8918918918919, 'football': 94.9367088607595, 'rugby': 90.9090909090909, 'tennis': 96.66666666666667}
ACCURACY :  94.0909090909091  %


### 6. Loading BBC sports dataset

### K-MEAN


#### 6.2 Stemming  , removing special characters & stop words

#### 6.3 Document frq

In [10]:
ps = PorterStemmer()

all_doc_frq  = []
all_doc      = []

for nested_folder_path, subdirs, files in os.walk(DATASET_BBCSPORT):
    if len(subdirs) is 0:
        _class                = os.path.basename(nested_folder_path)
        for file_path in files:
            file          = open(nested_folder_path +'\\'+ file_path , "r")

            content       = file.read()
            content       = re.sub(r"[^a-zA-Z0-9\']+", ' ', content)
            content       = content.casefold()
            content_list  = word_tokenize(content)
            
            clean_content = list(set(content_list) - set(stopwordbbc_list))
            stem_content  = [ps.stem(word) for word in clean_content] 
            
            all_doc_frq       = Counter(all_doc_frq) + Counter(dict.fromkeys(stem_content,1))

            all_doc.append([_class]+stem_content)
        
all_doc_frq = Counter({k: c for k, c in all_doc_frq.items() if c >= 3})

#### 6.4 Convert into Vector

In [11]:
total_doc = len(all_doc)
all_vsm       = [] 
vocab     = all_doc_frq.keys()
for doc_vocab in all_doc:
    doc_vector = []
    doc_vector = [doc_vocab.count(word)*math.log10(total_doc/ all_doc_frq[word]) for word in vocab]
    all_vsm.append([doc_vocab[0]] +  doc_vector)


out_file = open(os.path.join(PICKLES_PATH, "kmean.pickle"), "wb")
pickle.dump(all_vsm, out_file)
out_file.close()

## Applying KMEAN

In [12]:
seed(1)

in_file = open(os.path.join(PICKLES_PATH, "kmean.pickle"), "rb")
all_vsm = pickle.load(in_file)
in_file.close()

shuffle(all_vsm)
features = len(all_vsm)
all_vsm      = numpy.array(all_vsm)
K        = 5
    
iteration = 0
if K < len(all_vsm):
    prev_centriod = []

    centriod      = all_vsm[:K]
    centriod      = numpy.delete(centriod, 0, axis=1).astype('float64').tolist()


    while prev_centriod != centriod:
        iteration  +=  1

        cluster     = [[] for k in range(0,K)]
        ans_cluster = [[] for k in range(0,K)]

        for vector in all_vsm:
            x = numpy.delete(vector, 0, axis=0).astype('float64').tolist()
            distance = []
            for y in centriod:
                distance.append(CosineSimilarity(x,y))


            index = distance.index(max(distance))

            cluster[index].append(x)

            ans_cluster[index].append(vector[0])

        prev_centriod = centriod
        centriod      = []

        for i in range(0,len(cluster)):

            if len(cluster[i]) == 0:
                ind = randint(0,len(vsm)-1)
                x = numpy.delete(all_vsm[ind], 0, axis=0).astype('float64').tolist()
                cluster[i].append(x)

            centriod.append(Centriod(cluster[i]))
max_count = []
for c in ans_cluster:
    max_count.append(c.count(statistics.mode(c)))
    
purity = 1/features * sum(max_count)

print('Purity : ',purity)

Purity :  0.6906377204884667


### Test for VSM

In [13]:
query = input('Enter Query : ')

in_file = open(os.path.join(PICKLES_PATH, "vsm.pickle"), "rb")
VSM = pickle.load(in_file)
in_file.close()

in_file = open(os.path.join(PICKLES_PATH, "docfrq.pickle"), "rb")
doc_frq = pickle.load(in_file)
in_file.close()

le = WordNetLemmatizer() 

query       =  re.sub(r'[^a-zA-Z0-9_\s]+', '', query)
query       = query.casefold()
query_list  = word_tokenize(query)     
lema_query  = [le.lemmatize(word) for word in query_list]

clean_query = list(set(lema_query) - set(stopwordtrump_list))

query_count = {}
for word in clean_query:
    query_count.setdefault(word,0)
    try:
        query_count[word] = (query_count[word] +  lema_query.count(word))*math.log10(56 / doc_frq[word])
    except ( KeyError , ZeroDivisionError) :
        query_count[word] = 0.0
ans = []
for doc_no in range(0,len(VSM)):
    _sum = 0.0
    for word in clean_query:
        try:
            _sum = _sum + (VSM[doc_no][word] * query_count[word]) 
        except KeyError:
            _sum = _sum + (0.0 * query_count[word])

    x = [ v for k, v in VSM[doc_no].items()] 
    y = [ v for k, v in query_count.items()] 

    mag_x = math.sqrt(sum(x_i*x_i for x_i in x))
    mag_y = math.sqrt(sum(y_i*y_i for y_i in y))
    try:
        sim   =  _sum/(mag_x*mag_y)
    except ZeroDivisionError:
        print("I'm sorry")
    if sim >=0.0005:
        ans.append((sim,doc_no))
        # print('doc_no : ',doc_no,end=' -> ')
        # print('SIM : ',sim)
ans  = sorted(ans,reverse=True)
output = [item[1] for item in ans] 
print(output)
print('LENGTH : ',len(ans))


Enter Query : no patience for injustice
[11, 7, 16, 22, 15]
LENGTH :  5


### Test for Positional Index & Phrasal Query

In [14]:
query = input('Enter Query : ')

in_file = open(os.path.join(PICKLES_PATH, "postinglist.pickle"), "rb")
posting_list= pickle.load(in_file)
in_file.close()
ps = PorterStemmer()

try:    
    
    query = query.split('/')
    k     = int(query[1]) + 1
    query = query[0]
    query       =  re.sub(r'[^a-zA-Z0-9_\s]+', '', query)
    query       = query.casefold()
    query_list  = word_tokenize(query)     
    clean_query = [w for w in query_list if w.lower() not in stopwordtrump_list]
    stem_query  = [ps.stem(word) for word in clean_query]
    
    postinglist_1 = posting_list[stem_query[0]]
    postinglist_2 = posting_list[stem_query[1]]
    
    answer = []
    
    while len(postinglist_1) != 0 and len(postinglist_2) != 0:
        doc_1 = list(postinglist_1[0].keys())[0]
        doc_2 = list(postinglist_2[0].keys())[0]
        if doc_1 == doc_2:
            l = []

            postionlist_1 = postinglist_1[0][doc_1]
            postionlist_2 = postinglist_2[0][doc_2]

            while len(postionlist_1) != 0:
                while len(postionlist_2) != 0:
                    if abs(postionlist_1[0] - postionlist_2[0]) == k :
                        l.append(postionlist_2[0])
                    elif postionlist_2[0] > postionlist_1[0]:
                        break
                    postionlist_2.pop(0)
                while len(l) != 0 and abs(l[0] - postionlist_1[0]) > k:
                    l.pop(0)
                for postion in l:
                    answer.append((doc_1,postionlist_1[0],postion))

                postionlist_1.pop(0)

            postinglist_1.pop(0)
            postinglist_2.pop(0)

        elif int(doc_1) < int(doc_2):
            postinglist_1.pop(0)

        else:
            postinglist_2.pop(0)
    print(answer)
    
except (KeyError, ValueError,IndexError):
    print("Wrong format")

Enter Query : keep out /2
[(20, 696, 699), (20, 712, 715), (24, 1227, 1230), (39, 2042, 2045), (40, 1928, 1931), (51, 156, 159)]


### Test for Inverted Index & Boolean Query

In [17]:
query = input('Enter Query : ')

in_file = open(os.path.join(PICKLES_PATH, "postinglist.pickle"), "rb")
posting_list= pickle.load(in_file)
in_file.close()


ps = PorterStemmer()
query       =  re.sub(r'[^a-zA-Z0-9_\s]+', '', query)
query       = word_tokenize(query)     

operator = ['NOT','OR','AND','(',')']

postfix_query = GetPostfix(query)
stack = []
try:
    for i in postfix_query:
        if i.upper() not in operator:
            stack.append(i)
        elif i.upper() == 'NOT':
            query_1 = stack.pop()
            if type(query_1) is str:
                query_1  = ps.stem(query_1)
                query_1  = GetPostingList(posting_list[query_1])
            stack.append(NOT(query_1))

        elif i.upper() == 'AND':
            query_1 = stack.pop()
            query_2 = stack.pop()
            if type(query_1) is str:
                query_1  = ps.stem(query_1)
                query_1  = GetPostingList(posting_list[query_1])
            if type(query_2) is str:
                query_2  = ps.stem(query_2)
                query_2  = GetPostingList(posting_list[query_2])
            stack.append(AND(query_1,query_2))

        elif i.upper() == 'OR':
            query_1 = stack.pop()
            query_2 = stack.pop()
            if type(query_1) is str:
                query_1  = ps.stem(query_1)
                query_1  = GetPostingList(posting_list[query_1])
            if type(query_2) is str:
                query_2  = ps.stem(query_2)
                query_2  = GetPostingList(posting_list[query_2])
            stack.append(OR(query_1,query_2))
    answer = stack.pop()
    if type(answer) is str:
        answer  = ps.stem(answer)
        answer  = GetPostingList(posting_list[answer])
    answer.sort()
    print(answer)
except (KeyError ,ValueError):
    print('wrong key words')

Enter Query : outdated AND ( personnel OR policies)
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 31, 32, 33, 34, 36, 37, 39, 40, 42, 43, 44, 45, 48, 49, 51, 52, 54]


### EXTRA CODE 

In [36]:
# in_file = open(os.path.join(DATASET_PATH, "querylist.txt"), "r")
# query_list = []
# _query = in_file.readlines()
# _query = [q.replace('\n','') for q in _query if q.replace('\n','') != '']
# i = 0
# while 1:
#     if i == len(_query):
#         break
#     query = _query[i]
#     i = i + 1
#     ans   =  _query[i]
#     if ans == 'None':
#         ans = []
#     else:
#         ans = [int(a) for a in ans.split(",")]
#         ans.sort()
#     query_list.append((query,ans))
#     i = i + 1
    
# in_file.close()