# Information Retrieval System

In [53]:
# required imports
import numpy as np
import fnmatch
import os


Suppose we have 3 files containing data :

### File Contents

<img src="1.png"/>
<img src="2.png"/>
<img src="3.png"/>

# Step 1 Creating Files with Dummy data

You have to create a few files with dummy data of your own choice, as shown above.

In [54]:
# Create the first dummy document in the current working directory.
str1 = "This is my book"
# The with-block closes the file even if the write raises.
with open("f1.txt","w") as f1:
    f1.write(str1)

In [55]:
# Create the second dummy document in the current working directory.
str2 = "This is my pen"
# The with-block closes the file even if the write raises.
with open("f2.txt","w") as f2:
    f2.write(str2)

In [56]:
# Create the third dummy document in the current working directory.
str3 = "my book is interesting"
# The with-block closes the file even if the write raises.
with open("f3.txt","w") as f3:
    f3.write(str3)

# Step 2 Traversing Directories

Now you have to traverse the directories and store all the files in a dict-type variable (`file_dict`).

In [57]:
# Shared state used by the cells below; add more variables here if required.

unique_word_set = set()    # every distinct word found across all files
file_dict = dict()         # file name -> value recorded for that file
file_count = 0             # number of distinct file names found


In [58]:
#Your code starts here
# NOTE(review): this absolute path is specific to one machine, and the later cells
# open the recorded names relative to the current working directory -- confirm that
# both refer to the same folder before running.
IRS_DIRECTORY = r"C:\Users\Hamza\IRS Directory"

for root, dirs, files in os.walk(IRS_DIRECTORY):
    # Only *.txt files are indexed: every recorded name is later opened and split
    # as text, so other file types must not enter file_dict.
    for file_name in fnmatch.filter(files, "*.txt"):
        if file_name not in file_dict:
            file_count += 1    # first time this name is seen anywhere in the tree
        # Names are stored without their directory, so the same name found in
        # several directories shares one entry whose value counts the occurrences.
        file_dict[file_name] = file_dict.get(file_name, 0) + 1
#Your code ends here

Displaying the count of files.

In [59]:
# file_count is the number of distinct file names recorded during the directory walk.
print("\nTotal Number  of files\n", file_count)


Total Number  of files
 3


Displaying Dictionary containing all files.

In [60]:
# Keys are file names; each value is how many times that name was seen during the walk
# (a name present in more than one directory is counted once per occurrence).
print("\nDictionary containing  files\n", file_dict)


Dictionary containing  files
 {'f1.txt': 2, 'f2.txt': 2, 'f3.txt': 1}


# Step 3 Extracting Unique Vocabulary

In [61]:
# Write code to collect the unique words from every file into a set and print them

In [62]:
#Your code starts here
# Gather every whitespace-separated token from every recorded file.
# File names are opened as given, i.e. relative to the current working directory.
word_list = []
for keys in file_dict:
    # The with-block closes the file even if reading fails.
    with open(keys,"r") as input_file:
        word_list += input_file.read().split()
# Converting to a set removes duplicates; no further filtering is needed.
unique_word_set = set(word_list)
print("Unique words in files\n",unique_word_set)
print("Count of files ", file_count)
#Your code ends here

Unique words in files
 {'my', 'book', 'pen', 'is', 'This', 'interesting'}
Count of files  3


### Expected Output

<img src="4.png"/>

# Step 4 Creating Term Document Matrix

Create the term-document matrix using the bag-of-words approach, and display its contents both initially and after filling.

In [39]:
# Create the term-document matrix so that the columns are the unique words and the rows are the files
# Write code to count all the unique words' appearances in all the files and store it in a dictionary for words

In [63]:
#Your code starts here
# One row per file, one column per unique word; every cell starts at 0.
term_doc_matrix = np.zeros(shape=(file_count, len(unique_word_set)))
print("Term Doc Matrix Initially\n", term_doc_matrix)

# Number the unique words in the set's iteration order; the number is the word's column.
word_dict = {word: column for column, word in enumerate(unique_word_set)}
print ("\nDictionary of unique words\n", word_dict)

# NOTE: no copy is made here -- file_value_dict is a second name for the very same
# dict object as file_dict, so the loop below also replaces the values visible
# through file_dict with row numbers.
file_value_dict = file_dict
for row, file_name in enumerate(file_value_dict):
    file_value_dict[file_name] = row
print("\nDictionary of files\n", file_value_dict)
#Your code ends here

Term Doc Matrix Initially
 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]

Dictionary of unique words
 {'my': 0, 'book': 1, 'pen': 2, 'is': 3, 'This': 4, 'interesting': 5}

Dictionary of files
 {'f1.txt': 0, 'f2.txt': 1, 'f3.txt': 2}


### Expected Output

<img src="5.png"/>

# Step 5 Filling Term Document Matrix

In [41]:
# Fill the term doc matrix by checking if the unique word exists in a file or not
# If it exists then substitute a 1 in term_doc_matrix (eg : TERM_DOC_MATRIX[file][word] = 1 ) 
# Do the same for all the files present in the directory

In [42]:
#Your code starts here
# For each file, mark with 1 every vocabulary word that occurs in it.
for keys in file_value_dict:
    # The with-block closes the file even if reading fails.
    with open(keys,"r") as input_file:
        file_content = input_file.read()
    # A set gives constant-time membership tests for the vocabulary loop below.
    words_in_file = set(file_content.split())
    for word in word_dict:
        if word in words_in_file:
            # Row = this file's number, column = this word's number.
            term_doc_matrix[file_value_dict[keys], word_dict[word]] = 1
print("Term Doc Matrix after Filling\n" , term_doc_matrix)
#Your code ends here

Term Doc Matrix after Filling
 [[1. 1. 0. 1. 1. 0.]
 [1. 0. 1. 1. 1. 0.]
 [1. 1. 0. 1. 0. 1.]]


### Expected Output

<img src="6.png"/>

# Step 6 Asking for a user Query

In [43]:
# For the user query, make a column vector whose length is the number of unique words in the set

In [49]:
#Your code starts here
# The query is represented as a column of zeros with one entry per vocabulary word.
user_query_vector_rows = len(word_dict)
query_shape = (user_query_vector_rows, 1)
user_query_vector = np.zeros(shape=query_shape, dtype=np.float64)
print(user_query_vector)
#Your code ends here

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


### Expected Output

<img src="7.png"/>

In [47]:
# Read the free-text search query from the user.
query = input("\nWrite something for searching  ")
# Check every word of the query to see whether it exists in the set of unique words.
# If it exists, increment that word's entry in the user query vector.



Write something for searching  This is my book I like it as my book my pen my interesting


In [50]:
#Your code starts here
# Start from zeros so that re-running this cell does not add to counts left over
# from a previous run.
user_query_vector.fill(0)
# split() with no argument splits on any run of whitespace, matching how the
# vocabulary was tokenised, and never yields empty strings.
new_query = query.split()
for word in new_query:
    # Words outside the vocabulary have no row in the vector and are ignored.
    if word in word_dict:
        user_query_vector[word_dict[word], 0] += 1
print(user_query_vector)
#Your code ends here

[[4.]
 [2.]
 [1.]
 [1.]
 [1.]
 [1.]]


### Expected Output

<img src="8.png"/>

# Step 7 Displaying Resultant Vector

Display 
1. Resultant vector.
2. Max value in resultant vector.
3. Index of max value in resultant vector.


In [51]:
#Your code starts here
# Score each file: entry i is the dot product of file i's row in the
# term-document matrix with the query's word counts.
resultant = np.dot(term_doc_matrix, user_query_vector)
# np.argmax returns the first position holding the largest value, so ties go to
# the earliest file. It is also well defined when every score is 0 (no query word
# is in the vocabulary); in that case index is 0 and the maximum is 0.0.
index = int(np.argmax(resultant))
maximum = resultant[index]    # one-element row, so maximum[0] is the score itself
print(resultant)
print("Maximum in resultant is: ", maximum[0])
print("Index of maximum in resultant is: ", index)
#Your code ends here

[[8.]
 [7.]
 [8.]]
Maximum in resultant is:  8.0
Index of maximum in resultant is:  0


### Expected Output

<img src="9.png"/>

# Step 8 Displaying the contents of file


In [None]:
#Write the code to identify the file_name having maximum value in the resultant vector and display its contents.

In [52]:
#Your code starts here
# file_value_dict maps each file name to its row number in the term-document
# matrix; look up the name whose row number equals the winning index.
best_file = None
for file_name in file_value_dict:
    if file_value_dict[file_name] == index:
        best_file = file_name
        break    # row numbers are unique, so the first hit is the only one

if best_file is None:
    # Avoid printing leftover content from an earlier cell when nothing matches.
    print("No file found for index", index)
else:
    # The with-block closes the file even if reading fails.
    with open(best_file,"r") as input_file:
        file_content = input_file.read()
    print(file_content)
#Your code ends here

This is my book


Congratulations! Now you are able to build your own small IRS.