# 1. PREPROCESSING THE DATA

## IMPORTING REQUIRED LIBRARIES

In [17]:
import pandas as pd
import numpy as np

In [18]:
dataframe = pd.read_csv("ratings.csv",index_col = "Unnamed: 0")

In [19]:
dataframe.shape

(1319968, 3)

## DROPPING THE RATING COLUMN

In [20]:
dataframe = dataframe.drop(['rating'], axis=1)

## TREATING SAME RESTAURANT AT DIFFERENT LOCATIONS AS ONE ENTITY TO REDUCE SIZE OF POSTING LIST IN LATER IMPLEMENTATIONS

In [21]:
dataframe['review'] = dataframe[['name','review']].groupby(['name'])['review'].transform(lambda x: ''.join(str(x)))

In [22]:
dataframe.shape

(1319968, 2)

## DELETING ALL DUPLICATE COLUMNS

In [23]:
dataframe = dataframe[['name','review']].drop_duplicates()

In [24]:
dataframe.shape

(7041, 2)

In [25]:
dataframe

Unnamed: 0,name,review
0,Jalsa,0 A beautiful place to dine inThe int...
12,Spice Elephant,12 Had been here for dinner with family ...
26,San Churro Cafe,26 Ambience is not that good enough and...
46,Addhuri Udupi Bhojana,46 Great food and proper Karnataka style...
81,Grand Village,81 Very good restaurant in neighbourhood...
...,...,...
1315206,Calcutta North Indian Meals,1315206 This center probably famous for nam...
1315268,Chime - Sheraton Grand Bengaluru Whitefield Ho...,1315268 Nice and friendly place and staff i...
1315289,The Nest - The Den Bengaluru,1315289 Great ambience looking nice good s...
1315306,Nawabs Empire,1315306 This place is not at all good We ha...


## MAKING ALL STRINGS LOWER CASE

In [26]:
dataframe['review'] = dataframe['review'].str.lower()

## REMOVING ERRONEOUS NUMBERS FROM THE REVIEWS

In [27]:
dataframe.review = dataframe.review.str.replace('\d+', '')

  dataframe.review = dataframe.review.str.replace('\d+', '')


## REMOVING STOP WORDS

In [28]:
import nltk.corpus
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
dataframe['review'] = dataframe['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## ONLY KEEPING WORDS PRESENT IN THE ENGLISH DICTIONARY

In [30]:
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [31]:
dataframe['review'] = dataframe['review'].apply(lambda x: ' '.join([word for word in x.split() if word in (words)]))

In [32]:
dataframe

Unnamed: 0,name,review
0,Jalsa,beautiful place dine dinner family restaurant ...
12,Spice Elephant,dinner family turned ambience really nice staf...
26,San Churro Cafe,ambience good enough went quick bite first big...
46,Addhuri Udupi Bhojana,great food proper style full place half good f...
81,Grand Village,good restaurant buffet great service overwhelm...
...,...,...
1315206,Calcutta North Indian Meals,center probably famous north object
1315268,Chime - Sheraton Grand Bengaluru Whitefield Ho...,nice friendly place staff awesome service bad ...
1315289,The Nest - The Den Bengaluru,great ambience looking nice good selection nes...
1315306,Nawabs Empire,place good ordered negative review would object


## RESETING INDEX

In [33]:
dataframe.reset_index(drop=True, inplace = True)

In [34]:
import nltk

## LEMMETIZING WITHOUT POS INDEX

In [35]:
# import nltk

# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# lemmatizer = nltk.stem.WordNetLemmatizer()

# def lemmatize_text(text):
#     return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]


# dataframe['review'] = dataframe.review.apply(lemmatize_text)

In [36]:
dataframe

Unnamed: 0,name,review
0,Jalsa,beautiful place dine dinner family restaurant ...
1,Spice Elephant,dinner family turned ambience really nice staf...
2,San Churro Cafe,ambience good enough went quick bite first big...
3,Addhuri Udupi Bhojana,great food proper style full place half good f...
4,Grand Village,good restaurant buffet great service overwhelm...
...,...,...
7036,Calcutta North Indian Meals,center probably famous north object
7037,Chime - Sheraton Grand Bengaluru Whitefield Ho...,nice friendly place staff awesome service bad ...
7038,The Nest - The Den Bengaluru,great ambience looking nice good selection nes...
7039,Nawabs Empire,place good ordered negative review would object


## Words like friendly do not get converted so lemmetizing with POS tag

## LEMMETIZING WITH POS TAG

SIZE OF DICTIONARY BEFORE LEMMETIZATION

In [37]:
d = set()
for words in dataframe.review.str.findall(r"\w+").map(set):
    for word in words:
        d.add(word)
print(len(d))

4889


In [38]:
# WORDNET LEMMATIZER (with appropriate pos tags)

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
	if nltk_tag.startswith('J'):
		return wordnet.ADJ
	elif nltk_tag.startswith('V'):
		return wordnet.VERB
	elif nltk_tag.startswith('N'):
		return wordnet.NOUN
	elif nltk_tag.startswith('R'):
		return wordnet.ADV
	else:		
		return None

def pos_tagged(sentence):
    return nltk.pos_tag(nltk.word_tokenize(sentence))

def wordnet_tagged(sentence):
    return list(map(lambda x: (x[0], pos_tagger(x[1])), sentence))

def lem(sentence):
    lemmatized_sentence = []
    for word, tag in sentence:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:	
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


def final(sentence):
    sentence = pos_tagged(sentence)
    sentence = wordnet_tagged(sentence)
    sentence = lem(sentence)
    return sentence
    
dataframe['review'] = dataframe['review'].apply(final)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Administrator/nltk_data'
    - 'c:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data'
    - 'c:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data'
    - 'c:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data'
    - 'C:\\Users\\Administrator\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
dataframe

Unnamed: 0,name,review
0,Jalsa,beautiful place dine dinner family restaurant ...
1,Spice Elephant,dinner family turn ambience really nice staff ...
2,San Churro Cafe,ambience good enough go quick bite first big t...
3,Addhuri Udupi Bhojana,great food proper style full place half good f...
4,Grand Village,good restaurant buffet great service overwhelm...
...,...,...
7036,Calcutta North Indian Meals,center probably famous north object
7037,Chime - Sheraton Grand Bengaluru Whitefield Ho...,nice friendly place staff awesome service bad ...
7038,The Nest - The Den Bengaluru,great ambience look nice good selection nest o...
7039,Nawabs Empire,place good order negative review would object


In [None]:
#estimating the size of the dictionary AFTER LEMMETIZATION

d = set()
for words in dataframe.review.str.findall(r"\w+").map(set):
    for word in words:
        d.add(word)
print(len(d))

4633


In [None]:
#4889 to 4633 is the length of the dictionary after lemmetization

In [None]:
#trying to compress posting list by using docid instead

## 2.  CREATING INVERTED INDEX

In [None]:

new_list = []
for i in range(dataframe.shape[0]):
    for j in dataframe.iloc[i,1].split():
        new_list.append([j,i])
new_list = sorted(new_list)
dict_index = {}
words = []
for i in new_list:
    if i[0] not in words:
        words.append(i[0])
        dict_index[i[0]] = [1,[i[1]]]
    else:  
        if(i[1] not in dict_index[i[0]][1]):    
            dict_index[i[0]][0]+=1
            dict_index[i[0]][1].append(i[1])

In [None]:
dict_index

{'aa': [3, [3925, 5024, 6230]],
 'abandon': [1, [1241]],
 'able': [12,
  [736, 815, 1645, 1902, 2228, 4302, 5089, 6007, 6016, 6474, 6657, 6737]],
 'absolute': [33,
  [234,
   503,
   560,
   767,
   768,
   1018,
   1020,
   1080,
   1508,
   1581,
   1673,
   1774,
   1983,
   2068,
   2206,
   2225,
   2601,
   2682,
   2783,
   3085,
   3647,
   3915,
   3991,
   4172,
   4849,
   5228,
   5251,
   5383,
   5426,
   5974,
   6029,
   6100,
   6604]],
 'absolutely': [166,
  [51,
   66,
   109,
   145,
   147,
   155,
   191,
   204,
   436,
   498,
   506,
   610,
   658,
   660,
   697,
   706,
   739,
   763,
   775,
   859,
   936,
   953,
   1019,
   1031,
   1191,
   1288,
   1331,
   1466,
   1555,
   1575,
   1662,
   1677,
   1728,
   1825,
   1855,
   1876,
   1976,
   1983,
   1996,
   2020,
   2039,
   2053,
   2058,
   2133,
   2153,
   2168,
   2189,
   2192,
   2193,
   2199,
   2250,
   2253,
   2302,
   2318,
   2332,
   2339,
   2442,
   2489,
   2526,
   2528,
   25

## CREATING BIGRAM INVERTED INDEX

In [None]:

bigrams = {}
words = []
for i in range(dataframe.shape[0]):
    for word in dataframe.iloc[i,1].split():
        if word not in words:
            words.append(word)
            new = '$'+word+'$'
            for i in range(len(word)):
                if new[i:i+2] not in bigrams:
                    bigrams[new[i:i+2]] = [word]
                else:
                    bigrams[new[i:i+2]].append(word)

                    

In [None]:
bigrams

{'$b': ['beautiful',
  'best',
  'bad',
  'bite',
  'big',
  'buffet',
  'bar',
  'back',
  'bit',
  'busy',
  'book',
  'bunch',
  'barbecue',
  'break',
  'base',
  'bath',
  'branch',
  'board',
  'become',
  'bake',
  'butter',
  'box',
  'bone',
  'benne',
  'basket',
  'basically',
  'bakery',
  'beside',
  'bread',
  'belong',
  'breakfast',
  'beer',
  'bear',
  'baker',
  'bowl',
  'brilliant',
  'behind',
  'boy',
  'bull',
  'bottle',
  'blend',
  'budget',
  'beat',
  'brownie',
  'blow',
  'brain',
  'blue',
  'buy',
  'block',
  'broken',
  'brightly',
  'bugle',
  'bot',
  'brunch',
  'bed',
  'belief',
  'birthday',
  'bought',
  'bring',
  'black',
  'blessing',
  'buzz',
  'belt',
  'boil',
  'begin',
  'bold',
  'behavior',
  'button',
  'brand',
  'bun',
  'believe',
  'bright',
  'blast',
  'boneless',
  'baby',
  'blueberry',
  'bum',
  'brigade',
  'butterscotch',
  'brown',
  'brew',
  'brewery',
  'bel',
  'birth',
  'baa',
  'better',
  'bustle',
  'bullet',
 

## INVERTED INDEX WITH POSITIONAL INFORMATION

In [None]:
new_list = []
for i in range(dataframe.shape[0]):
    count = 0
    for j in dataframe.iloc[i,1].split():
        new_list.append([j,i,count])
        count+=1
new_list = sorted(new_list)
dict_index = {}
words = []
for i in new_list:
    if i[0] not in words:
        words.append(i[0])
        dict_index[i[0]] = [1,{i[1]:[i[2]]}]
    else:
        if i[1] not in dict_index[i[0]][1]:            
            dict_index[i[0]][0]+=1
            dict_index[i[0]][1][i[1]] = [i[2]]
        else:
            dict_index[i[0]][1][i[1]].append(i[2])
print(dict_index)




In [None]:
#when positional information is included
#dict_index is a dictionary structure
#each word is a key
#The value of each word is a list where first element is document frequency and 
#second element is a dictionary with doc id as key and positional info as a list

In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity



# FUNCTION TO CALCULATE COSINE SIMILARITY

In [None]:
def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text


def cosine_similarity(query_vector, doc_vector):
    dot_product = np.dot(query_vector, doc_vector)
    query_norm = np.linalg.norm(query_vector)
    doc_norm = np.linalg.norm(doc_vector)
    if query_norm == 0 or doc_norm == 0:
        return 0
    else:
        return dot_product / (query_norm * doc_norm)




# SEARCH FUNCTION TAKES CARE OF CREATING VECTORS, WILDCARD QUERIES,  SIMILARITY SCORES AND RANKING RESULTS

In [None]:
def search(query, feedback=False):
    query = preprocess(query)
    query_terms = query.split()
    # Generate vocab and query vector
    vocab = list(set(query_terms))
    query_vector = np.zeros(len(vocab))
    for i, term in enumerate(vocab):
        query_vector[i] = query_terms.count(term)
        
    # Generate phrase vectors
    phrase_vectors = []
    for i in range(len(query_terms)-1):
        if query_terms[i] == '"' and '"' in query_terms[i+1:]:
            j = i+1+query_terms[i+1:].index('"')
            phrase = ' '.join(query_terms[i:j+1])
            phrase_terms = phrase.split()
            phrase_vector = np.zeros(len(vocab))
            for k, term in enumerate(vocab):
                if term in phrase_terms:
                    phrase_vector[k] = phrase_terms.count(term)
            phrase_vectors.append((phrase, phrase_vector))
            
    # Generate wildcard regexes
    wildcard_regexes = []
    for term in query_terms:
        if '*' in term:
            regex = term.replace('*', '\w+')
            wildcard_regexes.append(regex)
            
            
    # Compute similarity scores
    scores = []
    for i,review in enumerate(dataframe['review']):######
        terms = review.split()
        doc_vector = np.zeros(len(vocab))
        for j, term in enumerate(vocab):
            doc_vector[j] = terms.count(term)
        # Check phrase queries
        phrase_match = True
        for phrase, phrase_vector in phrase_vectors:
            if phrase not in review:
                phrase_match = False
                break
            phrase_score = cosine_similarity(phrase_vector, doc_vector)
            phrase_match = phrase_match and (phrase_score > 0)
        if not phrase_match:
            continue
        # Check wildcard queries
        
        for regex in wildcard_regexes:
            if not any(re.match(regex, t) for t in terms):
                continue
        score = cosine_similarity([query_vector], [doc_vector])
        scores.append((i, score))
        
        
    # Rank results
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    results = []
    for i, score in scores:
        review = dataframe.loc[i]['review']
        name = dataframe.loc[i]['name']
        results.append((name, review, score))
        

    # Re-rank results using relevance feedback
    if feedback:
        relevant_docs = []
        nonrelevant_docs = []
        for i, (name, review, score) in enumerate(results):
            print(f'Review {i+1}:')
            print(name)
            print(review)
    
            print(f'Similarity score: {score}')
            feedback = input('Is this review relevant? (y/n): ')
            if feedback.lower() == 'y':
                relevant_docs.append(i)
            else:
                nonrelevant_docs.append(i)
        relevant_scores = [score for i, score in enumerate(scores) if i in relevant_docs]
        nonrelevant_scores = [score for i, score in enumerate(scores) if i in nonrelevant_docs]
        if len(relevant_scores) > 0:
            mean_relevant_score = sum(relevant_scores) / len(relevant_scores)
        else:
            mean_relevant_score = 0
        if len(nonrelevant_scores) > 0:
            mean_nonrelevant_score = sum(nonrelevant_scores) / len(nonrelevant_scores)
        else:
            mean_nonrelevant_score = 0
        alpha = 0.1
        beta = 0.1
        new_scores = []
        for i, (name,review, score) in enumerate(results):
            if i in relevant_docs:
                new_score = (1-alpha)*score + alpha*mean_relevant_score
            elif i in nonrelevant_docs:
                new_score = (1-beta)*score - beta*mean_nonrelevant_score
            else:
                new_score = score
            new_scores.append((i, new_score))
        new_scores = sorted(new_scores, key=lambda x: x[1], reverse=True)
        results = []
        for i, score in new_scores:
            review = dataframe.loc[i]['review']
            name = dataframe.loc[i]['name']
            results.append((name, review, score))
    return results



# RESULTS OF SEARCH

In [None]:
search("tasty cake")

[('Butterly',
  'reasonably price tasty food perfect first look cheese cake nice ambience lot good food one place hangout second reasonably price tasty food perfect first look cheese cake nice ambience lot good food one place hangout second reasonably price tasty food perfect first look cheese cake nice ambience lot good food one place hangout second reasonably price tasty food perfect first look cheese cake nice ambience lot good food one place hangout second object',
  0.9999999999999998),
 ('Cakeport',
  'havent give chance even taste ordered orange butterscotch havent give chance even taste ordered orange butterscotch can not rate store amazing awesome variety use go absolutely best always average dry cake pleasurable good tasty take white forest wow good taste wide range object',
  0.9999999999999998),
 ('Foreign CafÃ\x83Â\x83Ã\x82Â\x83Ã\x83Â\x82Ã\x82Â\x83Ã\x83Â\x83Ã\x82Â\x82Ã\x83Â\x82Ã\x82Â©',
  'foreign one bad ever order disgust service pathetic support take whatever order dont

# LOADING PRETRAINED MODEL FOR SEMANTIC MATCHING

In [None]:
import gensim.downloader as api
import numpy as np

# Load pre-trained Word2Vec model
model = api.load('word2vec-google-news-300')



In [None]:
def tokenize(query):
    query = preprocess(query)
    query_terms = query.split()
    # Generate vocab and query vector
    query_tokens = list(set(query_terms))
    return query_tokens
    

In [None]:
def semantic_matching(query, dataframe):
    # Tokenize query
    query_tokens = tokenize(query)
    # Convert query terms to vectors
    query_vectors = [model[word] for word in query_tokens if word in model.key_to_index]
    # Compute mean vector of query
    query_vector = np.mean(query_vectors, axis=0)
    # Compute similarity between query vector and document vectors
    results = []
    for i, row in dataframe.iterrows():
        document = row['review']
        name = row['name']
        document_tokens = tokenize(document)
        document_vectors = [model[word] for word in document_tokens if word in model.key_to_index]
        if len(document_vectors) > 0:
            document_vector = np.mean(document_vectors, axis=0)
            similarity = cosine_similarity(np.squeeze(np.asarray(query_vector)),np.squeeze(np.asarray(document_vector)))
            results.append((name, document, similarity))
    # Sort results by similarity score
    results = sorted(results, key=lambda x: x[2], reverse=True)
    return results


# RESULTS OF SEMANTIC MATCHING

In [None]:
results = semantic_matching("tasty cakes", dataframe)

In [None]:
results

[('Delicious Desserts',
  'try deliciously taste price quality try chocolate try chocolate truffle tasty homemade chocolate cake really delicious homemade good variety try deliciously taste price quality try chocolate try chocolate truffle tasty homemade chocolate cake really delicious homemade good variety delicious object',
  0.7930152),
 ('Cupcake Noggins',
  'prepare delicious red velvet order chocolate salt caramel cake go feed sweet cupcake small cute outlet serve prepare delicious red velvet cupcake small cute outlet serve prepare delicious red velvet order chocolate salt caramel cake go feed sweet cupcake small cute outlet serve cupcake object',
  0.74693733),
 ('Nawabi Zaica',
  'taste tell order pepper dum yummy nice food extremely spicy even though id tell order dum chilly chicken nice order butter non new tasty spice nice atmosphere best menu food delicious chai good taste tell order pepper dum yummy nice food extremely spicy even though id tell order dum chilly chicken nic

In [39]:
import tkinter as tk
from tkinter import scrolledtext

class IRSystemGUI:
    def __init__(self, master):
        self.master = master
        master.title("Information Retrieval System")

        # Create text box for input query
        self.query_label = tk.Label(master, text="Enter Query:")
        self.query_label.pack()
        self.query_box = tk.Entry(master)
        self.query_box.pack()

        # Create button to initiate search
        self.search_button = tk.Button(master, text="Search", command=self.search)
        self.search_button.pack()

        # Create text box for displaying results
        self.results_label = tk.Label(master, text="Results:")
        self.results_label.pack()
        self.results_box = scrolledtext.ScrolledText(master, height=10, width=50)
        self.results_box.pack()

    def search(self):
        query = self.query_box.get()
        # Perform search operation using IR system and get results
        results = [search(query)[i][0] for i in range(20)]

        # Clear results box
        self.results_box.delete(1.0, tk.END)

        # Display results in results box
        for result in results:
            self.results_box.insert(tk.END, result + "\n")

# Create GUI instance and start main event loop
root = tk.Tk()
ir_system_gui = IRSystemGUI(root)
root.mainloop()





Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\Administrator\AppData\Local\Programs\Python\Python310\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_8340\3582423791.py", line 28, in search
    results = [search(query)[i][0] for i in range(20)]
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_8340\3582423791.py", line 28, in <listcomp>
    results = [search(query)[i][0] for i in range(20)]
NameError: name 'search' is not defined
