# Assignment 5

## Some Setup/Imports

In [1]:
# Getting the post reader
from post_parser_record import PostParserRecord
# Parse Posts_Coffee.xml once; later cells look questions up via post_reader.map_questions
POSTS_XML_PATH = "Posts_Coffee.xml"
post_reader = PostParserRecord(POSTS_XML_PATH)

In [2]:
# Making sure correct version of pyterrier is installed
!pip install python-terrier
!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/terrier-org/pyterrier.git
  Cloning https://github.com/terrier-org/pyterrier.git to /tmp/pip-req-build-_s8oq73s
  Running command git clone -q https://github.com/terrier-org/pyterrier.git /tmp/pip-req-build-_s8oq73s


In [3]:
# Importing and starting pyterrier
import pandas as pd
import pyterrier as pt
# Packages handed to pt.init() as boot packages
TERRIER_BOOT_PACKAGES = ['com.github.terrierteam:terrier-prf:-SNAPSHOT']

# Initialise PyTerrier only when it has not been started yet, so re-running the cell is harmless
if not pt.started():
  pt.init(boot_packages=TERRIER_BOOT_PACKAGES)

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [4]:
import string
import re

# Cleans the text

# Matches one HTML tag; DOTALL lets a tag that is broken across lines still match
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)
# Translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_string(s):
  """Strip HTML tags, newlines and ASCII punctuation from ``s``.

  Tags and newlines are each replaced by a single space so that the words on
  either side of them do not run together; punctuation characters are deleted
  outright. Whitespace is not collapsed, so callers are expected to ``split()``
  the result.

  Args:
    s: Text to clean.

  Returns:
    The cleaned text.
  """
  # Tags become a space: "<p>foo</p><p>bar</p>" must yield "foo" and "bar", not "foobar"
  s = _HTML_TAG_RE.sub(' ', s)
  # Removes newlines and adds a space so that words do not combine
  s = s.replace('\n', ' ')
  # Removes punctuation
  return s.translate(_PUNCTUATION_TABLE)

In [5]:
# More imports
from gensim.models import Word2Vec
import numpy as np
from numpy.linalg import norm
from prettytable import PrettyTable

## Question 1 - Word2Vec

In [6]:
# Getting list of all question titles.
# Only the title is used (not the body): it is lower-cased, stripped, cleaned
# and split, giving one list of word tokens per question.
titles = [
  clean_string(post_reader.map_questions[question_id].title.lower().strip()).split()
  for question_id in post_reader.map_questions
]

In [7]:
# Creating Word2Vec model and training on the titles with 10 epochs.
# The corpus is deliberately not passed to the constructor: doing so would
# already train the model for the default number of epochs, and the explicit
# train() call below would then add 10 more instead of training for 10 in total.
model = Word2Vec(window=5, min_count=1, workers=4)
model.build_vocab(titles)
model.train(titles, total_examples=len(titles), epochs=10)



(84623, 122550)

In [8]:
# Query text, cleaned and lower-cased, then split into its word tokens
raw_query = "When does coffee go off?"
query = clean_string(raw_query).lower()
querywords = query.split()

In [9]:
# Computing vector for each title and storing it in dict of (title : vector) pairing
dict_of_vec = {}

for title_tokens in titles:
  # A title that cleaned down to no words has no average vector (0 / 0 gives NaN); skip it
  if not title_tokens:
    continue

  # Accumulator sized from the model rather than hard-coded to 100
  vector_sum = np.zeros(model.vector_size)

  for word in title_tokens:
    vector_sum = vector_sum + model.wv[word]  # Add the numpy vector of the word

  # Average of the word vectors, keyed by the title turned back into a string
  dict_of_vec[' '.join(title_tokens)] = vector_sum / len(title_tokens)

In [10]:
# Getting the vector for the query
query_vec = np.zeros(model.vector_size) # Vector of 0's, sized from the model

# model.wv[word] raises KeyError for a word the model has never seen,
# so only the query words present in the vocabulary are used
known_querywords = [word for word in querywords if word in model.wv]

for word in known_querywords:
  query_vec = query_vec + model.wv[word]  # Add the numpy vector of the word

# Average over the words that were actually summed; when none are known the
# vector is left as all zeros instead of dividing by zero
if known_querywords:
  query_vec = query_vec / len(known_querywords)

In [11]:
# Do cosine sim on result from query and titles
cosim_dict = {}

# The query norm is the same for every title, so compute it once
query_norm = norm(query_vec)

for title, vector in dict_of_vec.items():
  denominator = query_norm * norm(vector)
  if np.isfinite(denominator) and denominator > 0:
    cosim_dict[title] = np.dot(query_vec, vector) / denominator
  else:
    # Zero or NaN vector: the similarity is undefined, and a NaN score would
    # make the sort below unreliable, so score it 0.0 instead
    cosim_dict[title] = 0.0

# Sort the dict by similarity, highest first
cosim_dict = dict(sorted(cosim_dict.items(), key=lambda item: item[1], reverse = True))

In [12]:
# Function to display topk results from a dict of form (title : cosine similarity) for a query
def topk(topk, dict):
  """Print the first ``topk`` entries of ``dict`` with their similarity scores.

  Entries are taken in the dict's own iteration order, so pass a dict that is
  already sorted by descending score. When ``topk`` is larger than the number
  of entries, every entry is printed instead of raising IndexError.

  The parameter names shadow this function and the ``dict`` builtin; they are
  kept unchanged so existing calls keep working.

  Args:
    topk: Number of results requested; also shown in the header line.
    dict: Mapping of title -> cosine similarity.
  """
  print("Top " + str(topk) + " similar question titles: \n")

  # Materialise the entries once instead of rebuilding the key list per result
  entries = list(dict.items())[:max(topk, 0)]

  for position, (title, score) in enumerate(entries, start=1):
    print("Result " + str(position) + ':')
    print(str(title))
    print("With sim score: " + str(score) + "\n")

### Results

In [13]:
# First five entries of the similarity-sorted dict
topk(topk=5, dict=cosim_dict)

Top 5 similar question titles: 

Result 1:
when does coffee go off
With sim score: 1.0000000000000002

Result 2:
do coffee connoisseurs add anything to their coffee when tasting a new coffee
With sim score: 0.9999899562249565

Result 3:
why do some coffee beans produce more dust than others when ground
With sim score: 0.999989624987434

Result 4:
why does instant coffee foam when stirring it
With sim score: 0.9999886507757055

Result 5:
how does coffee grind size affect coffee taste
With sim score: 0.9999882786056138



#### Making the table

In [14]:
# Function to create the table 

# Takes topk and dict as inputs
def table(topk, dict):
  """Print the first ``topk`` entries of ``dict`` as a two-column PrettyTable.

  Entries are taken in the dict's own iteration order, so pass a dict that is
  already sorted by descending score. When ``topk`` is larger than the number
  of entries, every entry is shown instead of raising IndexError.

  Args:
    topk: Maximum number of rows to show.
    dict: Mapping of title -> cosine similarity.
  """
  myTable = PrettyTable(["Title", "Cosine Similarity"])

  # Materialise the entries once instead of rebuilding the key list per row
  for title, score in list(dict.items())[:max(topk, 0)]:
    # Adds rows
    myTable.add_row([title, score])

  print(myTable)

In [15]:
# Same five entries, shown as a table
table(topk=5, dict=cosim_dict)

+-------------------------------------------------------------------------------+--------------------+
|                                     Title                                     | Cosine Similarity  |
+-------------------------------------------------------------------------------+--------------------+
|                            when does coffee go off                            | 1.0000000000000002 |
| do coffee connoisseurs add anything to their coffee when tasting a new coffee | 0.9999899562249565 |
|       why do some coffee beans produce more dust than others when ground      | 0.999989624987434  |
|                 why does instant coffee foam when stirring it                 | 0.9999886507757055 |
|                 how does coffee grind size affect coffee taste                | 0.9999882786056138 |
+-------------------------------------------------------------------------------+--------------------+


## Question 2: Classification

In [16]:
# Method to get the query class by taking the tags from each document number 
# given by bm25 retrieval and returning the tag that appears most frequently

def getQueryClass (query,topk):
  """Classify ``query`` by the most frequent tag among its BM25 results.

  Runs a BM25 search for ``query`` over ``indexref1`` asking for ``topk``
  results, looks every returned docno up in ``post_reader.map_questions`` and
  counts each tag on those questions. The counts are printed in descending
  order; tags with equal counts keep the order in which they were first seen.

  Args:
    query: Query text passed to the retriever.
    topk: Number of results requested from the retriever.

  Returns:
    "The class of the query is: <tag>" for the most frequent tag, or a message
    saying that no class could be determined when no tags were collected.
  """

  # BM25 model of retrieval k = topk
  bm25 = pt.BatchRetrieve(indexref1, num_results = topk, wmodel="BM25").search(query)

  # docnos were indexed as strings; map_questions is looked up by integer id
  docs = bm25['docno'].astype(int)

  tagfreq = {}

  for docno in docs:
    for tag in post_reader.map_questions[docno].tags:
      tagfreq[tag] = tagfreq.get(tag, 0) + 1

  # Sort
  tagfreq = dict(sorted(tagfreq.items(), key=lambda item: item[1], reverse=True))
  print("Tags with tag count for the query: " + query + "\n")
  print(str(tagfreq) + "\n")

  # Nothing retrieved (or no tags on the retrieved questions): there is no
  # first key to take, so report that instead of raising IndexError
  if not tagfreq:
    return "No class could be determined for the query: " + query

  qclass = next(iter(tagfreq))

  return "The class of the query is: " + qclass

### Indexing/Setting up queries

In [17]:
# One record per question (docno as a string, plus title and body) for the indexer
questions = [
  {
    'docno': str(question_id),
    'title': post_reader.map_questions[question_id].title,
    'body': post_reader.map_questions[question_id].body,
  }
  for question_id in post_reader.map_questions
]

In [18]:
# Build the index under ./index (overwriting any existing one); indexref1 is what
# index() returns and is later handed to pt.BatchRetrieve
INDEX_PATH = "./index"
META_LENGTHS = {'docno': 20, 'title': 10000, 'body':20000}
RETRIEVAL_FIELDS = ['body', 'title']

iter_indexer = pt.IterDictIndexer(INDEX_PATH, meta=META_LENGTHS, overwrite=True)
# The tokeniser property has to be set before index() is called
iter_indexer.setProperty("tokeniser", "UTFTokeniser")
indexref1 = iter_indexer.index(questions, fields=RETRIEVAL_FIELDS)

In [19]:
# The three queries to classify
q1, q2, q3 = (
  "how to make espresso",
  "moka pot",
  "coffee caffeine",
)

### Retrieval for k = 1

In [20]:
# q1, one result requested
getQueryClass(query=q1, topk=1)

Tags with tag count for the query: how to make espresso

{'espresso': 1, 'roasting': 1, 'coffee-blends': 1}



'The class of the query is: espresso'

In [21]:
# q2, one result requested
getQueryClass(query=q2, topk=1)

Tags with tag count for the query: moka pot

{'moka': 1}



'The class of the query is: moka'

In [22]:
# q3, one result requested
getQueryClass(query=q3, topk=1)

Tags with tag count for the query: coffee caffeine

{'caffeine': 1, 'health': 1}



'The class of the query is: caffeine'

### Retrieval for k = 5

In [23]:
# q1, five results requested
getQueryClass(query=q1, topk=5)

Tags with tag count for the query: how to make espresso

{'espresso': 5, 'espresso-machine': 2, 'roasting': 1, 'coffee-blends': 1, 'equipment': 1, 'quality': 1, 'crema': 1, 'flavor': 1, 'grinder': 1, 'barista': 1}



'The class of the query is: espresso'

In [24]:
# q2, five results requested
getQueryClass(query=q2, topk=5)

Tags with tag count for the query: moka pot

{'moka': 4, 'brewing-process': 2, 'grinder': 1, 'espresso': 1}



'The class of the query is: moka'

In [25]:
# q3, five results requested
getQueryClass(query=q3, topk=5)

Tags with tag count for the query: coffee caffeine

{'caffeine': 4, 'health': 3, 'chemistry': 1, 'capsule': 1, 'espresso': 1}



'The class of the query is: caffeine'

### Retrieval for k = 10

In [26]:
# q1, ten results requested
getQueryClass(query=q1, topk=10)

Tags with tag count for the query: how to make espresso

{'espresso': 9, 'espresso-machine': 5, 'roasting': 1, 'coffee-blends': 1, 'equipment': 1, 'quality': 1, 'crema': 1, 'flavor': 1, 'grinder': 1, 'barista': 1, 'brewing-process': 1, 'acid': 1}



'The class of the query is: espresso'

In [27]:
# q2, ten results requested
getQueryClass(query=q2, topk=10)

Tags with tag count for the query: moka pot

{'moka': 9, 'brewing-process': 2, 'equipment': 2, 'grinder': 1, 'espresso': 1, 'cleaning': 1, 'v60': 1, 'espresso-machine': 1, 'induction': 1}



'The class of the query is: moka'

In [28]:
# q3, ten results requested
getQueryClass(query=q3, topk=10)

Tags with tag count for the query: coffee caffeine

{'caffeine': 8, 'health': 5, 'capsule': 2, 'chemistry': 1, 'espresso': 1, 'beans': 1, 'nespresso': 1, 'brewing-process': 1}



'The class of the query is: caffeine'

### Which value of k was better?

Each value of k returned the same tag as the class for every query, so in that respect k = 1, k = 5, and k = 10 perform about the same. I would still say that k = 1 is the worst, because it does not provide enough tags/tag counts to determine the class conclusively: with a single retrieved question, every tag has a frequency of exactly 1, so when that question has multiple tags they all tie, and the tag that gets returned depends only on the order in which the tags happen to be listed. For k = 5 and k = 10, the top tags are similar and their counts grow roughly in proportion to k. k = 10 returns a wider variety of tags while still producing a valid class tag for each query, but not all of those additional tags are directly related to the query.