# Imports

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import string

import csv

import math

from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

from scipy.spatial import distance

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# BM25

### Create BM25

In [None]:
def process_bm25():
  """Build the BM25 index over every question in ``post_reader.map_questions``.

  Each question's title and body are tokenized with ``word_tokenize``; tokens
  that are not purely alphanumeric are dropped, the rest are lower-cased, and
  English stop words (plus the token ``'p'``) are removed.

  Returns:
    A 4-tuple ``(tf_dict, idf_dict, doc_length, average_length)`` where
      * ``tf_dict[word][question_id]`` is the number of occurrences of
        ``word`` in the question divided by the question's filtered token
        count,
      * ``idf_dict[word]`` is ``log2(N / df)`` with ``N`` the number of
        questions and ``df`` the number of questions containing ``word``,
      * ``doc_length[question_id]`` is the question's filtered token count,
      * ``average_length`` is the mean filtered token count, or ``0.0`` when
        there are no questions.
  """
  stop_words = set(stopwords.words('english'))
  # presumably what is left of <p> tags in the post bodies -- TODO confirm
  stop_words.add('p')

  tf_dict = {}
  doc_length = {}
  total_docs = 0
  total_tokens = 0

  # Only questions are indexed; answers (post_reader.map_just_answers) are
  # currently left out of the index.
  for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]

    word_tokens = word_tokenize(question.title)
    word_tokens += word_tokenize(question.body)

    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so testing the raw token would let "The", "How", ... into the index.
    lowered = (token.lower() for token in word_tokens if token.isalnum())
    filtered_text = [word for word in lowered if word not in stop_words]

    terms_amt = len(filtered_text)
    doc_length[question_id] = terms_amt
    total_docs += 1
    total_tokens += terms_amt

    # Count occurrences first, then normalize once per distinct word.
    counts = {}
    for word in filtered_text:
      counts[word] = counts.get(word, 0) + 1

    # NOTE(review): tf is stored length-normalized, whereas the textbook BM25
    # formula uses raw counts -- confirm this is intended.
    # terms_amt cannot be 0 here: an empty document has no entries in counts.
    for word, count in counts.items():
      tf_dict.setdefault(word, {})[question_id] = count / terms_amt

  # Every word in tf_dict occurs in at least one document, so df >= 1.
  idf_dict = {word: math.log(total_docs / len(postings), 2)
              for word, postings in tf_dict.items()}

  # Guard the empty-corpus case instead of dividing by zero.
  average_length = (total_tokens / total_docs) if total_docs else 0.0

  return tf_dict, idf_dict, doc_length, average_length

# Process BM25
# Build the index once when this cell runs. The four module-level names
# assigned here are read as globals by bm_25_search.
tf_dict, idf_dict, doc_length, average_length = process_bm25()

In [None]:
# Take a search and print the top results
def bm_25_search(search_text, b, k1, top_n=10):
  """Score indexed documents against a query with BM25 and print the best hits.

  Reads the module-level ``tf_dict``, ``idf_dict``, ``doc_length`` and
  ``average_length``. The query is tokenized with ``word_tokenize``;
  non-alphanumeric tokens are dropped, the rest are lower-cased, and English
  stop words (plus ``'p'``) are removed.

  Args:
    search_text: the free-text query.
    b: weight of the ``doc_length / average_length`` normalization term.
    k1: term-frequency saturation parameter of the BM25 formula.
    top_n: maximum number of results to print (default 10).

  Returns:
    None. A header line and up to ``top_n`` lines of the form
    ``"<rank>. <doc_id> : <score>"`` are printed, best score first.
  """
  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
  # so a capitalized stop word in the query would otherwise slip through.
  lowered = (token.lower() for token in word_tokenize(search_text) if token.isalnum())
  search_words = [word for word in lowered if word not in stop_words]

  doc_scores = {}
  for word in search_words:
    postings = tf_dict.get(word)
    if postings is None:
      # The term occurs in no indexed document, so it contributes nothing.
      continue

    idf = idf_dict[word]
    for doc_id, tf in postings.items():
      length_norm = (1 - b) + (b * (doc_length[doc_id] / average_length))
      term_score = idf * (((k1 + 1) * tf) / ((k1 * length_norm) + tf))
      doc_scores[doc_id] = doc_scores.get(doc_id, 0) + term_score

  # sorted() is stable, so documents with equal scores keep the order in
  # which they were first scored.
  ranked = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)

  print("Search Results for: " + search_text)
  # Slicing prints fewer rows when fewer than top_n documents matched.
  for rank, (doc_id, score) in enumerate(ranked[:max(top_n, 0)], start=1):
    print(str(rank) + ". " + str(doc_id) + " : " + str(score))

### Results

In [None]:
# BM25 parameters shared by every search below:
#   b  weights the doc_length / average_length normalization term in the score,
#   k1 scales that normalization term and sets the (k1 + 1) numerator factor.
b = 0.75
k1 = 1.2

bm_25_search("how to make espresso", b, k1)

bm_25_search("moka pot", b, k1)

bm_25_search("coffee caffeine", b, k1)

Search Results for: how to make espresso
1. 4404 : 1.2394862199905607
2. 5526 : 1.178071422060564
3. 5121 : 1.1312409263567955
4. 4739 : 0.9872093529045914
5. 4258 : 0.9264812157429019
6. 2867 : 0.902578425382284
7. 4175 : 0.8844814651156854
8. 4406 : 0.8515163985919738
9. 94 : 0.8384360248231166
10. 3143 : 0.8384360248231166
Search Results for: moka pot
1. 5066 : 2.391156771453269
2. 4500 : 2.2085357310940084
3. 3381 : 1.9218586430695521
4. 5070 : 1.5981155396408173
5. 97 : 1.5530474695435768
6. 4710 : 1.4572907573703662
7. 122 : 1.4547910272975648
8. 2024 : 1.384641997966383
9. 4299 : 1.3686101594336146
10. 5546 : 1.340349995237566
Search Results for: coffee caffeine
1. 2387 : 1.8587922238595915
2. 204 : 1.8427245558778806
3. 2358 : 1.5643365700272247
4. 94 : 1.4154702530746701
5. 2127 : 1.4154702530746701
6. 3566 : 1.2804855192599771
7. 29 : 1.2662177289375511
8. 462 : 1.2578243525720696
9. 3475 : 1.215776379839394
10. 128 : 1.153389997394759
