# Imports

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import string

import csv

import math

from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

from scipy.spatial import distance

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# TF-IDF

TF-IDF:

TF = term frequency in a document / total number of words in the document

Will be stored in a dictionary like an inverted index

IDF = log( total number of docs / number of docs with the term t )

Will be stored in a dictionary with {term: IDF, term2: IDF2...} Each term will have 1 IDF score. Can use the TF dictionary to see how many docs each term is in

After you have these two dictionaries, go thru the TF one, multiply each value by the corresponding IDF score, and store this in a new dictionary

TF:

tokenize, remove stop words and such

go thru all words

  if the word is not in the tf dictionary then initalize it with {doc_id: 1/total terms}

  else if the word is in the dict, but doc_id isnt in the word then initalize the doc_id with 1/total terms

  else (the word and doc_id is in the dict) then increment by 1/total terms


IDF: 

find total number of docs

go thru all of the words in the tf dictionary

  IDF[word] = log2( # of docs / len(tf_dict[word].keys()) )


TF-IDF:

go thru all the words in tf

go thru all the doc_ids for the word

  if word not in tf_idf.keys() then initialize, tf_idf[word] = {doc_id: tf[word][doc_id] * idf[word]

  else (word is in tf_idf) tf_idf[word][doc_id] = tf[word][doc_id] * idf[word]

In [55]:
def process_tf_idf():

  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # initialize tf, idf, and tf-idf dictionaries
  tf_dict = {}
  idf_dict = {}
  tf_idf_dict = {}

  # calculate tf
  # go thru all questions
  for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]

    # tokenize the title and body and filter out the unwanted stuff
    word_tokens = word_tokenize(question.title)
    word_tokens += word_tokenize(question.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # get the amount of terms in the doc
    terms_amt = len(filtered_text)

    # go thru all the strings in filtered_text
    for word in filtered_text:
      # if the word is already in the dictionary
      if word in tf_dict.keys():
        # if the questionid for the word is there, increment
        if question_id in tf_dict[word].keys():
          tf_dict[word][question_id] += (1 / terms_amt)

        # if the questionid for the word is not there, initialize
        else:
          tf_dict[word][question_id] = (1 / terms_amt)

      # if the word is not in the dictionary, initialize it
      else:
        tf_dict[word] = {question_id: (1 / terms_amt)}

  # go thru all the answers
  for answer_id in post_reader.map_just_answers:
    answer = post_reader.map_just_answers[answer_id]

    # tokenize the body and filter out the unwanted stuff
    word_tokens = word_tokenize(answer.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # get the amount of terms in the doc
    terms_amt = len(filtered_text)

    # go thru all the strings in filtered_text
    for word in filtered_text:
      # if the word is already in the dictionary
      if word in tf_dict.keys():
        # if the answerid for the word is there, increment
        if answer_id in tf_dict[word].keys():
          tf_dict[word][answer_id] += (1 / terms_amt)

        # if the answerid for the word is not there, initialize
        else:
          tf_dict[word][answer_id] = (1 / terms_amt)

      # if the word is not in the dictionary, initialize it
      else:
        tf_dict[word] = {answer_id: (1 / terms_amt)}
    
  
  # calculate idf
  # get total amount of docs
  doc_amount = len(post_reader.map_questions) + len(post_reader.map_just_answers)

  # go thru all the words in tf dictionary and calculate idf
  idf_dict = {word: (math.log( (doc_amount / len(tf_dict[word].keys())) , 2)) for word in tf_dict.keys()}


  # calculate tf-idf
  # go thru all the words in tf dictionary
  for word in tf_dict.keys():
    # go thru all the docs for that word
    for doc_id in tf_dict[word].keys():
      # calculate tf-idf and store it in tf-idf dictionary
      # if word is not already in tf-idf dict then initialize
      if word not in tf_idf_dict.keys():
        tf_idf_dict[word] = {doc_id: (tf_dict[word][doc_id] * idf_dict[word])}
      # if word is already in tf-idf dict then add the docid
      else:
        tf_idf_dict[word][doc_id] = (tf_dict[word][doc_id] * idf_dict[word])
  
  # return the tf-idf dictionary
  return tf_idf_dict

# call the tf-idf processing function and store results in a dictionary
tf_idf_dict = process_tf_idf()

Ranking based on TF-IDF:

Term at a time

In [56]:
# Take a search and print the top results
def tf_idf_search(search_text):

  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # tokenize the body and filter out the unwanted stuff
  word_tokens = word_tokenize(search_text)
  search_words = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

  # initialize a dictionary to the inverted index entry of the first word in the search
  search_docs_rank = tf_idf_dict[search_words[0]].copy()

  # print(search_words)

  # go thru the rest of the terms
  for i in range(1, len(search_words)):
    # put the scores of the word in a temp dictionary
    temp_dict = tf_idf_dict[search_words[i]].copy()

    # go thru the doc_ids in the temp dictionary
    for doc_id in temp_dict:

      # if the doc_id is already in the main dict, then increment
      if doc_id in search_docs_rank.keys():
        search_docs_rank[doc_id] += temp_dict[doc_id]

      # if the doc_id is not in the main dict, initialize
      else:
        search_docs_rank[doc_id] = temp_dict[doc_id]
 
  # print(search_docs_rank)
  # order by the values
  sorted_by_value = dict(sorted(search_docs_rank.items(), key=lambda item: item[1], reverse=True))
  print("Search Results for: " + search_text)
  for i in range(1,6):
    # print(str(list(sorted_by_value.keys())[i-1]) + "\n")
    # print(str(list(sorted_by_value.values())[i-1]) + "\n")
    print(str(i) + ". " + str(list(sorted_by_value.keys())[i-1]) + " : " + str(list(sorted_by_value.values())[i-1]))

In [57]:
tf_idf_search("espresso")

tf_idf_search("turkish coffee")

tf_idf_search("making a decaffeinated coffee")

tf_idf_search("can I use the same coffee grounds twice")

Search Results for: espresso
1. 4404 : 0.3755715977425341
2. 3904 : 0.3755715977425341
3. 2867 : 0.32191851235074354
4. 5526 : 0.307285852698437
5. 4258 : 0.30045727819402734
Search Results for: turkish coffee
1. 5182 : 1.406828518900155
2. 5094 : 1.0586643529899669
3. 209 : 0.750308543413416
4. 483 : 0.750308543413416
5. 2522 : 0.6596504998045386
Search Results for: making a decaffeinated coffee
1. 204 : 1.0611436096253164
2. 120 : 0.8098105547792871
3. 2897 : 0.7428005267377216
4. 3293 : 0.5584774999332578
5. 373 : 0.539219655107973
Search Results for: can I use the same coffee grounds twice
1. 2683 : 0.8569033712400295
2. 1749 : 0.5285781531033462
3. 3258 : 0.5150073113135012
4. 3966 : 0.5093941577180555
5. 183 : 0.42714003108048576


# VSM

In [49]:
def process_vsm():

  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # initialize a list
  tokens = []

  # go thru all questions
  for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]

    # tokenize the title and body and filter out the unwanted stuff
    word_tokens = word_tokenize(question.title)
    word_tokens += word_tokenize(question.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # add all of the words into lsit of tokens
    for word in filtered_text:
      tokens.append(word)
    
  
  # go thru all the answers
  for answer_id in post_reader.map_just_answers:
    answer = post_reader.map_just_answers[answer_id]

    # tokenize the body and filter out the unwanted stuff
    word_tokens = word_tokenize(answer.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # add all of the words into lsit of tokens
    for word in filtered_text:
      tokens.append(word)

  # get the amount of unique tokens
  unique_tokens = set(tokens)
  unique_count = len(unique_tokens)

  # create dictionary of unique terms with an index
  i = 0
  unique_dict = {}
  for word in unique_tokens:
    unique_dict[word] = i
    i += 1

  # initizalize a dictionary for storing the vectors
  doc_vectors_dict = {}

  # go thru all questions
  for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]

    # tokenize the title and body and filter out the unwanted stuff
    word_tokens = word_tokenize(question.title)
    word_tokens += word_tokenize(question.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # go thru all the tokens and create a vector based off of them
    temp_vector = [0] * unique_count
    for word in filtered_text:
      temp_vector[unique_dict[word]] += 1
    
    # add the vector to the document vectors dictionary
    doc_vectors_dict[question_id] = temp_vector.copy()
  
  # go thru all answers
  for answer_id in post_reader.map_just_answers:
    answer = post_reader.map_just_answers[answer_id]

    # tokenize the title and body and filter out the unwanted stuff
    word_tokens = word_tokenize(answer.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # go thru all the tokens and create a vector based off of them
    temp_vector = [0] * unique_count
    for word in filtered_text:
      temp_vector[unique_dict[word]] += 1
    
    # add the vector to the document vectors dictionary
    doc_vectors_dict[answer_id] = temp_vector.copy()
  
  # return the document vectors dictionary, unique words, and number of unique words
  return doc_vectors_dict, unique_dict, unique_count

doc_vectors_dict, unique_dict, unique_count = process_vsm()

In [50]:
def vsm_search(search_text):

  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # tokenize the body and filter out the unwanted stuff
  word_tokens = word_tokenize(search_text)
  search_words = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

  # create a vector based off the search
  search_vector = [0] * unique_count
  for word in search_words:
    search_vector[unique_dict[word]] += 1
  
  # create a dictionary for the scores of documents
  docs_scored = {}
  for doc_id in doc_vectors_dict.keys():
    docs_scored[doc_id] = 1 - distance.cosine(search_vector, doc_vectors_dict[doc_id])
  
  # print(search_docs_rank)
  # order by the values
  sorted_by_value = dict(sorted(docs_scored.items(), key=lambda item: item[1], reverse=True))
  print("Search Results for: " + search_text)
  for i in range(1,6):
    # print(str(list(sorted_by_value.keys())[i-1]) + "\n")
    # print(str(list(sorted_by_value.values())[i-1]) + "\n")
    print(str(i) + ". " + str(list(sorted_by_value.keys())[i-1]) + " : " + str(list(sorted_by_value.values())[i-1]))

In [51]:
vsm_search("espresso")

vsm_search("turkish coffee")

vsm_search("making a decaffeinated coffee")

vsm_search("can I use the same coffee grounds twice")

Search Results for: espresso
1. 26 : 0.6108472217815261
2. 3956 : 0.5960395606792697
3. 2095 : 0.5790416107014432
4. 2766 : 0.5746957711326909
5. 1574 : 0.5669084372142384
Search Results for: turkish coffee
1. 5094 : 0.7372097807744856
2. 3074 : 0.7219948723811555
3. 5182 : 0.7071067811865476
4. 2379 : 0.6616825781270739
5. 5095 : 0.629511580291707
Search Results for: making a decaffeinated coffee
1. 120 : 0.5715476066494082
2. 3746 : 0.5101127853361852
3. 3509 : 0.48666426339228763
4. 2158 : 0.4838867031273071
5. 373 : 0.4758309514308865
Search Results for: can I use the same coffee grounds twice
1. 1749 : 0.6757246285173464
2. 2683 : 0.6459422414661738
3. 3258 : 0.6383942119187179
4. 4959 : 0.5976415302112609
5. 3144 : 0.5952522654434597


# BM25

In [58]:
def process_bm25():

  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # initialize tf, idf, document length dictionaries, total docs, and total tokens
  tf_dict = {}
  idf_dict = {}
  doc_length = {}
  total_docs = 0
  total_tokens = 0

  # calculate tf
  # go thru all questions
  for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]

    # tokenize the title and body and filter out the unwanted stuff
    word_tokens = word_tokenize(question.title)
    word_tokens += word_tokenize(question.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # get the amount of terms in the doc
    terms_amt = len(filtered_text)

    # add doc length to dict, increment total docs and total tokens
    doc_length[question_id] = terms_amt
    total_docs += 1
    total_tokens += terms_amt

    # go thru all the strings in filtered_text
    for word in filtered_text:
      # if the word is already in the dictionary
      if word in tf_dict.keys():
        # if the questionid for the word is there, increment
        if question_id in tf_dict[word].keys():
          tf_dict[word][question_id] += (1 / terms_amt)

        # if the questionid for the word is not there, initialize
        else:
          tf_dict[word][question_id] = (1 / terms_amt)

      # if the word is not in the dictionary, initialize it
      else:
        tf_dict[word] = {question_id: (1 / terms_amt)}

  # go thru all the answers
  for answer_id in post_reader.map_just_answers:
    answer = post_reader.map_just_answers[answer_id]

    # tokenize the body and filter out the unwanted stuff
    word_tokens = word_tokenize(answer.body)
    filtered_text = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

    # get the amount of terms in the doc
    terms_amt = len(filtered_text)

    # add doc length to dict, increment total docs and total tokens
    doc_length[answer_id] = terms_amt
    total_docs += 1
    total_tokens += terms_amt

    # go thru all the strings in filtered_text
    for word in filtered_text:
      # if the word is already in the dictionary
      if word in tf_dict.keys():
        # if the answerid for the word is there, increment
        if answer_id in tf_dict[word].keys():
          tf_dict[word][answer_id] += (1 / terms_amt)

        # if the answerid for the word is not there, initialize
        else:
          tf_dict[word][answer_id] = (1 / terms_amt)

      # if the word is not in the dictionary, initialize it
      else:
        tf_dict[word] = {answer_id: (1 / terms_amt)}
    
  
  # calculate idf
  # get total amount of docs
  doc_amount = len(post_reader.map_questions) + len(post_reader.map_just_answers)

  # go thru all the words in tf dictionary and calculate idf
  idf_dict = {word: (math.log( (doc_amount / len(tf_dict[word].keys())) , 2)) for word in tf_dict.keys()}

  # return tf, idf, document length dictionaries and average length
  return tf_dict, idf_dict, doc_length, total_tokens / total_docs

# Process BM25
tf_dict, idf_dict, doc_length, average_length = process_bm25()

In [59]:
# Take a search and print the top results
def bm_25_search(search_text, b, k1):

  stop_words = set(stopwords.words('english'))
  stop_words.add('p')

  # tokenize the body and filter out the unwanted stuff
  word_tokens = word_tokenize(search_text)
  search_words = [word.lower() for word in word_tokens if word not in stop_words and word.isalnum()]

  # initialize a dictionary to store document scores
  doc_scores = {}

  # go thru all the terms
  for word in search_words:
    # go thru all the docs for the word
    for doc_id in tf_dict[word].keys():
      # if the doc has not been score, initialize
      if doc_id not in doc_scores.keys():
        doc_scores[doc_id] = (idf_dict[word] * (((k1 + 1) * tf_dict[word][doc_id]) / ((k1 * ((1 - b) + (b * (doc_length[doc_id] / average_length)))) + tf_dict[word][doc_id])))
      # if the doc has been scored, increment
      else:
        doc_scores[doc_id] += (idf_dict[word] * (((k1 + 1) * tf_dict[word][doc_id]) / ((k1 * ((1 - b) + (b * (doc_length[doc_id] / average_length)))) + tf_dict[word][doc_id])))

  
  # print(search_docs_rank)
  # order by the values
  sorted_by_value = dict(sorted(doc_scores.items(), key=lambda item: item[1], reverse=True))
  print("Search Results for: " + search_text)
  for i in range(1,6):
    # print(str(list(sorted_by_value.keys())[i-1]) + "\n")
    # print(str(list(sorted_by_value.values())[i-1]) + "\n")
    print(str(i) + ". " + str(list(sorted_by_value.keys())[i-1]) + " : " + str(list(sorted_by_value.values())[i-1]))

In [60]:
b = 0.75
k1 = 1.2

bm_25_search("espresso", b, k1)

bm_25_search("turkish coffee", b, k1)

bm_25_search("making a decaffeinated coffee", b, k1)

bm_25_search("can I use the same coffee grounds twice", b, k1)

Search Results for: espresso
1. 3904 : 1.547084428490502
2. 4404 : 1.3737044945367798
3. 2867 : 1.0433688725826213
4. 5526 : 0.9890349752828139
5. 3981 : 0.96761757755849
Search Results for: turkish coffee
1. 5182 : 5.202259238172734
2. 5094 : 3.796604360991974
3. 209 : 2.742667548141526
4. 483 : 2.742667548141526
5. 2522 : 2.0765703993535936
Search Results for: making a decaffeinated coffee
1. 204 : 3.8899468072956545
2. 2897 : 2.6159512025630143
3. 120 : 2.460535411591724
4. 3293 : 2.296214284092522
5. 373 : 1.6920452949896814
Search Results for: can I use the same coffee grounds twice
1. 2683 : 2.47962822849626
2. 3966 : 2.0091653855787985
3. 1749 : 1.6049621533294713
4. 3818 : 1.5168800892726597
5. 4703 : 1.4473461318125382
