# Imports

In [None]:
# NLTK supplies the tokenizer and the English stop-word list used when building the index.
import nltk
# Make sure the tokenizer models and the stop-word corpus are available locally
# before they are imported and used below.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NOTE(review): `string` and `csv` are not referenced in the cells visible here.
import string

import csv

# Project-local parser; the posts file is loaded once here and the resulting
# reader is shared (as a module-level global) by the indexing code.
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Process the data into an inverted index

In [None]:
def process_data():
  """Build an inverted index over every question and answer in `post_reader`.

  Questions contribute their title and body; answers contribute their body.
  Each text is tokenized with `word_tokenize`; tokens that are not purely
  alphanumeric are dropped, the rest are lowercased, and English stop words
  are removed.

  Returns:
    dict: term -> {post id -> number of occurrences of the term in that post}.
  """
  stop_words = set(stopwords.words('english'))
  words_dict = {}

  # go thru all the questions (title first, then body)
  for question_id, question in post_reader.map_questions.items():
    _index_text(words_dict, question_id, question.title, stop_words)
    _index_text(words_dict, question_id, question.body, stop_words)

  # go thru all the answers
  for answer_id, answer in post_reader.map_just_answers.items():
    _index_text(words_dict, answer_id, answer.body, stop_words)

  return words_dict


def _index_text(words_dict, doc_id, text, stop_words):
  """Tokenize `text` and add its term counts for `doc_id` to `words_dict` in place."""
  for token in word_tokenize(text):
    if not token.isalnum():
      continue

    # Lowercase BEFORE the stop-word test: the stop-word list is lowercase, so
    # checking the raw token would let "The", "I", "This", ... into the index.
    term = token.lower()
    if term in stop_words:
      continue

    postings = words_dict.setdefault(term, {})
    postings[doc_id] = postings.get(doc_id, 0) + 1
  
# Build the inverted index once; it is kept as a module-level dict so the
# search and export code can read it.
words_dict = process_data()
# Debugging aid: uncomment to dump every term with its postings.
# for key, value in words_dict.items():
#     print(key, ' : ', value)

# Term at a Time Ranking

In [None]:
# Take a search and print the top results
def search_query(search_text, index=None, top_k=10):
  """Print the top-ranked documents that contain every term of a query.

  Ranking is term-at-a-time: the postings of the query terms are intersected
  and a document's score is the sum of its occurrence counts over all terms.

  Args:
    search_text: Whitespace-separated query. Terms are lowercased before
      lookup because the index stores lowercased terms.
    index: Inverted index mapping term -> {doc id -> count}. Defaults to the
      module-level `words_dict`.
    top_k: Maximum number of results to print (default 10).

  Returns:
    None. Results (or a "No Valid Searches" notice) are printed to stdout.
  """
  if index is None:
    index = words_dict

  # split the search into its terms, normalised the same way as the index keys
  search_words = search_text.lower().split()

  # every term must exist in the index, and there must be at least one term
  if not search_words or any(word not in index for word in search_words):
    print("No Valid Searches for: " + search_text)
    return

  # start from a copy of the first term's postings so the index is never aliased
  search_docs_rank = dict(index[search_words[0]])

  # intersect the doc ids with each further term while summing the counts, so
  # only documents containing all of the search terms survive
  for word in search_words[1:]:
    search_docs_rank = {
        doc_id: count + search_docs_rank[doc_id]
        for doc_id, count in index[word].items()
        if doc_id in search_docs_rank
    }

  # all terms exist, but no single document contains every one of them
  if not search_docs_rank:
    print("No Valid Searches for: " + search_text)
    return

  # order by score, highest first (stable sort keeps tie order deterministic)
  ranked = sorted(search_docs_rank.items(), key=lambda item: item[1], reverse=True)

  print("Search Results for: " + search_text)
  # slice instead of indexing 1..10 so fewer than top_k matches cannot raise IndexError
  for rank, (doc_id, count) in enumerate(ranked[:max(top_k, 0)], start=1):
    print(str(rank) + ": " + str(doc_id) + " with " + str(count) + " Occurences")
      
  
# Run a sample two-term query against the index and print its results.
search_query("persian coffee")

No Valid Searches for: persian coffee


**Results**

Search Results for: espresso

1: 3269 with 33 Occurences: Relevant

2: 1574 with 23 Occurences: Relevant

3: 2095 with 20 Occurences: Relevant

4: 283 with 14 Occurences: Not Relevant

5: 2077 with 13 Occurences: Relevant

6: 5537 with 13 Occurences: Relevant

7: 2087 with 12 Occurences: Relevant

8: 2116 with 12 Occurences: Relevant

9: 3721 with 12 Occurences: Relevant

10: 3438 with 11 Occurences: Not Relevant

---------------------------------------------------

Search Results for: turkish coffee

1: 2392 with 65 Occurences: Relevant

2: 1833 with 19 Occurences: Relevant

3: 4407 with 19 Occurences: Not Relevant

4: 4185 with 17 Occurences: Not Relevant

5: 4273 with 17 Occurences: Not Relevant

6: 5095 with 16 Occurences: Relevant

7: 3101 with 15 Occurences: Not Relevant

8: 165 with 13 Occurences: Not Relevant

9: 2379 with 13 Occurences: Relevant

10: 2647 with 13 Occurences: Not Relevant

---------------------------------------------------

No Valid Searches for: persian coffee



# Questions

**Part A**

Measure|espresso | turkish coffee | persian coffee
---------|--------|--------------|------------
Precision @ 10|0.8 | 0.4 | 0 (No documents recalled)

--------------------------------------
**Part B**

For Espresso, the precision in assignment 1 was 0.3 and the precision in this assignment is 0.8, a big improvement. I believe this is because, for a single-word query, this method returns the documents that mention that word the most.

For Turkish Coffee, the precision in assignment 1 was 0.6 and the precision in this assignment is 0.4, a significant drop. I believe this is because, with this method, "coffee" is just as important for ranking as "turkish". This matters because the method can return a post that mentions coffee in general many times but turkish coffee only once, which is what I found when I looked through the returned documents.

For Persian Coffee, the precision in both assignments was 0. This is because there are no posts containing "persian" in the collection.

Overall, I noticed that more answers than questions were returned this time. This should lead to better results, since it's hard to find an answer to a query in a question.

# Export to TSV


In [None]:
# Write the inverted index as TSV: one line per term, the term first, followed
# by one tab-separated "<doc id>:<count>" field per posting.
# The encoding is explicit because terms may contain non-ASCII characters
# (str.isalnum() accepts them), and the platform default is not always UTF-8.
with open('coffee_index.tsv', 'w', encoding='utf-8') as f:
  for word, postings in words_dict.items():
    fields = [str(word)]
    fields.extend(f"{doc_id}:{count}" for doc_id, count in postings.items())
    f.write("\t".join(fields) + "\n")