<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/feedback_retrieval_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Implement Rocchio feedback algorithm**

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn import metrics
import scipy.sparse
import math

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/TextMining/DataSets
%ls *.csv

/content/drive/MyDrive/TextMining/DataSets
job_title_des.csv


In [5]:
dir_file = os.getcwd() # returns path to current directory
files_dir = os.listdir(dir_file)  # list of files in current directory

csv_files = [f for f in files_dir if f.endswith('csv')]
print(csv_files)
jobs_file = csv_files[0]

jobs_df = pd.read_csv(jobs_file)
print(jobs_df.columns)  # the columns
print(jobs_df.shape)

['job_title_des.csv']
Index(['Unnamed: 0', 'Job Title', 'Job Description'], dtype='object')
(2277, 3)


In [12]:
collection = jobs_df['Job Title'] + '. ' + jobs_df['Job Description']
print(collection.iloc[:10])

0    Flutter Developer. We are looking for hire exp...
1    Django Developer. PYTHON/DJANGO (Developer/Lea...
2    Machine Learning. Data Scientist (Contractor)\...
3    iOS Developer. JOB DESCRIPTION:\n\nStrong fram...
4    Full Stack Developer. job responsibility full ...
5    Java Developer. Software Developer - Integrati...
6    Full Stack Developer. senior full stack develo...
7    JavaScript Developer. Job Description:\n\nReac...
8    DevOps Engineer. Main Responsibilities and Del...
9    Software Engineer. Overview\n\n\nBased in Sili...
dtype: object


In [14]:
import numpy as np
import spacy   # another tokenizer, lemmatizer (has --> be)
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipes('parser', 'ner')
def extract_terms(text):
  '''
      use nlp to extract all terms in a text
      process it with spacy
      lower case
      remove punctuation
      remove stop words
      lemmatize
      extract remaining words
      return a list of words with repetition
      later optimize it with your insight from Assignment 1
  '''

  terms = nlp(text)
  out_terms = [t.lemma_.lower() for t in terms] # if t.is_alpha and not t.is_stop]
  return out_terms

from collections import defaultdict # if a key does not exist it automatically add it

def compute_freq(lst):
  '''
    compute frequency of terms in lst
    return a dictionary of terms (key) and term frequency (values)
    initialize an empty dictionry
    for each word in lst
      add it to dictionary
  '''
  freq_term = defaultdict(int)
  for term in lst:
    freq_term[term] += 1

  return freq_term


In [15]:
# build the inverted index
# a dictionary with keys = terms in the vocabulary
# and values = a dictionary with doc_id as key and term frequency as value
inverted_index = defaultdict(dict) # a dict of dict

for doc_id, doc in enumerate(collection):
  terms_doc     = extract_terms(doc) # apply nlp
  dict_doc      = compute_freq(terms_doc) # compute freq of each term in a document
  # update inverted_index
  for term_doc,term_freq in dict_doc.items():
    inverted_index[term_doc][doc_id] = term_freq

print(inverted_index)

print('Vocab size', len(inverted_index))

Vocab size 16635


In [16]:
# efficiently compute the dot product between query and each text that has at least one
# word in the query
# return a sorted list of top k texts from the most relevant
# (highest dot product) to the least relevant (lowest dot product)

def dist_query_docs(query_term_freq, inverted_index, topk = 5):
  # which docs have terms in the query
  dot_product = defaultdict(int)

  for query_term, query_tf in query_term_freq.items():

    if query_term in inverted_index:
      dict_term_inverted_index = inverted_index[query_term]

      for doc_id, doc_tf in dict_term_inverted_index.items():
        dot_product[doc_id] += query_tf * doc_tf

  sorted_dot_product = sorted(dot_product.items(), key = lambda item:item[1], reverse = True)

  if len(sorted_dot_product) < topk:
      return dict(sorted_dot_product)
  else:
      return dict(sorted_dot_product[:topk])

In [18]:
# compute modified query
def get_feedback_query(orig_query, inverted_index, collection, ind_rel_doc, ind_not_rel_doc,
                       alpha, beta, gamma):
  '''
    Input: original query, relevant docs, alpha, beta, gamma, inverted index
    Output: modified query
    orig_query     = dictionary {term_query:term_freq_query}
    inverted_index = dict {term:dict{ doc_id:term_freq}}
    collection     = list of texts
    ind_rel_doc    = list of indices of relevant docs
    ind_irrel_doc  = list of indices of irrelevant docs
  '''
  # Step 1 => start the mod_query with the original query, multiply all tf by alpha
  mod_query = {term: alpha * tf for term, tf in orig_query}
  mod_query = defaultdict(int, mod_query)

  # Step 2:
  # for each relevant doc:
    # Extract the words in the relevant docs -> use extract_terms function
    # for each word with a high tf-idf:
    #     add it to the mod_query (beta * tf_idf)
  size_vocab = len(inverted_index)

  for doc_id in ind_rel_doc:

    terms     = extract_terms(collection[doc_id]) # returns a key with duplicates
    term_freq = compute_freq(terms)

    for t, tf in term_freq.items(): # for each term in the document
      doc_freq = len(inverted_index[t]) # number of documents the term appears in
      tf_idf = tf * math.log((1 + size_vocab)/doc_freq)

      if tf_idf >= 1: # if the term is relevant for the doc - you can introduce a threshold and play with it
        mod_query[t] += beta * tf_idf

  # Step 3:
  # for each non-relevant doc:
      # Extract the words in the non-relevant docs -> use extract_terms function
      # for each word with a high tf-idf:
      #   subtract it from the mod_query (-gamma * tf_idf)

  # Step 4: any other adjustments to mod_query

  return mod_query




In [None]:
# enter queries, get top 7 ranked results

In [None]:
# mark results as relevant or not relevant

In [None]:
# compute modified query
# recompute top 7 results and mark them

In [None]:
# print text movies
import pprint
def print_ranked_docs(list_docs, collection):
  for doc in list_docs:
    print('############################################')
    pprint.pp(collection[['Release Year','Title', 'Genre']].iloc[doc])
    pprint.pp(collection['Plot'].iloc[doc])


print_ranked_docs(list(sorted_sim.keys())[:3], jobs_df)

NameError: name 'sorted_sim' is not defined

In [None]:
selected_movies[selected_movies['Title'].str.contains('Star wars')]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot


In [None]:
# find modified query vector
# compute df frequency from counter
# orig_query = text
# rel_docs, irel_docs = list of docs indices
def mod_query(orig_query, vectorizer, term_freq, log_doc_freq, rel_docs, not_rel_docs, alpha, beta, gamma):
  vect_orig = vectorizer.transform(orig_query)
  # compute centroid for rel and not_rel docs
  list_rel     = [term_freq.getrow(i) for i in rel_docs]
  list_not_rel = [term_freq.getrow(i) for i in not_rel_docs]

  # Check if list_rel and list_not_rel are empty before applying vstack
  if list_rel:
    is_empty_rel = True
  if list_not_rel:
    is_empty_not_rel = True

  if list_rel and list_not_rel:
      return np.array([])  # or any other appropriate default value

  # apply tf function
  nr_docs = term_freq.shape[0]
  fct_tf = lambda x: np.log(x + 1)

  vect_mod = alpha * vect_orig

  if list_rel:
    vect_rel = scipy.sparse.vstack(list_rel).toarray()
    np_rel   = fct_tf(vect_rel)           #  apply tf function
    np_rel   = np_rel * log_doc_freq.T    #  apply idf function
    max_tf_idf_rel     = np.max(np_rel)
    th_rel     = 0.3 * max_tf_idf_rel
    np_rel[np_rel < th_rel] = 0
    np_rel[np_rel > 0.8 * max_tf_idf_rel] = 0

    centroid_rel     = np.mean(np_rel, axis = 0)
    vect_mod +=  beta * centroid_rel

  if list_not_rel:
    vect_not_rel = scipy.sparse.vstack(list_not_rel).toarray()
    np_not_rel   = fct_tf(vect_not_rel)         # apply tf function
    np_not_rel   = np_not_rel * log_doc_freq.T  # apply idf function
    max_tf_idf_not_rel = np.max(np_not_rel)
    th_not_rel = 0.3 * max_tf_idf_not_rel
    np_not_rel[np_not_rel < th_not_rel] = 0 # maybe filter very rare words (low document frequency) - aka. Character names
    centroid_not_rel = np.mean(np_not_rel, axis = 0)
    vect_mod -= gamma * centroid_not_rel
  # apply a threshold for tf_idf values

  #print(max_tf_idf_rel, max_tf_idf_not_rel)
  tf_idf_nonzero = np_rel[:,vect_mod.nonzero()[1]]

  return vect_mod, tf_idf_nonzero

orig_query = ["aliens spaceship new york"]
rel_docs = [696]
not_rel_docs = []

vect_mod, tf_idf_nonzero = mod_query(orig_query, vect, counter, log_doc_freq, rel_docs, not_rel_docs,  0.5, 0.5, 0.5)
print(type(vect_mod), vect_mod.shape)

print("Count num non-zero terms =", np.count_nonzero(vect_mod))
# find the index of non-zero terms
ind_nonzero_mod = np.nonzero(vect_mod)
print(ind_nonzero_mod[1][:10])
#print the query non-zero terms
print(vect.get_feature_names_out()[ind_nonzero_mod[1]])

print('tfidf nonzero', tf_idf_nonzero)

<class 'numpy.matrix'> (1, 17923)
Count num non-zero terms = 55
[ 440 1291 1585 1765 2181 2260 2380 2741 3132 3658]
['alien' 'ball' 'berman' 'blanket' 'bully' 'caan' 'captain' 'cheng'
 'command' 'crew' 'defenseless' 'detach' 'display' 'doorway' 'drain'
 'earth' 'fbi' 'fireball' 'gabrielle' 'goldfish' 'harmless' 'hart' 'helm'
 'human' 'humanoid' 'imbibe' 'lifeboat' 'mannerism' 'metal' 'ming'
 'morrison' 'murphy' 'myer' 'net' 'new' 'nil' 'number' 'ocean' 'orb'
 'power' 'presentation' 'recharge' 'researcher' 'retrieve' 'salt'
 'science' 'ship' 'shoe' 'sized' 'spaceship' 'superpower' 'superstitious'
 'taser' 'telescope' 'york']
tfidf nonzero [[ 8.45717116  8.20254382  4.83864968  4.1598638   4.0228436   4.83864968
  10.54199729  4.00562127  7.3246421   7.594634    5.11948185  4.35884076
   4.29446236  4.83864968  7.27863299  7.60653962  4.51090723  4.4850018
   4.00562127  5.11948185  4.25220621  4.07843725  4.00562127  6.57119974
   4.63945843  4.4850018   7.35336763  4.00562127  8.457171

========== END OF QUERY MODIFICATION ==========