<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/feedback_retrieval_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Implement the Rocchio relevance feedback algorithm**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn import metrics
import scipy.sparse


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on the drive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the TextMining folder on the mounted drive,
# then list the CSV files found there.
%cd /content/drive/MyDrive/TextMining
%ls *.csv

/content/drive/MyDrive/TextMining
wiki_movie_plots_deduped.csv


In [None]:
# Locate the movie-plot CSV in the current working directory and load it.
dir_file = os.getcwd()            # path to the current directory
files_dir = os.listdir(dir_file)  # names of all entries in the current directory

# os.listdir returns entries in arbitrary order, so sort the matches to make
# the choice of csv_files[0] reproducible; match the '.csv' extension with its dot.
csv_files = sorted(f for f in files_dir if f.endswith('.csv'))
print(csv_files)
if not csv_files:
  # Fail with an explicit message instead of an IndexError on csv_files[0].
  raise FileNotFoundError(f"No .csv file found in {dir_file}")
movie_file = csv_files[0]

movie_df = pd.read_csv(movie_file)
print(movie_df.columns)  # the columns
print(movie_df.shape)

['wiki_movie_plots_deduped.csv']
Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')
(34886, 8)


In [None]:
# Show the 40 most frequent genre labels together with their counts.
movie_df['Genre'].value_counts().head(40)

unknown            6083
drama              5964
comedy             4379
horror             1167
action             1098
thriller            966
romance             923
western             865
crime               568
adventure           526
musical             467
crime drama         464
romantic comedy     461
science fiction     418
film noir           345
mystery             310
war                 273
animation           264
comedy, drama       236
sci-fi              221
family              217
fantasy             204
animated            195
musical comedy      154
comedy-drama        137
biography           136
anime               112
suspense            104
romantic drama      103
comedy drama        103
animated short       91
drama, romance       86
social               82
historical           77
action thriller      73
documentary          73
serial               71
world war ii         70
family drama         66
war drama            65
Name: Genre, dtype: int64

In [None]:
# Keep films released after 2000, then restrict to a handful of genres.
new_movies = movie_df.loc[movie_df['Release Year'] > 2000]
# NOTE(review): the genre counts printed earlier list 'science fiction' (with a
# space); 'science-fiction' may match no rows — TODO confirm the intended label.
selected_movies = new_movies.loc[
    new_movies['Genre'].isin(
        ['thriller', 'comedy', 'drama', 'science-fiction', 'sci-fi', 'adventure']
    )
]
print(new_movies.shape, selected_movies.shape, sep='\n')
# One text document per movie: the title followed by the plot.
text_data = selected_movies['Title'] + ' ' + selected_movies['Plot']
text_data.head()

(12093, 8)
(3224, 8)


Unnamed: 0,0
13785,The Affair of the Necklace Jeanne de Saint-Rém...
13787,All Over the Guy All Over the Guy is about Eli...
13789,The Amati Girls The film centers around an Ita...
13790,America's Sweethearts Film publicist Lee Phill...
13793,American Pie 2 After their freshman year at co...


In [None]:
import numpy as np
import spacy   # another tokenizer, lemmatizer (has --> be)
# Load spaCy's 'en_core_web_sm' pipeline into the notebook-level name `nlp`.
nlp = spacy.load('en_core_web_sm')
# Disable the 'parser' and 'ner' components of the loaded pipeline. Being the
# last expression of the cell, the call's return value is echoed as cell output.
nlp.disable_pipes('parser', 'ner')

['parser', 'ner']

In [None]:
# print text movies
import pprint
def print_movie(list_docs, movies=None):
  """Print year, title, genre and plot for the movies at the given positions.

  Parameters
  ----------
  list_docs : iterable of int
      Positional (``iloc``) row indices into the movie table.
  movies : pandas.DataFrame, optional
      Table with 'Release Year', 'Title', 'Genre' and 'Plot' columns.
      When omitted, the notebook-level ``selected_movies`` frame is used,
      which is the behaviour of the original one-argument call.
  """
  if movies is None:
    # Resolved at call time so the function follows the current notebook state.
    movies = selected_movies
  for doc in list_docs:
    row = movies.iloc[doc]  # one positional lookup per document
    print('############################################')
    pprint.pp(row[['Release Year','Title', 'Genre']])
    pprint.pp(row['Plot'])


# Print the movies for the first three keys of sorted_sim.
# NOTE(review): sorted_sim is not defined in any cell above this one —
# presumably it is built in a later cell and its keys are row positions into
# selected_movies; confirm the execution order (Restart & Run All).
print_movie(list(sorted_sim.keys())[:3])

############################################
Release Year         2008
Title           Meet Dave
Genre              comedy
Name: 15480, dtype: object
('In his New York City apartment, a young boy named Josh Morrison (Austyn Lind '
 'Myers) stares through his telescope at an object falling from the sky. It is '
 'a golf-ball-sized metal ball which flies through the window and lands in his '
 'fishbowl, quickly draining the water along with the goldfish. He decides to '
 "show it at his school's science class presentation.\r\n"
 'Some months later a massive fireball crashes into the water near Liberty '
 'Island. It is revealed to be a spaceship which resembles a human (Eddie '
 'Murphy), controlled by 100 tiny humanoid aliens. Its Captain (also played by '
 'Murphy) pilots the spaceship from the command deck located in its head, with '
 'the help of his second-in-command Number 2 (Ed Helms), and researcher Number '
 '3 (Gabrielle Union). The spaceship looks very human, and displays nume

In [None]:
# Case-insensitive, literal (non-regex) title search; na=False treats missing
# titles as non-matches so the boolean mask never contains NaN.
selected_movies[selected_movies['Title'].str.contains('star wars', case=False, regex=False, na=False)]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot


In [None]:
def _tfidf_centroid(term_freq, log_doc_freq, docs, th_low, th_high=None):
  """Return (centroid, tf_idf) for the rows `docs` of the count matrix `term_freq`.

  tf is log(count + 1), multiplied element-wise by `log_doc_freq`. Weights below
  th_low * max are zeroed; when th_high is given, weights above th_high * max
  are zeroed as well. The centroid is the column-wise mean over `docs`.
  """
  counts = scipy.sparse.vstack([term_freq.getrow(i) for i in docs]).toarray()
  # Flatten the idf vector so a column vector and a 1-D vector both broadcast
  # over the term axis.
  tf_idf = np.log(counts + 1) * np.asarray(log_doc_freq).ravel()
  max_tf_idf = np.max(tf_idf)
  tf_idf[tf_idf < th_low * max_tf_idf] = 0
  if th_high is not None:
    tf_idf[tf_idf > th_high * max_tf_idf] = 0
  return np.mean(tf_idf, axis=0), tf_idf


def mod_query(orig_query, vectorizer, term_freq, log_doc_freq, rel_docs, not_rel_docs, alpha, beta, gamma,
              th_low=0.3, th_high=0.8):
  """Compute the Rocchio-modified query vector.

  vect_mod = alpha * query + beta * centroid(rel_docs) - gamma * centroid(not_rel_docs)

  Parameters
  ----------
  orig_query : list of str
      Query text, passed unchanged to ``vectorizer.transform``.
  vectorizer : object
      Fitted vectorizer exposing ``transform``.
  term_freq : sparse matrix
      Document-term count matrix; rows are fetched with ``getrow``.
  log_doc_freq : array-like
      One idf weight per term (column vector or 1-D).
  rel_docs, not_rel_docs : sequence of int
      Row indices of relevant / non-relevant documents; either may be empty.
  alpha, beta, gamma : float
      Weights of the original query, relevant centroid and non-relevant centroid.
  th_low : float, default 0.3
      tf-idf weights below ``th_low * max`` are dropped (both feedback sets).
  th_high : float or None, default 0.8
      For relevant documents only, weights above ``th_high * max`` are dropped;
      ``None`` disables this upper cut.

  Returns
  -------
  vect_mod : dense (1, n_terms) matrix
      The modified query vector.
  tf_idf_nonzero : numpy.ndarray
      Thresholded tf-idf weights of the relevant documents, restricted to the
      columns where ``vect_mod`` is non-zero; has zero rows when ``rel_docs``
      is empty.
  """
  rel_docs = list(rel_docs)
  not_rel_docs = list(not_rel_docs)

  vect_orig = vectorizer.transform(orig_query)
  vect_mod = alpha * vect_orig
  if scipy.sparse.issparse(vect_mod):
    # Densify up front so the return type is the same for every combination
    # of feedback sets (including no feedback at all).
    vect_mod = vect_mod.todense()

  # Zero-row placeholder keeps the final column selection valid without relevant docs.
  tf_idf_rel = np.zeros((0, vect_orig.shape[1]))

  if rel_docs:
    centroid_rel, tf_idf_rel = _tfidf_centroid(term_freq, log_doc_freq, rel_docs, th_low, th_high)
    vect_mod = vect_mod + beta * centroid_rel

  if not_rel_docs:
    # Only the lower threshold is applied to non-relevant documents.
    centroid_not_rel, _ = _tfidf_centroid(term_freq, log_doc_freq, not_rel_docs, th_low)
    vect_mod = vect_mod - gamma * centroid_not_rel

  tf_idf_nonzero = tf_idf_rel[:, vect_mod.nonzero()[1]]

  return vect_mod, tf_idf_nonzero

# Example run: modify a free-text query using a single document marked relevant.
orig_query = ["aliens spaceship new york"]
# Row index of the relevant document (used by mod_query to fetch a row of the count matrix).
rel_docs = [696]
# No non-relevant feedback in this example.
not_rel_docs = []

# NOTE(review): vect, counter and log_doc_freq are not defined in the cells
# above — presumably created elsewhere in the notebook; confirm run order.
# The three trailing 0.5 values are alpha, beta and gamma.
vect_mod, tf_idf_nonzero = mod_query(orig_query, vect, counter, log_doc_freq, rel_docs, not_rel_docs,  0.5, 0.5, 0.5)
print(type(vect_mod), vect_mod.shape)

print("Count num non-zero terms =", np.count_nonzero(vect_mod))
# find the index of non-zero terms
ind_nonzero_mod = np.nonzero(vect_mod)
# column indices of the first ten non-zero terms
print(ind_nonzero_mod[1][:10])
# print the query non-zero terms
print(vect.get_feature_names_out()[ind_nonzero_mod[1]])

# tf-idf weights returned by mod_query for the non-zero query columns
print('tfidf nonzero', tf_idf_nonzero)

<class 'numpy.matrix'> (1, 17923)
Count num non-zero terms = 55
[ 440 1291 1585 1765 2181 2260 2380 2741 3132 3658]
['alien' 'ball' 'berman' 'blanket' 'bully' 'caan' 'captain' 'cheng'
 'command' 'crew' 'defenseless' 'detach' 'display' 'doorway' 'drain'
 'earth' 'fbi' 'fireball' 'gabrielle' 'goldfish' 'harmless' 'hart' 'helm'
 'human' 'humanoid' 'imbibe' 'lifeboat' 'mannerism' 'metal' 'ming'
 'morrison' 'murphy' 'myer' 'net' 'new' 'nil' 'number' 'ocean' 'orb'
 'power' 'presentation' 'recharge' 'researcher' 'retrieve' 'salt'
 'science' 'ship' 'shoe' 'sized' 'spaceship' 'superpower' 'superstitious'
 'taser' 'telescope' 'york']
tfidf nonzero [[ 8.45717116  8.20254382  4.83864968  4.1598638   4.0228436   4.83864968
  10.54199729  4.00562127  7.3246421   7.594634    5.11948185  4.35884076
   4.29446236  4.83864968  7.27863299  7.60653962  4.51090723  4.4850018
   4.00562127  5.11948185  4.25220621  4.07843725  4.00562127  6.57119974
   4.63945843  4.4850018   7.35336763  4.00562127  8.457171

========== END OF QUERY MODIFICATION ==========