In [32]:
from mabokahflib.myfunctions import my_nltk_tokenizer
import warnings
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [24]:
warnings.filterwarnings("ignore")

### reading the data

In [25]:
df = pd.read_csv("wiki_movie_plots_deduped.csv")
df = df['Plot']
raw_data = df.values.tolist()

In [26]:
df.head()

0    A bartender is working at a saloon, serving dr...
1    The moon, painted with a smiling face hangs ov...
2    The film, just over a minute long, is composed...
3    Lasting just 61 seconds and consisting of two ...
4    The earliest known adaptation of the classic f...
Name: Plot, dtype: object

### Using TFIDF Vectorizer

In [27]:
tfidf_vect = TfidfVectorizer(tokenizer=my_nltk_tokenizer)
vecs_count = tfidf_vect.fit_transform(raw_data)

In [28]:
print(" size of dictionary ==> ", len(tfidf_vect.get_feature_names_out()))
print(" IDs of the tokens: \n")
print(tfidf_vect.vocabulary_)

 size of dictionary ==>  102249
 IDs of the tokens: 

{'bartend': 8544, 'work': 99213, 'saloon': 77226, 'serv': 79887, 'drink': 25182, 'custom': 20686, 'fill': 29481, 'stereotyp': 85432, 'irish': 41443, 'man': 54136, 'bucket': 13533, 'beer': 9237, 'carri': 15167, 'nation': 61535, 'follow': 30245, 'burst': 14004, 'insid': 41019, 'assault': 6317, 'pull': 70929, 'hat': 36873, 'eye': 28251, 'dump': 25532, 'head': 37122, 'group': 34869, 'begin': 9285, 'wreck': 99317, 'bar': 8233, 'smash': 83157, 'fixtur': 29767, 'mirror': 58101, 'break': 12766, 'cash': 15272, 'regist': 73768, 'spray': 84613, 'seltzer': 79559, 'water': 97681, 'face': 28351, 'policemen': 69075, 'appear': 5207, 'order': 64601, 'everybodi': 27959, 'leav': 50302, '1': 27, 'moon': 59100, 'paint': 65628, 'smile': 83195, 'hang': 36269, 'park': 66433, 'night': 62537, 'young': 100534, 'coupl': 19781, 'walk': 97378, 'past': 66666, 'fenc': 29136, 'learn': 50280, 'rail': 72110, 'look': 51868, 'embrac': 26920, 'get': 32700, 'bigger': 106

In [29]:
vecs_count.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Document term Matrix

In [30]:
print("matrix type : ", type(vecs_count))
print("matrix dimension: ", vecs_count.shape)
print("\n document vector")
print(vecs_count.getrow(0))

matrix type :  <class 'scipy.sparse._csr.csr_matrix'>
matrix dimension:  (34886, 102249)

 document vector
  (0, 27)	0.10993181204870044
  (0, 50302)	0.0515660205460145
  (0, 27959)	0.14239573416327622
  (0, 64601)	0.06920146974056317
  (0, 5207)	0.07864446635233532
  (0, 69075)	0.14720523825869897
  (0, 28351)	0.08201742675861512
  (0, 97681)	0.1009481938346297
  (0, 79559)	0.24897600316455132
  (0, 84613)	0.1654980731720511
  (0, 73768)	0.156802935045374
  (0, 15272)	0.12499906408517929
  (0, 12766)	0.07346959934618033
  (0, 58101)	0.14025700867208107
  (0, 29767)	0.21900841003894908
  (0, 83157)	0.13833440662215216
  (0, 8233)	0.10499531566168369
  (0, 99317)	0.14363437477618018
  (0, 9285)	0.06225318790456436
  (0, 34869)	0.16079102736538714
  (0, 37122)	0.07280155808888597
  (0, 25532)	0.12827452677911683
  (0, 28251)	0.1001631129097147
  (0, 36873)	0.14919174140316202
  (0, 70929)	0.09654532730464926
  (0, 6317)	0.11735616999031548
  (0, 41019)	0.0911826788928822
  (0, 14004)	0.1

### Term document matrix

In [31]:
inverted_count = vecs_count.transpose()
print(" index ")
print(inverted_count.toarray())

 index 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
print("matrix type : ", type(inverted_count))
print("matrix dimension: ", inverted_count.shape)
print("\n document vector")
print(inverted_count.getrow(2))

matrix type :  <class 'scipy.sparse._csc.csc_matrix'>
matrix dimension:  (102249, 34886)

 document vector
  (0, 9408)	0.07881833500212063
  (0, 22630)	0.020752329173586496
  (0, 23684)	0.020752329173586496
  (0, 25351)	0.013063076936248768
  (0, 26467)	0.07748457115959989
  (0, 29706)	0.028365972745469502
  (0, 29720)	0.02501511186798203
  (0, 34072)	0.11281807619610011


### implementation of term at a time 

In [33]:
documents = [
    "the cat in the hat",
    "the quick brown fox",
    "the quick red fox",
    "the lazy dog"
]

In [34]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

In [48]:
inverted_index = defaultdict(list)

In [49]:
# Fill the inverted index with TF-IDF values
for doc_id, doc in enumerate(tfidf_matrix):
    for term_id in doc.nonzero()[1]:
        term = feature_names[term_id]
        inverted_index[term].append((doc_id, doc[0, term_id]))

In [61]:
max_term_length = max(len(term) for term in inverted_index)

In [62]:
for term, postings in inverted_index.items():
    print(f"Term: {term.ljust(max_term_length)}   Postings: {postings}")

Term: hat     Postings: [(0, 0.4945120594211411)]
Term: in      Postings: [(0, 0.4945120594211411)]
Term: cat     Postings: [(0, 0.4945120594211411)]
Term: the     Postings: [(0, 0.5161138142514549), (1, 0.3290228843798993), (2, 0.3290228843798993), (3, 0.34618161159873423)]
Term: fox     Postings: [(1, 0.4970962045265468), (2, 0.4970962045265468)]
Term: brown   Postings: [(1, 0.6305035039117027)]
Term: quick   Postings: [(1, 0.4970962045265468), (2, 0.4970962045265468)]
Term: red     Postings: [(2, 0.6305035039117027)]
Term: dog     Postings: [(3, 0.6633846138519129)]
Term: lazy    Postings: [(3, 0.6633846138519129)]


In [63]:
inverted_index

defaultdict(list,
            {'hat': [(0, 0.4945120594211411)],
             'in': [(0, 0.4945120594211411)],
             'cat': [(0, 0.4945120594211411)],
             'the': [(0, 0.5161138142514549),
              (1, 0.3290228843798993),
              (2, 0.3290228843798993),
              (3, 0.34618161159873423)],
             'fox': [(1, 0.4970962045265468), (2, 0.4970962045265468)],
             'brown': [(1, 0.6305035039117027)],
             'quick': [(1, 0.4970962045265468), (2, 0.4970962045265468)],
             'red': [(2, 0.6305035039117027)],
             'dog': [(3, 0.6633846138519129)],
             'lazy': [(3, 0.6633846138519129)]})

In [64]:
# TAAT scoring function
def taat_scoring(query, inverted_index):
    scores = defaultdict(float)
    for term in query:
        if term in inverted_index:
            for doc_id, score in inverted_index[term]:
                scores[doc_id] += score
    return scores

# Example query
query = ["quick", "fox"]

# Perform TAAT scoring
scores = taat_scoring(query, inverted_index)

# Print the scores
for doc_id, score in sorted(scores.items(), key=lambda item: item[1], reverse=True):
    print(f"Document ID: {doc_id}, Score: {score}")


Document ID: 1, Score: 0.9941924090530936
Document ID: 2, Score: 0.9941924090530936
