In [3]:
import pyterrier as pt
import nltk
import pandas as pd
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

In [4]:
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-22"
if not pt.started():
    pt.init()

In [5]:
categories = ['business', 'entertainment', 'food',
              'graphics', 'historical', 'space', 'sport', 'technologie']

In [6]:
def read_data(categories):
    collection = []
    for category in categories:
        for i in range(1, 101):
            filename = f'./archive_2/{category}/{category}_{i}.txt'
            with open(filename, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    line = line.strip()
                    if line:
                        collection.append({'category': category, 'text': line})
    return collection

In [7]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def remove_stopwords(text):
        tokens = word_tokenize(text)
        filtered_tokens = [word.lower()
                           for word in tokens if word.lower() not in stop_words]
        return ' '.join(filtered_tokens)

    def stem_text(text):
        tokens = word_tokenize(text)
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens)

    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"RT ", " ", text)
    text = re.sub(r"@[\w]*", " ", text)
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    text = remove_stopwords(text)
    text = stem_text(text)
    return text

In [8]:
def clean_text(text):
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"RT ", " ", text)
    text = re.sub(r"@[\w]*", " ", text)
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text

In [9]:
def index_documents(df):
    df['processed_text'] = df['text'].apply(preprocess_text)
    df['processed_text'] = df['processed_text'].apply(clean_text)
    df['docno'] = range(1, len(df)+1)
    df['docno'] = df['docno'].apply(str)
    return df

In [10]:
def build_inverted_index(df):
    inverted_index = {}
    df2 = index_documents(df)
    for index, row in df2.iterrows():
        doc_id = index + 1
        words = row['text'].split()

        for term in words:
            if term not in inverted_index:
                inverted_index[term] = {}
            if doc_id not in inverted_index[term]:
                inverted_index[term][doc_id] = 0
            inverted_index[term][doc_id] += 1
    return inverted_index

In [11]:
def search_index(index, query):
    lexicon = index.getLexicon()
    metadata = index.getMetaIndex()
    inverted = index.getInvertedIndex()
    lex = lexicon.getLexiconEntry(query)

    if lex is None:
        return []  # Return empty list if query term is not found in the index

    postings = inverted.getPostings(lex)

    if postings is None:
        return []  # Return empty list if no postings are found for the query term

    ids = [metadata.getItem("docno", posting.getId())
           for posting in postings]
    return ids

In [12]:
def rank_tfidf(index, query):
    tfidf_retr = pt.BatchRetrieve(
        index, controls={"wmodel": "TF_IDF"})
    return tfidf_retr.search(query)

In [13]:
def display_document_index(index, docid):
    di = index.getDirectIndex()
    doi = index.getDocumentIndex()
    lex = index.getLexicon()
    for i in range(len(docid)):
      doc_entry = doi.getDocumentEntry(docid[i])
      posting_list = di.getPostings(doc_entry)
      for posting in posting_list:
          termid = posting.getId()
          lex_entry = lex.getLexiconEntry(termid)
          print(lex_entry.getKey() + " -> " + str(posting) +
                " doclen=%d" % posting.getDocumentLength())

In [14]:
collection = read_data(categories)
df = pd.DataFrame(collection)
df.to_csv('text.csv', index=False)
df = pd.read_csv('./text.csv')

In [15]:
df2 = index_documents(df)
df2

Unnamed: 0,category,text,processed_text,docno
0,business,Lufthansa flies back to profit,lufthansa fli back profit,1
1,business,German airline Lufthansa has returned to profi...,german airlin lufthansa return profit 2004 pos...,2
2,business,"In a preliminary report, the airline announced...",preliminari report airlin announc net profit 4...,3
3,business,Japanese growth grinds to a halt,japanes growth grind halt,4
4,business,Growth in Japan evaporated in the three months...,growth japan evapor three month septemb spark ...,5
...,...,...,...,...
12744,technologie,"Consumers swapping old phones for slinkier, di...",consum swap old phone slinkier dinkier version...,12745
12745,technologie,"In fact, the numbers of people not taking and ...",fact number peopl take send pictur audio video...,12746
12746,technologie,"""Also,"" he said, ""they have to simplify the in...",`` also `` said `` simplifi interfac rocket sc...,12747
12747,technologie,There are other deeper technical reasons why m...,deeper technic reason multimedia messag push s...,12748


In [16]:
inverted_index = build_inverted_index(df)
inverted_index

{'Lufthansa': {1: 1, 2: 1, 3: 3, 212: 1},
 'flies': {1: 1, 10451: 1},
 'back': {1: 1,
  28: 1,
  32: 1,
  39: 1,
  61: 1,
  84: 1,
  103: 1,
  108: 1,
  137: 1,
  166: 1,
  170: 1,
  193: 1,
  208: 1,
  216: 1,
  222: 1,
  272: 1,
  277: 1,
  298: 1,
  340: 1,
  429: 1,
  506: 1,
  637: 1,
  645: 1,
  734: 1,
  809: 1,
  867: 1,
  878: 1,
  952: 1,
  1008: 1,
  1054: 1,
  1101: 1,
  1201: 1,
  1406: 2,
  1459: 1,
  1784: 1,
  1832: 1,
  1840: 1,
  1863: 1,
  2725: 1,
  2769: 1,
  2801: 1,
  2969: 1,
  2988: 1,
  4369: 1,
  4795: 1,
  4830: 1,
  4833: 1,
  4840: 1,
  4897: 1,
  4902: 1,
  4907: 1,
  4936: 1,
  4968: 1,
  4977: 1,
  4980: 1,
  5010: 1,
  5040: 1,
  5046: 2,
  5048: 1,
  5055: 1,
  5056: 1,
  5066: 1,
  5068: 4,
  5093: 1,
  5115: 1,
  5118: 1,
  5136: 1,
  5140: 1,
  5155: 1,
  5168: 2,
  5215: 1,
  5235: 1,
  5271: 1,
  5278: 1,
  5397: 1,
  5416: 1,
  5418: 1,
  5469: 1,
  5515: 1,
  5554: 1,
  5558: 1,
  5591: 1,
  5674: 1,
  5699: 1,
  5713: 1,
  6212: 1,
  6417: 1,


In [17]:
indexer = pt.DFIndexer(
    'C:\\Users\\midoh\\Desktop\\UST-CSAI\\Y2S2\\DSAI 201 (Mining & IR)\\Project\\mySecondIndex', overwrite=True)
index_ref = indexer.index(df["processed_text"], df["docno"])
index = pt.IndexFactory.of(index_ref)

23:14:58.778 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 44 empty documents
23:14:58.790 [main] ERROR org.terrier.structures.indexing.Indexer - Could not rename index
java.io.IOException: Rename of index structure file 'C:\Users\midoh\Desktop\UST-CSAI\Y2S2\DSAI 201 (Mining & IR)\Project\mySecondIndex/data_1.direct.bf' (exists) to 'C:\Users\midoh\Desktop\UST-CSAI\Y2S2\DSAI 201 (Mining & IR)\Project\mySecondIndex/data.direct.bf' (exists) failed - likely that source file is still open. Possible indexing bug?
	at org.terrier.structures.IndexUtil.renameIndex(IndexUtil.java:379)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:388)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:355)


In [18]:
query = input("Enter your query: ")

In [19]:
for term in query.split():
    inv_indx = inverted_index[f'{term}']
  
    if term not in inverted_index:
        print(f'Term "{term}" not found in the index')
    else:
        print(f'Term "{term}" found in the index')
inv_indx = list(inv_indx.keys())
inv_indx

Term "sports" found in the index


[21, 207, 468, 470, 515, 620, 11690, 11786, 11799]

In [20]:
docid = inv_indx
display_document_index(index, docid)

profit -> ID(3) TF(1) doclen=47
loss -> ID(6) TF(1) doclen=47
give -> ID(26) TF(1) doclen=47
european -> ID(29) TF(1) doclen=47
sign -> ID(48) TF(1) doclen=47
japan -> ID(66) TF(1) doclen=47
fall -> ID(78) TF(1) doclen=47
engin -> ID(79) TF(1) doclen=47
expect -> ID(103) TF(1) doclen=47
product -> ID(120) TF(1) doclen=47
mainli -> ID(143) TF(1) doclen=47
deal -> ID(155) TF(1) doclen=47
firm -> ID(185) TF(1) doclen=47
warn -> ID(248) TF(1) doclen=47
2005 -> ID(290) TF(1) doclen=47
sai -> ID(295) TF(1) doclen=47
sale -> ID(315) TF(2) doclen=47
sever -> ID(325) TF(1) doclen=47
peugeot -> ID(353) TF(2) doclen=47
mitsubishi -> ID(354) TF(2) doclen=47
boost -> ID(355) TF(1) doclen=47
suv -> ID(362) TF(1) doclen=47
motor -> ID(364) TF(1) doclen=47
come -> ID(368) TF(1) doclen=47
agreement -> ID(369) TF(1) doclen=47
understand -> ID(370) TF(1) doclen=47
spring -> ID(371) TF(1) doclen=47
seal -> ID(372) TF(1) doclen=47
badli -> ID(373) TF(1) doclen=47
utili -> ID(374) TF(1) doclen=47
sold -> ID

In [21]:
import pandas as pd

ranked_documents = rank_tfidf(index, query)

# Extract docid and score lists
docid = list(ranked_documents['docid'])
score = list(ranked_documents['score'])

# Convert lists to DataFrame
df3 = pd.DataFrame(list(zip(docid, score)),
                  columns=['Document ID', 'Score'])

print("Ranked Documents:")
df3

Ranked Documents:


Unnamed: 0,Document ID,Score
0,3331,5.324651
1,12743,4.972688
2,6290,4.524117
3,11700,4.267480
4,688,4.149778
...,...,...
58,11723,1.191681
59,12059,1.173099
60,11891,1.104223
61,11811,1.050267


In [22]:
query = preprocess_text(query)
query = clean_text(query)
query

'sport'

In [23]:
# specify BM25 as wmodel
bm25 = pt.BatchRetrieve(index, controls={"wmodel": "BM25"}, num_results=10)
# the queries dataframe should have qid and query columns

bm25_res = bm25.search(query)
bm25_res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3331,3332,0,9.729262,sport
1,1,12743,12744,1,9.086151,sport
2,1,6290,6291,2,8.266517,sport
3,1,11700,11701,3,7.797586,sport
4,1,688,689,4,7.582521,sport
5,1,11615,11616,5,7.582521,sport
6,1,11745,11746,6,7.582521,sport
7,1,11839,11840,7,7.18612,sport
8,1,11798,11799,8,7.003067,sport
9,1,20,21,9,6.829107,sport


In [24]:
rm3_expander2 = pt.rewrite.RM3(index, fb_terms=10, fb_docs=100)

rm3_qe = bm25 >> rm3_expander2

In [25]:
expanded_query = rm3_qe.search(query).iloc[0]["query"]

expanded_query_formatted = ' '.join(expanded_query.split()[1:])
expanded_query_formatted

'categori^0.025603957 ship^0.025603957 american^0.034707587 dataset^0.025603957 1^0.029871289 collin^0.024750495 aircraft^0.025603957 car^0.044470035 sport^0.739034295 dope^0.024750495'

In [26]:
results_wqe = bm25.search(expanded_query_formatted)

In [27]:
print("   Before Expansion    After Expansion")
print(pd.concat([bm25_res[['docid', 'score']][0:5].add_suffix('_1'),
                 results_wqe[['docid', 'score']][0:5].add_suffix('_2')], axis=1).fillna(''))

   Before Expansion    After Expansion
   docid_1   score_1  docid_2    score_2
0     3331  9.729262     3331  12.187377
1    12743  9.086151    12743   9.086151
2     6290  8.266517    11700   8.821616
3    11700  7.797586    11745   8.578307
4      688  7.582521     6290   8.526265


In [28]:
df['text'][df['docno'].isin(
    results_wqe['docno'].loc[0:5].tolist())]

688      The BBC leads the nominations for the Bafta In...
3331     ship datasets from categories such as cars, an...
6290     for 3 sport scale models, a 1:9.22 D Region To...
11700    Sprinter Michelle Collins has lodged an appeal...
11745    Sprinter Michelle Collins has received an eigh...
12743    There is no doubt that mobile phones sporting ...
Name: text, dtype: object

In [29]:
# load the ELMo model
elmo = hub.load("https://tfhub.dev/google/elmo/3")













In [30]:

sentences = []
# define the sentences
for i in range(50):
  sentences.append(df2["processed_text"][i])

# generate ELMo embeddings for the sentences
embeddings = elmo.signatures["default"](tf.constant(sentences))["elmo"]

In [31]:
embeddings

<tf.Tensor: shape=(50, 128, 1024), dtype=float32, numpy=
array([[[-0.36958337, -0.63186955,  0.5697111 , ..., -0.07706828,
         -0.3244971 , -0.01277411],
        [-0.15178531, -0.906508  ,  0.4592035 , ..., -0.3351923 ,
         -0.30336902,  0.53343   ],
        [ 0.11421654, -0.15565476,  0.36456245, ..., -0.08054529,
         -0.3467325 ,  0.05369081],
        ...,
        [-0.02840842, -0.04353216,  0.04130162, ...,  0.02583169,
         -0.01429836, -0.01650422],
        [-0.02840842, -0.04353216,  0.04130162, ...,  0.02583169,
         -0.01429836, -0.01650422],
        [-0.02840842, -0.04353216,  0.04130162, ...,  0.02583169,
         -0.01429836, -0.01650422]],

       [[ 0.4415284 , -0.33083725,  0.14236967, ..., -0.17101328,
          0.15333652, -0.36979228],
        [ 0.18604371, -0.25193602,  0.01307407, ..., -0.4441265 ,
         -0.37996215, -0.00985511],
        [-0.19300207, -0.33831903,  0.55458385, ..., -0.4302743 ,
         -0.37902465, -0.03738508],
        ..

In [43]:
import torch

# Choose GPU as device to run the experiments on
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [33]:
# we need to import the following libraries.
import pandas as pd
import re
from tqdm import tqdm

# to display the full text on the notebook without truncation
pd.set_option('display.max_colwidth', 150)

In [34]:
from transformers import AutoTokenizer, AutoModel

model_name = "bert-base-uncased"

bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [35]:
sep_token = bert_tokenizer.sep_token

# print sep token of the tokenizer
print("Sep token : ", sep_token)

# print the token id of sep token
print('Token ID of sep token : ',  bert_tokenizer.convert_tokens_to_ids(sep_token))

Sep token :  [SEP]
Token ID of sep token :  102


In [36]:
cls_token = bert_tokenizer.cls_token

# print cls token of the tokenizer
print("Cls token : ", cls_token)

# print the token id of cls token
print('Token ID of cls token : ',  bert_tokenizer.convert_tokens_to_ids(cls_token))

Cls token :  [CLS]
Token ID of cls token :  101


In [38]:
def encode(text, max_length=32):
    return bert_tokenizer.encode_plus(
        text,                      # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        truncation=True,
        max_length=max_length,           # Pad & truncate all sentences.
        padding="max_length",
        return_attention_mask=True,   # Construct attention mask
        return_tensors='pt',     # Return pytorch tensors.
    )


tokenized_reviews = []
for txt in tqdm(df["text"].values, desc="Tokenizing ..."):
    tokenized_reviews.append(encode(txt, max_length=32))

Tokenizing ...: 100%|██████████| 12749/12749 [00:01<00:00, 6462.90it/s]


In [39]:
tokenized_reviews[0]

{'input_ids': tensor([[  101, 11320,  6199,  4819,  3736, 10029,  2067,  2000,  5618,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}

In [40]:
len(tokenized_reviews)

12749

In [44]:
bert_model.cpu()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [45]:
input_ids = tokenized_reviews[0]["input_ids"].to(device)
attention_mask = tokenized_reviews[0]["attention_mask"].to(device)
output = bert_model(input_ids=input_ids, attention_mask=attention_mask)

In [46]:
output[0].shape  # batch_size x sequence_length x embedding_dimension

torch.Size([1, 32, 768])

In [47]:
# print the embedding of all input tokens.
all_embeddings = output[0][0]
print(all_embeddings.shape)
print(all_embeddings)

torch.Size([32, 768])
tensor([[-0.3897,  0.2082,  0.1052,  ..., -0.7404,  0.4300,  0.4158],
        [ 0.8686, -1.3607,  0.8551,  ..., -0.2795,  0.8384,  0.0726],
        [-0.2327, -0.4627,  0.3876,  ..., -0.5537,  0.4480, -0.0938],
        ...,
        [ 0.1364,  0.0930,  0.6446,  ..., -0.1748,  0.1621, -0.0631],
        [ 0.3148, -0.0633,  0.7048,  ..., -0.2632,  0.2054,  0.0372],
        [ 0.3531, -0.0972,  0.6925,  ..., -0.3408,  0.2222,  0.0166]],
       grad_fn=<SelectBackward0>)


In [52]:
# print the cls embedding
cls_embedding = output[0][0][0]
print(cls_embedding.shape)
print(cls_embedding)

torch.Size([768])
tensor([-3.8973e-01,  2.0825e-01,  1.0523e-01,  2.7978e-02, -3.5507e-01,
        -3.0691e-01,  2.3176e-01,  5.1751e-01, -6.3911e-02, -2.7169e-01,
        -8.8550e-02, -3.4075e-02,  2.7058e-01,  4.7987e-01,  1.3014e-01,
        -1.4775e-02,  1.8505e-01,  5.9033e-02,  2.3334e-01, -2.5631e-01,
         2.1994e-02, -4.2439e-01,  1.7006e-01, -1.0899e-01,  2.5774e-01,
        -5.1792e-03, -6.4407e-02, -5.9223e-01, -7.3062e-02,  5.5849e-02,
         4.0976e-02,  2.7194e-01, -6.7759e-01, -6.4028e-02,  4.2005e-01,
         1.3988e-01,  3.0843e-01, -3.1391e-01,  3.2220e-02,  2.2093e-01,
        -2.4415e-02,  1.1351e-01,  1.4203e-01,  2.9840e-01,  1.8576e-01,
        -3.3796e-01, -2.2288e+00, -1.9740e-01,  9.2226e-02, -5.2136e-02,
         9.1841e-03, -5.2677e-01,  3.5270e-01,  1.2304e-01, -2.4215e-01,
        -3.0401e-02, -2.5064e-01,  7.0096e-01, -6.0598e-02,  1.4984e-01,
         1.2525e-01, -7.1658e-02, -2.1134e-01,  1.6847e-01, -3.4241e-01,
         1.2769e-02, -2.3514e-01,

In [49]:
# print the first token embedding
first_token_embedding = output[0][0][1]
print(first_token_embedding.shape)
print(first_token_embedding)

torch.Size([768])
tensor([ 8.6861e-01, -1.3607e+00,  8.5510e-01, -6.3925e-01,  7.0092e-01,
        -1.7984e-01, -2.8641e-01,  4.5211e-01,  5.8664e-02, -4.0707e-01,
        -2.4510e-01, -4.0537e-01,  5.6137e-03,  4.2592e-01, -8.3456e-01,
         4.0442e-01,  1.8487e-01,  2.9946e-02,  2.8479e-01,  3.9588e-01,
         4.8980e-01,  1.2872e-01, -4.7451e-01,  9.7200e-01,  8.9602e-01,
        -4.0813e-03, -3.7031e-01, -5.1651e-01, -1.3322e-01, -1.7766e-01,
        -1.0811e-01,  5.0358e-01, -1.2365e+00,  3.6626e-01,  3.8750e-01,
         2.1127e-01,  2.6709e-01,  3.0290e-01,  5.0235e-01,  1.1764e+00,
        -8.0545e-01, -4.2491e-01,  1.3775e-01,  4.1716e-01,  2.9926e-01,
        -9.3495e-01,  3.0402e-01,  4.0424e-01,  3.4353e-01, -2.8976e-01,
        -4.6694e-01, -9.2946e-02, -1.2262e-01,  1.5603e-01,  4.7634e-01,
         1.8602e-01,  4.3886e-01, -6.7179e-01,  1.9189e-01, -2.9908e-01,
         4.0906e-01,  7.0381e-01,  8.6413e-01, -9.9745e-02,  5.9550e-01,
        -1.0218e+00, -6.1089e-01,

In [53]:
def get_embeddings(text):
  # Encode text and computer input_ids and attention mask
  tokens = encode(text)
  input_ids = tokens["input_ids"].to(device)
  attention_mask = tokens["attention_mask"].to(device)
  # Pass input_ids and attention mask to model
  output = bert_model(input_ids=input_ids, attention_mask=attention_mask)
  return output


for term in query.split():
    text = f"{term}"
    embeddings = get_embeddings(text)
    print(f"Term : {term}")
    print(f"Embedding shape : {embeddings[0].shape}")
    print(f"Embedding : {embeddings[0]}")
    print("\n")

Term : sport
Embedding shape : torch.Size([1, 32, 768])
Embedding : tensor([[[ 0.3800,  0.2812, -0.0503,  ...,  0.0418,  0.0995,  0.6764],
         [ 0.3049,  0.7073, -0.2842,  ...,  0.3480,  1.1234, -0.0016],
         [ 0.8440, -0.0148, -0.2753,  ...,  0.2098, -0.9082, -0.1826],
         ...,
         [ 0.2149, -0.0915,  0.0662,  ...,  0.2469,  0.1883, -0.0821],
         [ 0.3563,  0.0756,  0.1035,  ...,  0.1527,  0.0638, -0.1062],
         [-0.0238, -0.2127, -0.3091,  ...,  0.2258,  0.4955,  0.2875]]],
       grad_fn=<NativeLayerNormBackward0>)


