In [None]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk

!pip install flask
!pip install flask_ngrok

Collecting python-terrier
  Downloading python_terrier-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting ir-datasets>=0.3.2 (from python-terrier)
  Downloading ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting ir-measures>=0.3.1 (from python-terrier)
  Downloading ir_measures-0.3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting pytrec-eval-terrier>=0.5.3 (from python-terrier)
  Downloading pytrec_eval_terrier-0.5.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (984 bytes)
Collecting dill (from python-terrier)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting chest (from python-terrier)
  Downloading chest-0.2.3.tar.gz (9.6 kB)
  Preparing metadata (setup.py

# **Imports**


In [None]:
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150)

In [None]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

Cloning into 'terrier-prf'...
remote: Enumerating objects: 227, done.[K
remote: Counting objects: 100% (227/227), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 227 (delta 62), reused 192 (delta 40), pack-reused 0 (from 0)[K
Receiving objects: 100% (227/227), 33.94 KiB | 534.00 KiB/s, done.
Resolving deltas: 100% (62/62), done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libaopalliance-java libapache-pom-java libatinject-jsr330-api-java
  libcdi-api-java libcommons-cli-java libcommons-io-java libcommons-lang3-java
  libcommons-parent-java libgeronimo-annotation-1.3-spec-java
  libgeronimo-interceptor-3.0-spec-java libguava-java libguice-java
  libhawtjni-runtime-java libjansi-java libjansi-native-java libjsr305-java
  libmaven-parent-java libmaven-resolver-java libmaven-shared-utils-java
  libmaven3-core-java libplexus-cipher-java libplexus-classworl

# **Preprocessing the data set**
Tokenization, Removing stop words and Cleaning

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Initialize Porter stemmer
stemmer = PorterStemmer()

In [None]:
def Stem_text(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    # print (tokens)
    return ' '.join(stemmed_tokens)

def clean(text):
   text = re.sub(r"[\.\,\#_\|\:\?\?\/\=\@]", " ", text) # remove special characters
   text = re.sub(r'\t', ' ', text) # remove tabs
   text = re.sub(r'\n', ' ', text) # remove line jump
   text = re.sub(r"\s+", " ", text) # remove extra white space
   text = text.strip()
   return text

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words] #Lower is used to normalize al the words make them in lower case
    # print('Tokens are:',tokens,'\n')
    return ' '.join(filtered_tokens)

#we need to process the query also as we did for documents
def preprocess(sentence):
  sentence = clean(sentence)
  sentence = remove_stopwords(sentence)
  sentence = Stem_text(sentence)
  return sentence

# Load Data for the Indexer

In [None]:
import zipfile
zip_file_name = 'cisi.zip'
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall('cisi_dataset')
!ls cisi_dataset

CISI.ALL  CISI.QRY  CISI.REL


In [None]:
import os
import pandas as pd

def load_dataset(data_dir):
    documents_path = os.path.join(data_dir, 'CISI.ALL')
    queries_path = os.path.join(data_dir, 'CISI.QRY')
    qrels_path = os.path.join(data_dir, 'CISI.REL')

    documents_df = read_documents(documents_path)
    queries_df = read_queries(queries_path)
    qrels_df = read_qrels(qrels_path)
    return documents_df, queries_df, qrels_df

def read_documents(documents_path):
    with open(documents_path, 'r') as file:
        lines = file.readlines()

    documents = []
    current_document = None

    for line in lines:
        if line.startswith('.I'):
            if current_document is not None:
                current_document['Text'] = current_document['Text'].split('\t')[0].strip()
                documents.append(current_document)
            current_document = {'ID': line.strip().split()[1], 'Text': ''}
        elif line.startswith('.T'):
            continue
        elif line.startswith('.A') or line.startswith('.B') or line.startswith('.W') or line.startswith('.X'):
            continue
        else:
            current_document['Text'] += line.strip() + ' '

    if current_document is not None:
        current_document['Text'] = current_document['Text'].split('\t')[0].strip()
        documents.append(current_document)

    documents_df = pd.DataFrame(documents)
    return documents_df

def read_queries(queries_path):
    with open(queries_path, 'r') as file:
        lines = file.readlines()

    query_texts = []
    query_ids = []
    current_query_id = None
    current_query_text = []

    for line in lines:
        if line.startswith('.I'):
            if current_query_id is not None:
                query_texts.append(' '.join(current_query_text))
                current_query_text = []
            current_query_id = line.strip().split()[1]
            query_ids.append(current_query_id)
        elif line.startswith('.W'):
            continue
        elif line.startswith('.X'):
            break
        else:
            current_query_text.append(line.strip())

    query_texts.append(' '.join(current_query_text))

    queries_df = pd.DataFrame({
        'qid': query_ids,
        'raw_query': query_texts})

    return queries_df

def read_qrels(qrels_path):
    qrels_df = pd.read_csv(qrels_path, sep='\s+', names=['qid','Q0','docno','label'])
    return qrels_df


In [None]:
data_dir = '/content/cisi_dataset'
documents_df, queries_df, qrels_df = load_dataset(data_dir)
documents_df['Text'][0]

"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 1"

In [None]:
documents_df

Unnamed: 0,ID,Text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi..."
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar..."
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz..."
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'..."
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl..."
...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex..."
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b..."
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude..."
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ..."


In [None]:
queries_df

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req..."
2,3,What is information science? Give definitions where possible.
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...
...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c..."
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a..."
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W..."
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi..."


In [None]:
qrels_df

Unnamed: 0,qid,Q0,docno,label
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0
...,...,...,...,...
3109,111,422,0,0.0
3110,111,448,0,0.0
3111,111,485,0,0.0
3112,111,503,0,0.0


In [None]:
#the docno will be our ID
documents_df["docno"]=documents_df["ID"].astype(str)
documents_df

Unnamed: 0,ID,Text,docno
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi...",1
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar...",2
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz...",3
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'...",4
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl...",5
...,...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex...",1456
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b...",1457
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude...",1458
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ...",1459


In [None]:
queries_df["qid"]=queries_df["qid"].astype(str)
queries_df

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req..."
2,3,What is information science? Give definitions where possible.
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...
...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c..."
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a..."
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W..."
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi..."


In [None]:
documents_df['processed_text'] = documents_df['Text'].apply(preprocess)
documents_df

Unnamed: 0,ID,Text,docno,processed_text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi...",1,18 edit dewey decim classif comaromi j p present studi histori dewey decim classif first edit ddc publish 1876 eighteenth edit 1971 futur edit con...
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar...",2,use made technic librari slater report analysi 6300 act use 104 technic librari unit kingdom librari use one aspect wider pattern inform use infor...
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz...",3,two kind power essay bibliograph control wilson p relationship organ control write organ control knowledg inform inevit enter stori write contain ...
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'...",4,system analysi univers librari ; final report research project buckland k establish nine new univers 1960 's provok highli stimul re-examin natur ...
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl...",5,librari manag game report research project brophi p although use game profession educ becom widespread last decad method use number field mani hun...
...,...,...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex...",1456,world dynam forrest j w last sever decad interest econom develop popul growth world environ expand rapidli world-wid stress increas mani individu ...
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b...",1457,world trend librari educ bramley g one signific aspect evolut librarianship twentieth centuri emerg librari school potent factor shape new philoso...
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude...",1458,legal restrict exploit patent monopoli econom analysi baxter w patent law confer patente power exclud other make use sell invent further constitut...
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ...",1459,languag thought poluskin v book consid basic aspect complex problem - histor social essenc languag thought interact histor evolut essenc linguist ...


In [None]:
queries_df["query"]=queries_df["raw_query"].apply(preprocess)
queries_df

Unnamed: 0,qid,raw_query,query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...,problem concern make descript titl difficulti involv automat retriev articl approxim titl usual relev content articl titl
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req...",actual pertin data oppos refer entir articl retriev automat respons inform request
2,3,What is information science? Give definitions where possible.,inform scienc give definit possibl
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.,imag recognit method automat transform print text computer-readi form
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...,special train ordinari research businessmen need proper inform manag unobstruct use inform retriev system problem like encount
...,...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c...",program machine-medi search toliv techniqu onlin instruct assist bibliograph data base searcher call individu instruct data access ( iida ) develo...
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a...",author cocit literatur measur intellectu structur white h griffith b c shown map particular area scienc case inform scienc done use author unit an...
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W...",progress document word process introduct apprais whitehead j `` offic futur `` `` offic technolog `` `` word process `` `` electron mail `` `` ele...
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi...",document cluster use invert file approach willett p autom document cluster procedur describ requir use inter-docu similar matrix independ order do...


# Indexing:

In [None]:
if not pt.started():
  # In this lab, we need to specify that we start PyTerrier with PRF enabled
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

  if not pt.started():


terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done
terrier-assemblies 5.11 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
The following code will have the same effect:
pt.java.add_package('com.github.terrierteam', 'terrier-prf', '-SNAPSHOT')
pt.java.init() # optional, forces java initialisation
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])


In [None]:
indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
# index the text, record the docnos as metadata
index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])
print(index_ref.toString())

  indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)


./DatasetIndex/data.properties


In [None]:
index = pt.IndexFactory.of(index_ref)

In [None]:
query="book"
query = preprocess(query)
query

'book'

In [None]:
splited_query = query.split()
len(splited_query)

1

In [None]:
# Function to split text into tokens
def split_text(text):
    return [doc.split() for doc in text]

# Function to find documents that contain the same tokens as the query
def find_matching_docs(query_tokens, doc_tokens):
    matching_docs = []
    for i, doc in enumerate(doc_tokens):
        for token in query_tokens:
            if token in doc:
                matching_docs.append(i)
                break  # No need to continue checking once a match is found
    return matching_docs

# Function to retrieve and print documents
def retrieve_docs(document_df, matching_doc_indices):
    for doc_idx in matching_doc_indices:
        print(f"Document Number {doc_idx}: \n {document_df['processed_text'].iloc[doc_idx]}")

# Split documents into tokens
doc_tokens = split_text(documents_df['processed_text'])

# Find documents containing query tokens
matching_doc_indices = find_matching_docs(splited_query, doc_tokens)

# Retrieve and print matching documents
retrieve_docs(documents_df, matching_doc_indices)


Document Number 6: 
 academ librari build guid architectur issu solut ellsworth r e book attempt present repres exampl success architectur solut import problem librarian architect face plan new colleg univers librari build remodel enlarg exist structur attempt make case studi evalu done ellsworth mason brown yale present exampl unsuccess solut except show avoid mistak case librari identifi 7
Document Number 7: 
 academ librari essay honor guy r lyle farber e import staff member ' individu develop apprenticeship administr perhap signific attitud one acquir work guy engend insist librarian must interest knowledg content materi dealt love literatur respect scholarship admir good write read manifest mani way notabl admonit though primarili research librari must constantli keep mind oblig collect contemporari poetri fiction belles-lett primarili librari staff felt respons well `` gener `` book cross disciplinari line fell disciplin book faculti mostli concern research materi apt overlook bu

In [None]:
# Rank the retrievd documents based ranking algorithm (TF-IDF)
tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})

  tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})


In [None]:
results=tfidf_retr.search(query)
results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,238,239,0,2.744382,book
1,1,925,926,1,2.696299,book
2,1,183,184,2,2.678758,book
3,1,1242,1243,3,2.669722,book
4,1,234,235,4,2.650217,book
...,...,...,...,...,...,...
290,1,375,376,290,1.004148,book
291,1,1264,1265,291,1.004148,book
292,1,1247,1248,292,0.902921,book
293,1,16,17,293,0.679622,book


In [None]:
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150)

In [None]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

fatal: destination path 'terrier-prf' already exists and is not an empty directory.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
maven is already the newest version (3.6.3-5).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
/content/terrier-prf
[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m----------------------< [0;36morg.terrier:terrier-prf[0;1m >-----------------------[m
[[1;34mINFO[m] [1mBuilding terrier-prf 0.2-SNAPSHOT[m
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mmaven-resources-plugin:2.6:resources[m [1m(default-resources)[m @ [36mterrier-prf[0;1m ---[m
[[1;34mINFO[m] Using 'UTF-8' encoding to copy filtered resources.
[[1;34mINFO[m] skip non existing resourceDirectory /content/terrier-prf/src/main/resources
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mma

In [None]:
# Define our retrieval model
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)

result = bm25.search(query)
result

  bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)


Unnamed: 0,qid,docid,docno,rank,score,query
0,1,238,239,0,3.871712,book
1,1,925,926,1,3.803877,book
2,1,183,184,2,3.779131,book
3,1,1242,1243,3,3.766383,book
4,1,234,235,4,3.738867,book
5,1,235,236,5,3.691001,book
6,1,270,271,6,3.680762,book
7,1,1033,1034,7,3.651497,book
8,1,237,238,8,3.621458,book
9,1,289,290,9,3.597591,book


In [None]:
documents_df[['Text']][documents_df['docno'].isin(results['docno'].loc[0:4].tolist())]

Unnamed: 0,Text
183,"How Biomedical Investigators Use Library Books Raisig, L. Miles Smith, Meredith Cuff, Renata Kilgour, Frederick G. Relatively few studies have bee..."
234,"Book Catalogs Tauber, M.F. In the intervening years since the appearance of the first collection of papers concerning book catalogs (Kingery, Robe..."
238,"Buyers and Borrowers Mann, P.H. This is the second book based on studies into social aspects of book reading. The present book is largely a report..."
925,"The Lending of Books to One Another by Libraries Green, S.S. It would add greatly to the usefulness of our reference libraries if an agreement sho..."
1242,"Rare Book Librarianship Cave, R. Although there is an extensive and enjoyable literature on the subject of rare books, most of it is concerned wit..."


In [None]:
# "rewrite" function from PyTerrier will be used to expand queries specifying RM3 as the model
# fb_docs ==> no. expansion documents
# fb_terms ==> no. expansion terms
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the BM25 will be fed into the RM3 expander for query expansion.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]

expanded_query

'applypipeline:off book^0.600000024 publish^0.043340053 reader^0.030614737 read^0.032347273 investig^0.035177320 librarianship^0.039690748 censorship^0.042064372 catalog^0.067117549 aggress^0.033343930 second^0.037404433 rare^0.038899545'

In [None]:
# Just print the expanded query with term scores
for s in expanded_query.split()[1:]:
  print(s)

print("\n" + query)

book^0.600000024
publish^0.043340053
reader^0.030614737
read^0.032347273
investig^0.035177320
librarianship^0.039690748
censorship^0.042064372
catalog^0.067117549
aggress^0.033343930
second^0.037404433
rare^0.038899545

book


In [None]:
# After that you can search using the expanded query
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = bm25.search(expanded_query_formatted)

print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

#Let's check the tweets text for the top 5 retrieved tweets
documents_df[['Text']][documents_df['docno'].isin(results_wqe['docno'].loc[0:5].tolist())]

   Before Expansion    After Expansion
   docid_1   score_1  docid_2   score_2
0      238  2.744382      237  5.299527
1      925  2.696299     1242  5.207872
2      183  2.678758      238  5.004232
3     1242  2.669722     1033  4.947556
4      234  2.650217      234  4.916927


Unnamed: 0,Text
234,"Book Catalogs Tauber, M.F. In the intervening years since the appearance of the first collection of papers concerning book catalogs (Kingery, Robe..."
235,"Book Publishing: What it Is, What it Does Dessauer, J.P. We speak of book publishing as an industry and as a profession. Both designations are ce..."
237,"Book Selection and Censorship Moon, E. When is a librarian's decision not to include a book in his library collection an act of book selection, an..."
238,"Buyers and Borrowers Mann, P.H. This is the second book based on studies into social aspects of book reading. The present book is largely a report..."
1033,"Man's Aggression Montagu, M.F.A. The purpose of this book is to inquire into the validity of the views on human nature expressed in the widely rea..."
1242,"Rare Book Librarianship Cave, R. Although there is an extensive and enjoyable literature on the subject of rare books, most of it is concerned wit..."


In [None]:
from transformers import AutoTokenizer, AutoModel
import transformers

model_name = "bert-base-uncased"

bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Load your dataset
data = documents_df

# Select the text column from the dataset
text_column = data["Text"]

# Tokenize each text in the column
tokenized_texts = []
token_ids = []

for text in text_column:
    tokenized_text = bert_tokenizer.tokenize(text)
    token_ids.append(bert_tokenizer.convert_tokens_to_ids(tokenized_text))
    tokenized_texts.append(tokenized_text)

# Print the first 5 tokenized texts and their corresponding IDs
for i in range(5):
    print(f"Original text {i+1}: {text_column[i]}")
    print(f"Tokenized text {i+1}: {tokenized_texts[i]}")
    print(f"Token IDs of text {i+1}: {token_ids[i]}\n")

Original text 1: 18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 1
Tokenized text 1: ['18', 'editions', 'of', 'the', 'dewey', 'decimal', 'classifications', 'coma', '##rom', '##i', ',', 'j', '.', 'p', '.', 'the', 'present', 'study', 'is', 'a', 'history', 'of', 'the', 'dewey', 'decimal', 'classification', '.', 'the', 'first', 'edition', 'of', 'the', 'dd', '##c', 'was', 'published', 'in', '1876', ',', 'the', 'eighteenth', 'edition', 'in', '1971', 

In [None]:
!pip install flask_ngrok



In [None]:
df2 = documents_df.head(50)

df2 = df2.to_dict()

df2

{'ID': {0: '1',
  1: '2',
  2: '3',
  3: '4',
  4: '5',
  5: '6',
  6: '7',
  7: '8',
  8: '9',
  9: '10',
  10: '11',
  11: '12',
  12: '13',
  13: '14',
  14: '15',
  15: '16',
  16: '17',
  17: '18',
  18: '19',
  19: '20',
  20: '21',
  21: '22',
  22: '23',
  23: '24',
  24: '25',
  25: '26',
  26: '27',
  27: '28',
  28: '29',
  29: '30',
  30: '31',
  31: '32',
  32: '33',
  33: '34',
  34: '35',
  35: '36',
  36: '37',
  37: '38',
  38: '39',
  39: '40',
  40: '41',
  41: '42',
  42: '43',
  43: '44',
  44: '45',
  45: '46',
  46: '47',
  47: '48',
  48: '49',
  49: '50'},
 'Text': {0: "18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dew

In [None]:
def sui(df2 , que):
 i = 0

 quer = preprocess(que)

 docs_id = []

 for key, value in df2.items():
   if key == 'processed_text':
         val = value.values()
         for doc in val:
           terms = doc.split()
           for term in terms:
             if term == quer and i not in docs_id:
               docs_id.append(f'''Document number {i} -----> \n{documents_df["Text"][i]}''')
           i = i + 1
 return docs_id

In [None]:
query2 = "book"

x = sui(df2 , query2)
x

['Document number 6 -----> \nAcademic Library Buildings A Guide to Architectural Issues and Solutions Ellsworth, R.E. This book attempts to present representative examples of successful architectural solutions to the important problems librarians and architects face in planning new college and university library buildings or in remodeling and enlarging existing structures.  It does not attempt to make case study evaluations, as was done by Ellsworth Mason for Brown and Yale.  Nor does it present examples of unsuccessful solutions except to show how to avoid mistakes, and in these cases the libraries will not be identified. 7',
 'Document number 7 -----> \nThe Academic Library Essays in Honor of Guy R. Lyle Farber, E.I. As important for staff members\' individual development as was the apprenticeship in administration, perhaps the most significant attitude one acquired while working for Guy was engendered by his insistence that librarians must be interested in and knowledgeable about th

In [None]:
from google.colab.output import eval_js
print (eval_js("google.colab.kernel.proxyPort(5000)"))

https://5000-gpu-t4-s-n9r0d4rair2w-a.us-west4-0.prod.colab.dev


In [None]:
!pip install flask flask-ngrok



In [None]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Downloading pyngrok-7.2.8-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.8


In [None]:
from pyngrok import ngrok

# Replace 'your_authtoken' with the actual token you copied from the ngrok dashboard
ngrok.set_auth_token('2wv0wMPU0JZKPgLLUfmJrAQoJ6h_3tnKUkPqRdEDaCg4C5G6N')

# Now connect to ngrok
public_url = ngrok.connect(5000)
print(f" * ngrok tunnel \"{public_url}\" -> http://127.0.0.1:5000")


 * ngrok tunnel "NgrokTunnel: "https://8b1f-34-125-186-49.ngrok-free.app" -> "http://localhost:5000"" -> http://127.0.0.1:5000


In [None]:
from flask import Flask, request
from pyngrok import ngrok  # Import pyngrok
import json

# Authenticate ngrok with your actual authtoken
ngrok.set_auth_token('2wv0wMPU0JZKPgLLUfmJrAQoJ6h_3tnKUkPqRdEDaCg4C5G6N')  # Replace with your ngrok token

# Initialize Flask app
app = Flask(__name__)

# Open an ngrok tunnel to the Flask app on port 5000
public_url = ngrok.connect(5000)
print(f" * ngrok tunnel \"{public_url}\" -> http://127.0.0.1:5000")

@app.route("/")
def home():
    return """
    <style>
        body {
            background-color: #ccffcc; /* Light green background */
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: flex-start;
            min-height: 100vh; /* Full viewport height */
            padding-top: 20px; /* Space at the top */
        }

        .search-container {
            display: flex;
            align-items: center;
            justify-content: center;
            margin-bottom: 20px; /* Space between search bar and content */
        }

        input[type="text"] {
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 20px;
            margin-right: 10px;
            width: 200px;
            box-sizing: border-box;
        }

        button {
            padding: 10px 20px;
            background-color: #fff;
            color: #007bff; /* Blue text */
            border: none;
            border-radius: 20px;
            cursor: pointer;
            transition: background-color 0.3s, color 0.3s; /* Smooth transition */
        }

        button:hover {
            background-color: #0056b3; /* Darker blue on hover */
            color: white; /* White text on hover */
        }
    </style>

    <div class="search-container">
        <input type="text" id="searchInput" placeholder="Enter your query...">
        <button onclick="search()">Search</button>
    </div>

    <div id="searchResult"></div>

    <script>
        function search() {
            var searchTerm = document.getElementById("searchInput").value;
            fetch('/search', {
                method: 'POST',
                body: JSON.stringify({ query: searchTerm }),
                headers:{
                    'Content-Type': 'application/json'
                }
            })
            .then(response => response.json())
            .then(data => {
                var resultDiv = document.getElementById("searchResult");
                resultDiv.innerHTML = "<h2>Relevant Documents IDs:</h2>";
                if (data.results.length === 0) {
                    resultDiv.innerHTML += "<p>No documents found</p>";
                } else {
                    data.results.forEach(doc => {
                        resultDiv.innerHTML += "<p>" + doc + "</p>";
                    });
                }
            })
            .catch(error => {
                console.error('Error occurred during fetch:', error);
            });
        }
    </script>
    """

@app.route("/search", methods=['POST'])
def search():
    query = request.json['query']
    # Assuming you have a function 'sui' that returns search results based on the query
    results = sui(df2, query)
    return json.dumps({'results': results})

# Start the Flask app (no need to specify host and port here, ngrok handles it)
app.run()


 * ngrok tunnel "NgrokTunnel: "https://927f-34-125-186-49.ngrok-free.app" -> "http://localhost:5000"" -> http://127.0.0.1:5000
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [10/May/2025 20:08:43] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/May/2025 20:08:44] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [10/May/2025 20:08:50] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/May/2025 20:08:51] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/May/2025 20:08:51] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [10/May/2025 20:09:05] "POST /search HTTP/1.1" 200 -
