In [None]:
#learn more about pyterrier: https://pyterrier.readthedocs.io/en/latest/

In [2]:
import pyterrier as pt
import requests
import pandas as pd
import pickle
import nltk
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the literature dump (JSON) that is loaded into a DataFrame.
LITERATURE_JSON = "/workspace/literature.json"
data = pd.read_json(LITERATURE_JSON)

In [20]:
# Bare last expression: the notebook renders `data` as this cell's output.
data

Unnamed: 0,query,title,url,year,authors,abstract,source,docno,text
0,artificial intelligence,Artificial Intelligence in Agriculture,https://www.ijtsrd.com/engineering/electrical-...,2021,"[Matthew N. O. Sadiku, Sarhan M. Musa, Abayomi...",Artificial Intelligence is one of the emerging...,bibsonomy,https://www.ijtsrd.com/engineering/electrical-...,Artificial Intelligence in Agriculture
1,artificial intelligence,Artificial Intelligence Benefit and Risks,https://www.ijtsrd.com/computer-science/artifi...,2020,[Seeta M. Chauhan],this article demonstrate Disadvantage of artif...,bibsonomy,https://www.ijtsrd.com/computer-science/artifi...,Artificial Intelligence Benefit and Risks
2,artificial intelligence,Artificial Intelligence in Power Station,https://www.ijtsrd.com/engineering/electrical-...,2019,"[P. Naveen, S. Nikitha, P. Sudeesh, V. Vaishnavi]",With increased competitiveness in power genera...,bibsonomy,https://www.ijtsrd.com/engineering/electrical-...,Artificial Intelligence in Power Station
3,artificial intelligence,Artificial Intelligence Based Training and Pla...,http://www.ijtsrd.com/computer-science/artific...,2018,"[Krishanu Deb, Pankaj Agrawal, Harish Nawale, ...",Training and placement cell in colleges is to ...,bibsonomy,http://www.ijtsrd.com/computer-science/artific...,Artificial Intelligence Based Training and Pla...
4,artificial intelligence,The Significance of Artificial Intelligence in...,https://www.ijtsrd.com/computer-science/artifi...,2023,[Dr. Atul Kumar Mishra],"In an increasingly digitalized world, the util...",bibsonomy,https://www.ijtsrd.com/computer-science/artifi...,The Significance of Artificial Intelligence in...
...,...,...,...,...,...,...,...,...,...
4737,large language model,Phonologically Aware Neural Model for Named En...,https://www.semanticscholar.org/paper/f1a8ff55...,2016,"[Akash Bharadwaj, David R. Mortensen, Chris Dy...",Named Entity Recognition is a well established...,semantic_scholar,https://www.semanticscholar.org/paper/f1a8ff55...,Phonologically Aware Neural Model for Named En...
4738,large language model,Large-Scale Distributed Language Modeling,https://www.semanticscholar.org/paper/591080c3...,2007,"[Ahmad Emami, K. Papineni, Jeffrey Scott Soren...",A novel distributed language model that has no...,semantic_scholar,https://www.semanticscholar.org/paper/591080c3...,Large-Scale Distributed Language Modeling
4739,large language model,BERT for Joint Intent Classification and Slot ...,https://www.semanticscholar.org/paper/476029ac...,2019,"[Qian Chen, Zhu Zhuo, Wen Wang]",Intent classification and slot filling are two...,semantic_scholar,https://www.semanticscholar.org/paper/476029ac...,BERT for Joint Intent Classification and Slot ...
4740,large language model,Language and Translation Model Adaptation usin...,https://www.semanticscholar.org/paper/b281a9d0...,2008,"[M. Snover, B. Dorr, R. Schwartz]","Traditionally, statistical machine translation...",semantic_scholar,https://www.semanticscholar.org/paper/b281a9d0...,Language and Translation Model Adaptation usin...


In [21]:
# Initialise PyTerrier only when it is not already running in this kernel,
# so executing the cell a second time does not call init() again.
already_started = pt.started()
if not already_started:
    pt.init()

In [22]:
# PyTerrier expects a `docno` field: the URL is reused for it, and the title
# becomes the `text` of each document.
# The year is cast to string to simplify indexing.
# NOTE(review): docno values should presumably be unique and non-empty —
# confirm that every record carries a usable URL.
data = data.assign(
    docno=data['url'],
    text=data['title'],
    year=data['year'].astype('str'),
)

In [23]:
# Transform the DataFrame into a list of dictionaries (one dict per row).
data_dict = data.to_dict(orient='records')

In [24]:
# Folder the index is written to.
index_folder_mult = "./ai_llm_index"

# Record keys that are passed to the indexer as fields.
fields = ['docno', 'text', 'authors', 'year', 'title']

# `meta` lists the keys kept as document metadata together with the length
# reserved for each; overwrite=True is passed so an existing index in the
# folder does not block the run.
indexer_mult = pt.IterDictIndexer(
    index_folder_mult,
    meta={'docno': 1024, 'text': 4096, 'year': 1024},
    overwrite=True,
)
index_ref_mult = indexer_mult.index(data_dict, fields=fields)

print(f"path to our index: {index_ref_mult.toString()}")

path to our index: ./ai_llm_index/data.properties


In [8]:
# Open the index behind the reference returned by the indexer.
index_mult = pt.IndexFactory.of(index_ref_mult)

In [9]:
# Print the index summary: document, term, posting and token counts plus field names.
collection_stats = index_mult.getCollectionStatistics()
print(collection_stats.toString())

Number of documents: 4742
Number of terms: 21695
Number of postings: 106267
Number of fields: 5
Number of tokens: 141709
Field names: [docno, text, authors, year, title]
Positions:   false



In [18]:
# Retriever over the index, configured with the TF_IDF weighting model.
search_engine_mult = pt.BatchRetrieve(index_mult, wmodel="TF_IDF")

In [19]:
# Run a single ad-hoc query; the resulting ranking is rendered as the cell output.
search_engine_mult.search("sentiment analysis")


Unnamed: 0,qid,docid,docno,rank,score,query
0,1,1694,https://www.ijtsrd.comengineering/computer-eng...,0,11.479152,sentiment analysis
1,1,744,https://www.ijtsrd.comcomputer-science/artific...,1,11.398638,sentiment analysis
2,1,2153,,2,11.013882,sentiment analysis
3,1,1711,https://www.ijtsrd.comcomputer-science/data-pr...,3,11.012437,sentiment analysis
4,1,1219,http://dblp.uni-trier.de/db/conf/ai/ai2011.htm...,4,10.794416,sentiment analysis
...,...,...,...,...,...,...
99,1,3874,https://www.semanticscholar.org/paper/00a40754...,99,3.355589,sentiment analysis
100,1,4306,https://www.semanticscholar.org/paper/bb065603...,100,3.168347,sentiment analysis
101,1,3904,https://www.semanticscholar.org/paper/202f2e0b...,101,3.124757,sentiment analysis
102,1,4508,https://www.semanticscholar.org/paper/68f14172...,102,1.807080,sentiment analysis


In [13]:
# Save our data. The context manager closes the file as soon as the dump
# finishes, so the pickle is flushed to disk and no file handle is leaked
# (the previous inline open(...) was never closed).
# NOTE(review): this path is relative ("workspace/..."), while the input is
# read from the absolute "/workspace/literature.json" — confirm that is intended.
with open("workspace/data_exercise.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)

In [25]:
def _year_from_2015(res):
    """Keep the rows of `res` whose `year` string compares >= '2015'."""
    return res[res["year"] >= '2015']


def _year_until_2020(res):
    """Keep the rows of `res` whose `year` string compares <= '2020'."""
    return res[res["year"] <= '2020']


def _rank_below_10(res):
    """Keep the rows of `res` whose `rank` is below 10."""
    return res[res["rank"] < 10]


# The retriever also returns the docno and year metadata, which the year
# filters below rely on.
search_engine = pt.BatchRetrieve(index_mult, wmodel="TF_IDF", metadata=["docno", "year"])

# Here we add 2 filters chained after the retrieval application.
year_filter = pt.apply.generic(_year_from_2015)
year_filter2 = pt.apply.generic(_year_until_2020)

# Defined for later use; it is not part of the pipeline composed below.
rank_filter = pt.apply.generic(_rank_below_10)

search_engine_year_filter = search_engine >> year_filter >> year_filter2

In [26]:
# Query the year-filtered pipeline: rows outside '2015'..'2020' are dropped,
# and the surviving rows keep the rank assigned by the retriever.
search_engine_year_filter.search("language")

Unnamed: 0,qid,docid,docno,year,rank,score,query
0,1,2519,https://livrepository.liverpool.ac.uk/3028272/...,2019,0,2.202182,language
2,1,2593,http://dblp.uni-trier.de/db/conf/vardial/vardi...,2016,2,2.149536,language
11,1,2651,http://dblp.uni-trier.de/db/journals/corr/corr...,2018,11,2.074889,language
14,1,2840,http://dblp.uni-trier.de/db/conf/itat/itat2016...,2016,14,2.074889,language
25,1,2571,https://d4mucfpksywv.cloudfront.net/better-lan...,2018,25,2.051146,language
...,...,...,...,...,...,...,...
989,1,2731,http://dblp.uni-trier.de/db/journals/corr/corr...,2020,989,1.609484,language
990,1,2740,http://dblp.uni-trier.de/db/conf/interspeech/i...,2015,990,1.609484,language
992,1,2809,http://dblp.uni-trier.de/db/conf/naacl/naacl20...,2018,992,1.609484,language
995,1,2909,http://dblp.uni-trier.de/db/journals/corr/corr...,2020,995,1.609484,language
