# Data Librarian - Modul 2 PyTerrier Tutorial
## This notebook is based on a CIKM workshop
https://github.com/terrier-org/cikm2021tutorial/blob/main/notebooks

In [None]:
#learn more about pyterrier: https://pyterrier.readthedocs.io/en/latest/

In [None]:
#install pyterrier
%pip install python-terrier

In [None]:
%pip install nltk

In [66]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
import pyterrier as pt
import requests

In [67]:
query = "artificial intelligence"
url =  f"https://www.bibsonomy.org/json/search/{query}?items=1000"

In [68]:
#fetch the search results; the timeout prevents the cell from hanging forever
res = requests.get(url, timeout=30)
#fail fast on HTTP errors instead of trying to parse an error page as JSON later
res.raise_for_status()

In [69]:
data = res.json()

In [70]:
#build a frame from the returned items and keep only the entries of type "Publication"
items_df = pd.DataFrame(data["items"])
is_publication = items_df["type"] == "Publication"
df_ai = items_df[is_publication]

In [71]:
df_ai

Unnamed: 0,type,id,tags,intraHash,label,user,description,date,changeDate,count,...,keyword,issue,pdf,broken,optseries,position,category,impact,quartile,citedreferences
1000,Publication,https://www.bibsonomy.org/bibtex/24e338b04bc2a...,[710-714],4e338b04bc2abafd9fa0a1f4fe79103a,Artificial Intelligence in Agriculture,ijtsrd,,2021-04-13 13:15:04,2021-04-13 13:15:04,1,...,,,,,,,,,,
1001,Publication,https://www.bibsonomy.org/bibtex/29f3b59303571...,"[Artificial, Intelligence, Risks, and, Benefits]",9f3b593035714a894c265f4fb9c535e7,Artificial Intelligence Benefit and Risks,ijtsrd,,2020-05-14 10:40:49,2020-05-14 10:40:49,1,...,,,,,,,,,,
1002,Publication,https://www.bibsonomy.org/bibtex/24bc08fb79365...,"[ANN, Fuzzylogic, ElectricalEngineering, Artif...",4bc08fb79365b5bc698af1cd97fc1a75,Artificial Intelligence in Power Station,ijtsrd,,2020-01-10 09:23:37,2020-01-10 09:23:37,1,...,,,,,,,,,,
1003,Publication,https://www.bibsonomy.org/bibtex/29f87881e4dec...,"[Artificial, Intelligence, Markup, Chatbot, La...",9f87881e4decfde15a1af630c8ea27d5,Artificial Intelligence Based Training and Pla...,ijtsrd,,2019-03-26 12:35:44,2019-03-26 12:35:44,1,...,,,,,,,,,,
1004,Publication,https://www.bibsonomy.org/bibtex/263320f0ab96a...,"[Learning,, Systems,, Artificialintelligence, ...",63320f0ab96ae778b51b2376aa914f59,The Significance of Artificial Intelligence in...,ijtsrd,,2023-10-14 13:59:49,2023-10-14 13:59:49,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Publication,https://www.bibsonomy.org/bibtex/29ff0bdc4bca2...,"[ai,, artificial-intelligence]",9ff0bdc4bca236096c62f6237a855ae2,"Proceedings, The Twentieth National Conference...",baisemain,,2011-05-04 16:04:17,2011-05-04 16:04:33,6,...,,,,,,,,,,
1996,Publication,https://www.bibsonomy.org/bibtex/294a94c450ebc...,"[Attendance, Artificial, Intelligence, Faceboo...",94a94c450ebcd7360757ad46f8b996e7,Student Library Attendance using Face Recognition,ijtsrd,,2018-09-04 11:11:34,2018-10-02 11:01:26,1,...,,,,,,,,,,
1997,Publication,https://www.bibsonomy.org/bibtex/241bd03f0041c...,[imported],41bd03f0041c0127cbca5bbf69e872f3,The problem of survival from an algorithmic po...,perceptron,,2007-12-16 20:00:22,2007-12-16 20:00:24,2,...,,,,,,,,,,"ALPAYDIN E, 1991, TR91032 INT COMP SCI ; BROOK..."
1998,Publication,https://www.bibsonomy.org/bibtex/267fedfa4aa69...,"[paper, cognitive, ai, processing, v1500, know...",67fedfa4aa6926e1771d7ca74298f8ff,Problem Solving Techniques in Cognitive Science,flint63,,2012-05-30 10:44:28,2018-04-16 12:07:00,2,...,,,,,,,,,,


In [72]:
#keep only columns that have a non-missing value in at least 50% of the rows
#(thresh is the minimum number of non-NA entries a column needs to survive),
#i.e. columns that are missing in more than half of the rows are dropped
df_ai_filtered = df_ai.dropna(axis=1, thresh=len(df_ai)*0.5)

In [73]:
df_ai_filtered.columns

Index(['type', 'id', 'tags', 'intraHash', 'label', 'user', 'description',
       'date', 'changeDate', 'count', 'url', 'interHash', 'pub-type', 'year',
       'author', 'authors', 'volume', 'pages', 'bibtexKey', 'publisher'],
      dtype='object')

In [74]:
#let's only keep the columns which seem useful for us
useful_columns = ['id', 'tags', 'label', 'description', 'date', 'changeDate', 'url', 'pub-type', 'year', 'author', 'authors', 'publisher']
#.copy() yields an independent frame, so adding columns to it later
#does not trigger pandas' SettingWithCopyWarning
df_ai_filtered = df_ai_filtered[useful_columns].copy()

In [75]:
#initialise pyterrier only if it has not been started yet,
#so this cell is safe to run more than once in the same kernel
if not pt.started():
    pt.init()

In [77]:
#pyterrier expects a docno field (unique document identifier);
#the publication title ('label') is used as the text to index
#assign() returns a new frame instead of writing into a slice, which avoids
#SettingWithCopyWarning and keeps the cell safe to re-run
df_ai_filtered = df_ai_filtered.assign(
    docno=df_ai_filtered['id'],
    text=df_ai_filtered['label'],
)

In [78]:
#excursus indexing

In [79]:
#now we start to index our data
#more details at https://pyterrier.readthedocs.io/en/latest/terrier-indexing.html

#folder the index files are written to
index_folder = "./ai_index"

#overwrite=True: presumably replaces an index that already exists in index_folder (see the docs linked above)
indexer = pt.DFIndexer(index_folder, overwrite=True)
#index the 'text' column; the 'docno' column is passed along with it
index_ref = indexer.index(df_ai_filtered['text'], df_ai_filtered['docno'])

print(f"path to our index: {index_ref.toString()}")

path to our index: ./ai_index/data.properties


In [80]:
#load our index
index = pt.IndexFactory.of(index_ref)

In [81]:
print(index.getCollectionStatistics().toString())

Number of documents: 1000
Number of terms: 1375
Number of postings: 6122
Number of fields: 4
Number of tokens: 6360
Field names: [docno, text, tags, description]
Positions:   false



In [82]:
#term -> term_id Nt TF
#Nt: in how many documents does the term occur
#TF: how often does the term occur in total

#only show the first entries: printing the complete lexicon floods the notebook output
max_terms = 20

for position, kv in enumerate(index.getLexicon()):
    if position >= max_terms:
        break
    print(f"{kv.getKey()} -> {kv.getValue().toString()}")

0 -> term826 Nt=1 TF=1 maxTF=1 @{0 0 0} TFf=0,0,0,0
02 -> term588 Nt=2 TF=2 maxTF=1 @{0 2 6} TFf=0,0,0,0
1 -> term327 Nt=2 TF=2 maxTF=1 @{0 7 2} TFf=0,0,0,0
10 -> term544 Nt=3 TF=3 maxTF=1 @{0 12 2} TFf=0,0,0,0
10th -> term580 Nt=2 TF=2 maxTF=1 @{0 19 4} TFf=0,0,0,0
11 -> term115 Nt=3 TF=3 maxTF=1 @{0 24 6} TFf=0,0,0,0
11th -> term1142 Nt=4 TF=4 maxTF=1 @{0 32 0} TFf=0,0,0,0
12 -> term381 Nt=2 TF=2 maxTF=1 @{0 41 0} TFf=0,0,0,0
12th -> term1098 Nt=4 TF=4 maxTF=1 @{0 45 4} TFf=0,0,0,0
13 -> term114 Nt=5 TF=5 maxTF=1 @{0 54 4} TFf=0,0,0,0
13th -> term1212 Nt=3 TF=3 maxTF=1 @{0 66 2} TFf=0,0,0,0
14 -> term526 Nt=4 TF=4 maxTF=1 @{0 73 2} TFf=0,0,0,0
14th -> term1047 Nt=4 TF=4 maxTF=1 @{0 82 4} TFf=0,0,0,0
15 -> term204 Nt=4 TF=4 maxTF=1 @{0 92 6} TFf=0,0,0,0
15th -> term1059 Nt=4 TF=4 maxTF=1 @{0 100 0} TFf=0,0,0,0
16 -> term263 Nt=5 TF=5 maxTF=1 @{0 109 4} TFf=0,0,0,0
16th -> term281 Nt=5 TF=5 maxTF=1 @{0 120 4} TFf=0,0,0,0
17 -> term203 Nt=4 TF=4 maxTF=1 @{0 132 0} TFf=0,0,0,0
17th -> te

In [83]:
term = "game"
index.getLexicon()[term].toString()

'term46 Nt=13 TF=14 maxTF=2 @{0 4701 5} TFf=0,0,0,0'

In [84]:
#how often do the terms occur: map each lexicon term to its frequency
term_freq_dict = {
    entry.getKey(): entry.getValue().frequency
    for entry in index.getLexicon()
}

In [85]:
#sort all terms from the index in descending order 
term_freq_dict_sorted = sorted(term_freq_dict.items(), key=lambda x: x[1], reverse=True)

In [86]:
#show the 30 most popular entries
top_k = 30
term_freq_dict_sorted[:top_k]

[('intellig', 990),
 ('artifici', 958),
 ('confer', 95),
 ('proceed', 93),
 ('system', 64),
 ('ai', 63),
 ('approach', 49),
 ('advanc', 44),
 ('applic', 44),
 ('us', 43),
 ('comput', 41),
 ('gener', 36),
 ('base', 35),
 ('learn', 34),
 ('human', 31),
 ('ki', 30),
 ('nation', 29),
 ('research', 29),
 ('modern', 27),
 ('septemb', 27),
 ('german', 25),
 ('model', 25),
 ('educ', 24),
 ('network', 23),
 ('distribut', 22),
 ('intern', 22),
 ('annual', 21),
 ('develop', 21),
 ('germani', 21),
 ('impact', 21)]

In [87]:
#define our search engine
search_engine = pt.BatchRetrieve(index, wmodel="Tf")

In [88]:
search_engine.search("system")

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,323,https://www.bibsonomy.org/bibtex/24768ef52a6c0...,0,2.0,system
1,1,32,https://www.bibsonomy.org/bibtex/249f58604f738...,1,1.0,system
2,1,50,https://www.bibsonomy.org/bibtex/2f693b1d1c623...,2,1.0,system
3,1,59,https://www.bibsonomy.org/bibtex/2f75838770906...,3,1.0,system
4,1,116,https://www.bibsonomy.org/bibtex/2ee0b496413e1...,4,1.0,system
...,...,...,...,...,...,...
58,1,934,https://www.bibsonomy.org/bibtex/2c1d6d19484ec...,58,1.0,system
59,1,950,https://www.bibsonomy.org/bibtex/2ef217db6a5b4...,59,1.0,system
60,1,951,https://www.bibsonomy.org/bibtex/293fb4fd88de7...,60,1.0,system
61,1,968,https://www.bibsonomy.org/bibtex/20dac3cd23401...,61,1.0,system


So the `search()` method returns a dataframe with columns:
 - `qid`: this is by default "1", since it's our first and only query
 - `docid`: Terrier's internal integer for each document
 - `docno`: the external (string) unique identifier for each document
 - `score`: since we use the `Tf` weighting model, this score corresponds to the total frequency of the query (terms) in each document
 - `rank`: A handy attribute showing the descending order by score
 - `query`: the input query

As expected, the `Tf` weighting model used here only counts the frequencies of the query terms in each document, i.e.:
$$
score(d,q) = \sum_{t \in q} tf_{t,d}
$$

In [89]:
search_engine.search("intelligent education")

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,736,https://www.bibsonomy.org/bibtex/2b476e9f37ca1...,0,5.0,intelligent education
1,1,635,https://www.bibsonomy.org/bibtex/226a944b1f244...,1,3.0,intelligent education
2,1,713,https://www.bibsonomy.org/bibtex/224ac67b53792...,2,3.0,intelligent education
3,1,792,https://www.bibsonomy.org/bibtex/21c47c79fb260...,3,3.0,intelligent education
4,1,993,https://www.bibsonomy.org/bibtex/2dfb01a4c7c8d...,4,3.0,intelligent education
...,...,...,...,...,...,...
900,1,988,https://www.bibsonomy.org/bibtex/2025ecfe8993f...,900,1.0,intelligent education
901,1,989,https://www.bibsonomy.org/bibtex/24bc8dd6afe36...,901,1.0,intelligent education
902,1,990,https://www.bibsonomy.org/bibtex/28b6f3d4fdad8...,902,1.0,intelligent education
903,1,991,https://www.bibsonomy.org/bibtex/2f4c4085c258c...,903,1.0,intelligent education


In [None]:
#what do we need for the inverse document frequency?

In [90]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [91]:
#stem the term to its base form
stem = stemmer.stem("learning")
stem

'learn'

In [92]:
#in how many documents does the stem 'learn' occur?
lexicon = index.getLexicon()
lexicon[stem].getDocumentFrequency()

32

In [93]:
#how many documents occur in our index?
index.getCollectionStatistics().numberOfDocuments

1000

In [94]:
def get_idf_for_term(term, index):
    """Return the log-scaled inverse document frequency of a term.

    The term is stemmed with the notebook-level ``stemmer`` and looked up in
    the lexicon of ``index``. The result is ``log10(N / df)``, where ``N`` is
    the number of documents in the index and ``df`` is the number of
    documents containing the stemmed term.

    Args:
        term: The raw (unstemmed) query term.
        index: A Terrier index providing ``getLexicon()`` and
            ``getCollectionStatistics()``.

    Returns:
        The idf as a float. Terms that do not occur in the lexicon yield
        ``0.0`` so that they contribute nothing to a tf-idf sum instead of
        breaking the caller with ``None``.
    """
    lex = index.getLexicon()
    stemmed_term = stemmer.stem(term)

    # unknown term: no document contains it, so it cannot add to a score
    if stemmed_term not in lex:
        return 0.0

    df_term = lex[stemmed_term].getDocumentFrequency()
    n_docs = index.getCollectionStatistics().numberOfDocuments

    # inverse document frequency, dampened with the base-10 logarithm
    return np.log10(n_docs / df_term)

In [95]:
get_idf_for_term("learning", index)

1.494850021680094

In [None]:
#define your own tf_idf method here:

def calc_tf_idf(query, docno, index):
    """Exercise placeholder: compute the tf-idf score of ``docno`` for ``query``.

    Hints:
        * tf-idf is the product of two components.
        * The search result of the Tf model contains the term frequencies.

    Returns:
        Always 0 until the exercise is implemented.
    """
    return 0

In [96]:
search_engine_tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")

In [97]:
search_engine_tfidf.search("intelligent education")

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,277,https://www.bibsonomy.org/bibtex/201f2eb94f27f...,0,4.556954,intelligent education
1,1,761,https://www.bibsonomy.org/bibtex/2763f67bf4e7d...,1,4.556954,intelligent education
2,1,851,https://www.bibsonomy.org/bibtex/29cc0434b3e50...,2,4.556954,intelligent education
3,1,21,https://www.bibsonomy.org/bibtex/259133e526641...,3,4.211381,intelligent education
4,1,58,https://www.bibsonomy.org/bibtex/259133e526641...,4,4.211381,intelligent education
...,...,...,...,...,...,...
900,1,200,https://www.bibsonomy.org/bibtex/22113c27f2328...,900,0.323055,intelligent education
901,1,204,https://www.bibsonomy.org/bibtex/23934d5315fab...,901,0.323055,intelligent education
902,1,240,https://www.bibsonomy.org/bibtex/2201d72f3a55d...,902,0.323055,intelligent education
903,1,281,https://www.bibsonomy.org/bibtex/268ebe1d6569e...,903,0.323055,intelligent education


In [None]:
#Now we want to build an index with multiple fields

In [98]:
df_ai_filtered.columns

Index(['id', 'tags', 'label', 'description', 'date', 'changeDate', 'url',
       'pub-type', 'year', 'author', 'authors', 'publisher', 'docno', 'text'],
      dtype='object')

In [99]:
#transform dataframe into list of dictionaries
ai_dict = df_ai_filtered.to_dict(orient='records')

In [None]:
ai_dict

In [100]:
#folder the multi-field index is written to
index_folder_mult = "./ai_index_mult"

#keys of each record that are handed to the indexer as fields
fields=['docno', 'text', 'tags', 'description']

#meta maps metadata keys to a number -- presumably the maximum stored length per key; confirm in the pyterrier indexing docs
indexer_mult = pt.IterDictIndexer(index_folder_mult, meta={'docno': 200, 'text': 4096}, overwrite=True)
#ai_dict is the list of records created from df_ai_filtered
index_ref_mult = indexer_mult.index(ai_dict, fields=fields)

print(f"path to our index: {index_ref_mult.toString()}")

path to our index: ./ai_index_mult/data.properties


In [101]:
index_mult = pt.IndexFactory.of(index_ref_mult)

In [102]:
print(index_mult.getCollectionStatistics().toString())

Number of documents: 1000
Number of terms: 2174
Number of postings: 14507
Number of fields: 4
Number of tokens: 16458
Field names: [docno, text, tags, description]
Positions:   false



In [103]:
search_engine_mult = pt.BatchRetrieve(index_mult, wmodel="TF_IDF")

In [104]:
res = search_engine_mult.search("intelligence")
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,713,https://www.bibsonomy.org/bibtex/224ac67b53792...,0,0.936110,intelligence
1,1,333,https://www.bibsonomy.org/bibtex/29a73b5ea0550...,1,0.879252,intelligence
2,1,503,https://www.bibsonomy.org/bibtex/2f7cf941158e2...,2,0.879252,intelligence
3,1,655,https://www.bibsonomy.org/bibtex/2cd2e7a05ba63...,3,0.879252,intelligence
4,1,74,https://www.bibsonomy.org/bibtex/25ec32dd52e8d...,4,0.876368,intelligence
...,...,...,...,...,...,...
938,1,315,https://www.bibsonomy.org/bibtex/2b5cd3e615563...,938,0.389409,intelligence
939,1,613,https://www.bibsonomy.org/bibtex/2b4e425ae3b44...,939,0.389409,intelligence
940,1,345,https://www.bibsonomy.org/bibtex/210b214d3428a...,940,0.376594,intelligence
941,1,354,https://www.bibsonomy.org/bibtex/299846faf0a0d...,941,0.370498,intelligence


In [105]:
#save our data
output_path = "workspace/ai_publications.pkl"

#make sure the target folder exists so the cell also works in a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok=True)

#the with-block closes the file handle even if pickling fails
with open(output_path, "wb") as pkl_file:
    pickle.dump(df_ai_filtered, pkl_file)

In [None]:
#define search engine for TF
search_engine_mult_tf = pt.BatchRetrieve(index_mult, wmodel="Tf")
search_engine_mult_tf_idf = pt.BatchRetrieve(index_mult, wmodel="TF_IDF")

In [None]:
query = "robots health care machine learning"

In [None]:
search_engine_mult_tf.search(query)

In [110]:
#Obtain several docnos for further analysis

#run the search once and reuse the result instead of repeating the same query five times
top_docnos = search_engine_mult_tf.search(query)['docno']

docno0 = top_docnos.iloc[0]
docno1 = top_docnos.iloc[1]
docno2 = top_docnos.iloc[2]
docno3 = top_docnos.iloc[3]
docno4 = top_docnos.iloc[4]

In [111]:
docno0

'https://www.bibsonomy.org/bibtex/2dc0153e9b5a99a8daa2401050080370e/benem'

In [107]:
search_engine_mult_tf.search("learning")

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,358,https://www.bibsonomy.org/bibtex/27cda56873f7e...,0,4.0,learning
1,1,17,https://www.bibsonomy.org/bibtex/2a812df36a1a6...,1,3.0,learning
2,1,614,https://www.bibsonomy.org/bibtex/2aac90bd39bb6...,2,3.0,learning
3,1,37,https://www.bibsonomy.org/bibtex/22d52c90de1cd...,3,2.0,learning
4,1,69,https://www.bibsonomy.org/bibtex/26150b835dd92...,4,2.0,learning
...,...,...,...,...,...,...
64,1,736,https://www.bibsonomy.org/bibtex/2b476e9f37ca1...,64,1.0,learning
65,1,738,https://www.bibsonomy.org/bibtex/215ab989afb1a...,65,1.0,learning
66,1,749,https://www.bibsonomy.org/bibtex/2869af8eb05d2...,66,1.0,learning
67,1,975,https://www.bibsonomy.org/bibtex/2be5ada247e4f...,67,1.0,learning


In [108]:
def calc_tf_idf(query, docno, index):
    """Compute a log-weighted tf-idf score of one document for a query.

    For every whitespace-separated query term, the raw term frequency of the
    term in the document is read from a ``Tf`` retrieval over ``index``,
    dampened as ``1 + log10(tf)`` and multiplied by the idf returned by
    ``get_idf_for_term``. The per-term scores are summed. Intermediate values
    are printed for every term that occurs in the document.

    Args:
        query: Query string; terms are separated by whitespace.
        docno: External identifier of the document to score.
        index: Terrier index used for both the term frequencies and the idf.

    Returns:
        The summed tf-idf score; 0 if no query term occurs in the document.
    """
    # take tf from the index that was passed in, so that tf and idf always
    # refer to the same index (and the function does not depend on a global)
    tf_engine = pt.BatchRetrieve(index, wmodel="Tf")

    final_score = 0

    # split() without arguments ignores repeated, leading and trailing blanks,
    # so no empty "terms" are sent to the search engine
    for term in query.split():

        #search for the term
        res = tf_engine.search(term)

        #the term does not occur in this document: nothing to add
        tf_row = res[res['docno'] == docno]
        if len(tf_row) == 0:
            continue

        #how often does the term occur in the document
        tf = tf_row['score'].values[0]

        #let's apply the logarithm to the term frequency
        w_tf = 1 + np.log10(tf)

        #what's the inverse document frequency?
        idf = get_idf_for_term(term, index)
        if idf is None:
            #no idf available for this term (not in the lexicon): skip it
            continue

        #let's add the score for the term
        term_score = w_tf * idf
        final_score += term_score

        print(f"Values for the term: {term}")
        print(f"Term frequency: {tf}")
        print(f"Weight: {w_tf}")
        print(f"Inverse document frequency: {idf}")
        print(f"Term score: {term_score}")
        print()

    return final_score


In [112]:
calc_tf_idf(query, docno4, index_mult)

Values for the term: artificial
Term frequency: 3.0
Weight: 1.4771212547196624
Inverse document frequency: 0.031517051446064856
Term score: 0.046554506577075466

Values for the term: intelligence
Term frequency: 3.0
Weight: 1.4771212547196624
Inverse document frequency: 0.025488307262671633
Term score: 0.03764932040451781



0.08420382698159327

In [None]:
result = search_engine_mult_tf_idf.search(query)

In [None]:
result[result['docno'] == docno0]

In [113]:
query = "machine learning robots"

#counts behind the two measures (presumably judged by hand -- confirm):
#precision = relevant retrieved / retrieved, recall = relevant retrieved / all relevant
relevant_retrieved = 4
retrieved = 20
relevant_total = 22

print(f"Precision for query = {query}: {relevant_retrieved/retrieved}")
print(f"Recall for query = {query}: {relevant_retrieved/relevant_total}")

Precision for query = machine learning robots: 0.2
Recall for query = machine learning robots: 0.18181818181818182


In [114]:
query = "machine learning medicine"

#counts behind the two measures (presumably judged by hand -- confirm):
#precision = relevant retrieved / retrieved, recall = relevant retrieved / all relevant
relevant_retrieved = 8
retrieved = 20
relevant_total = 17

print(f"Precision for query = {query}: {relevant_retrieved/retrieved}")
print(f"Recall for query = {query}: {relevant_retrieved/relevant_total}")

Precision for query = machine learning medicine: 0.4
Recall for query = machine learning medicine: 0.47058823529411764


In [None]:
#search_engine_mult_tf.search("robot cyborg android")

In [None]:
#search_engine_mult_tf.search("medical medicine healthcare")

In [None]:
#Results for TF Scores

In [115]:
query = "machine learning robots"

#counts behind the two measures (presumably judged by hand -- confirm):
#precision = relevant retrieved / retrieved, recall = relevant retrieved / all relevant
relevant_retrieved = 3
retrieved = 20
relevant_total = 22

print(f"Precision for query = {query}: {relevant_retrieved/retrieved}")
print(f"Recall for query = {query}: {relevant_retrieved/relevant_total}")

Precision for query = machine learning robots: 0.15
Recall for query = machine learning robots: 0.13636363636363635
