# Data Librarian - Modul 2 PyTerrier Tutorial
## This notebook is based on a CIKM workshop
https://github.com/terrier-org/cikm2021tutorial/blob/main/notebooks

In [None]:
#learn more about pyterrier: https://pyterrier.readthedocs.io/en/latest/

In [None]:
#install pyterrier
%pip install python-terrier

In [None]:
%pip install nltk

In [None]:
import pickle
from pathlib import Path
from urllib.parse import quote

import nltk
import numpy as np
import pandas as pd
import pyterrier as pt
import requests

In [None]:
query = "artificial intelligence"
#percent-encode the query so that spaces and reserved characters (/, ?, #, &)
#inside a search term cannot break the path of the request URL
url = f"https://www.bibsonomy.org/json/search/{quote(query, safe='')}?items=1000"

In [None]:
#fail fast: the timeout keeps the cell from hanging forever and raise_for_status()
#reports HTTP errors here instead of as a confusing JSON/KeyError in a later cell
res = requests.get(url, timeout=30)
res.raise_for_status()

In [None]:
data = res.json()

In [None]:
#build a frame from the returned items and keep only the rows typed as "Publication"
df_ai = pd.DataFrame(data["items"]).loc[lambda items: items["type"] == "Publication"]

In [None]:
df_ai

In [None]:
#keeps only the columns that have a value in at least 50% of the rows (i.e. drops columns that are missing in more than half of the rows)
df_ai_filtered = df_ai.dropna(axis=1, thresh=len(df_ai)*0.5)

In [None]:
df_ai_filtered.head(10)

In [None]:
df_ai_filtered.columns

In [None]:
#let's only keep the columns which seem useful for us
useful_coulmns = ['id', 'tags', 'label', 'description', 'date', 'changeDate', 'url', 'pub-type', 'year', 'author', 'authors', 'publisher']
#.copy() yields an independent frame, so adding columns to it in a later cell
#does not trigger pandas' SettingWithCopyWarning
df_ai_filtered = df_ai_filtered[useful_coulmns].copy()

In [None]:
#initialise pyterrier
#the started() guard keeps this cell safe to re-run: init() is skipped when pyterrier is already started
if not pt.started():
    pt.init()

In [None]:
#pyterrier expects a docno field; the publication label is used as the text to index
#assign() builds a new frame instead of writing columns into a possibly sliced one,
#which avoids pandas' SettingWithCopyWarning
df_ai_filtered = df_ai_filtered.assign(
    docno=df_ai_filtered['id'],
    text=df_ai_filtered['label'],
)

In [None]:
#excursus indexing

In [None]:
#now we start to index our data
#more details at https://pyterrier.readthedocs.io/en/latest/terrier-indexing.html

#folder (relative to the working directory) that is handed to the indexer
index_folder = "./ai_index"

#overwrite=True: presumably replaces an index that already exists in that folder,
#so that this cell can be re-run - TODO confirm against the pyterrier docs
indexer = pt.DFIndexer(index_folder, overwrite=True)
#the 'text' column is what gets indexed; the 'docno' column is passed along with it
index_ref = indexer.index(df_ai_filtered['text'], df_ai_filtered['docno'])

print(f"path to our index: {index_ref.toString()}")

In [None]:
#load our index
index = pt.IndexFactory.of(index_ref)

In [None]:
print(index.getCollectionStatistics().toString())

In [None]:
#term -> term_id Nt Tf
#Nt: In how many documents does the term occur
#TF: How often does the term occur in total

for kv in index.getLexicon():
    print(f"{kv.getKey()} -> {kv.getValue().toString()}")

In [None]:
term = "game"
index.getLexicon()[term].toString()

In [None]:
#how often do the terms occur
#maps every lexicon key to the `frequency` attribute of its lexicon entry
term_freq_dict = {
    lex_entry.getKey(): lex_entry.getValue().frequency
    for lex_entry in index.getLexicon()
}

In [None]:
#sort all terms from the index in descending order 
term_freq_dict_sorted = sorted(term_freq_dict.items(), key=lambda x: x[1], reverse=True)

In [None]:
#show the 30 most popular entries
top_k = 30
term_freq_dict_sorted[:top_k]

In [None]:
#define our search engine
search_engine = pt.BatchRetrieve(index, wmodel="Tf")

In [None]:
search_engine.search("system")

So the `search()` method returns a dataframe with columns:
 - `qid`: this is by default "1", since it's our first and only query
 - `docid`: Terrier's internal integer for each document
 - `docno`: the external (string) unique identifier for each document
 - `score`: since we use the `Tf` weighting model, this score corresponds to the total frequency of the query (terms) in each document
 - `rank`: A handy attribute showing the descending order by score
 - `query`: the input query

As expected, the `Tf` weighting model used here only counts the frequencies of the query terms in each document, i.e.:
$$
score(d,q) = \sum_{t \in q} tf_{t,d}
$$

In [None]:
search_engine.search("intelligent education")

In [None]:
#what do we need for the inverse document frequency?

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [None]:
#stem the term to its base form
stem = stemmer.stem("learning")
stem

In [None]:
#in how many documents does the stem 'learn' occur?
lexicon = index.getLexicon()
lexicon[stem].getDocumentFrequency()

In [None]:
#how many documents occur in our index?
index.getCollectionStatistics().numberOfDocuments

In [None]:
def get_idf_for_term(term, index, term_stemmer=None):
    """Return the base-10 logarithm of the inverse document frequency of `term`.

    The term is stemmed first and the stem is looked up in the lexicon of
    `index`. The result is log10(N / df), where N is the number of documents
    in the index and df is the document frequency of the stem.

    Parameters
    ----------
    term : str
        The (unstemmed) term to look up.
    index :
        Index object providing `getLexicon()` and `getCollectionStatistics()`.
    term_stemmer : optional
        Object with a `stem(word)` method. When omitted, the notebook-level
        `stemmer` is used, so existing two-argument calls behave as before.

    Returns
    -------
    float or None
        The log-scaled idf, or None when the stem is not in the lexicon.
    """
    #fall back to the notebook-level stemmer so that two-argument calls keep working
    active_stemmer = stemmer if term_stemmer is None else term_stemmer
    stemmed_term = active_stemmer.stem(term)

    lex = index.getLexicon()
    if stemmed_term not in lex:
        #unknown term: there is no document frequency to compute an idf from
        return None

    df_term = lex[stemmed_term].getDocumentFrequency()
    N = index.getCollectionStatistics().numberOfDocuments

    #inverse document frequency, log-scaled (base 10);
    #np.log10 avoids the rounding error of computing log(x) / log(10)
    return np.log10(N / df_term)

In [None]:
get_idf_for_term("learning", index)

In [None]:
#define your own tf_idf method here:

def calc_tf_idf(query, docno, index):
    """Exercise stub: compute the tf-idf score of document `docno` for `query`.

    The body is left for the reader to implement; as written it ignores its
    arguments and always returns 0.
    """
    #remember that tfidf is the product of two components
    #hint: the tf model search result contains tf frequencies
    return 0

In [None]:
query = "intelligent education"
docno = "https://www.bibsonomy.org/bibtex/201f2eb94f27fe662c37249be37619d8b/dblp"

print(f"The tf-idf for query: {query} and document: {docno} is {calc_tf_idf(query, docno, index)}")

In [None]:
search_engine_tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")

In [None]:
search_engine_tfidf.search("intelligent education")

In [None]:
#Now we want to build an index with multiple fields

In [None]:
df_ai_filtered.columns

In [None]:
#transform dataframe into list of dictionaries
ai_dict = df_ai_filtered.to_dict(orient='records')

In [None]:
#show only a few records; displaying the whole list would flood the cell output
ai_dict[:3]

In [None]:
index_folder_mult = "./ai_index_mult"

#record keys handed to the indexer as `fields`
#NOTE(review): 'docno' is listed here as well as in `meta` below - confirm that the identifier itself should be indexed as text
fields=['docno', 'text', 'tags', 'description']

#meta: presumably the maximum stored length per metadata key - TODO confirm against the pyterrier docs
indexer_mult = pt.IterDictIndexer(index_folder_mult, meta={'docno': 200, 'text': 4096}, overwrite=True)
index_ref_mult = indexer_mult.index(ai_dict, fields=fields)

print(f"path to our index: {index_ref_mult.toString()}")

In [None]:
index_mult = pt.IndexFactory.of(index_ref_mult)

In [None]:
print(index_mult.getCollectionStatistics().toString())

In [None]:
search_engine_mult = pt.BatchRetrieve(index_mult, wmodel="TF_IDF")

In [None]:
#use a dedicated name: `res` already holds the HTTP response fetched near the top of the
#notebook, and overwriting it with a dataframe would break re-running the `res.json()` cell
results_mult = search_engine_mult.search("intelligence")
results_mult

In [None]:
#save our data
output_path = Path("workspace/ai_publications.pkl")
#create the target folder on a fresh checkout; without it open() raises FileNotFoundError
output_path.parent.mkdir(parents=True, exist_ok=True)
#the context manager flushes and closes the file handle, even if pickling fails
with output_path.open("wb") as pkl_file:
    pickle.dump(df_ai_filtered, pkl_file)

In [None]:
%pip install streamlit