# Introduction

This notebook contains an implementation of TF-IDF-style retrieval (the ranking cells below use BM25 weighting) for the LEGSTAT IR Term Project.

There are 197 statutes (documents) and 50 training queries. The task is to train a TF-IDF model and generate a TREC run file for the 10 test queries.

## Authors
- Sayan Mahapatra
- Mainak Chowdhury
- Upasana Mandal
- Khyati Puhup


# Setup Environment


In [1]:
!rm -rf sample_data/
!rm -rf IRTP/
!git clone https://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@github.com/MeSayan/IRTP.git
!cd IRTP/
!chmod a+x IRTP/trec_eval.8.1/trec_eval.8.1/trec_eval

Cloning into 'IRTP'...
remote: Enumerating objects: 249, done.[K
remote: Counting objects: 100% (249/249), done.[K
remote: Compressing objects: 100% (238/238), done.[K
remote: Total 249 (delta 10), reused 247 (delta 8), pack-reused 0[K
Receiving objects: 100% (249/249), 547.48 KiB | 14.80 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [None]:
!echo -e " scikit-learn==1.0 \n numpy==1.19.5 \n pandas==1.1.5 \n nltk==3.4" > requirements.txt
!pip install -U -r requirements.txt

Collecting scikit-learn==1.0
  Downloading scikit_learn-1.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.1 MB)
[K     |████████████████████████████████| 23.1 MB 1.8 MB/s 
Collecting nltk==3.4
  Downloading nltk-3.4.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 35.7 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Collecting singledispatch
  Downloading singledispatch-3.7.0-py2.py3-none-any.whl (9.2 kB)
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4-py3-none-any.whl size=1436397 sha256=081c78b60ef423498f5d30963aff737efbd24a959e91bbf7bc1936a3934d4c5c
  Stored in directory: /root/.cache/pip/wheels/13/b8/81/2349be11dd144dc7b68ab983b58cd2fae353cdc50bbdeb09d0
Successfully built nltk
Installing collected packages: threadpoolctl, singledispatch, scikit-learn, nltk
  Attempting uninstall: scikit-learn
    Found exi

# Functions

- get_all_documents() // return list of documents 
- get_all_queries() // return list of queries     
- clean() // tokenization, stop word, punctuation removal      
- preprocessor() // lemmatization, stemming etc    
- generate_doc_vectors() // tf_idf vectors        
- generate_query_vector() // tf_idf vector of query 
- evaluate_docs() // compute similarity of doc vector and query vector 
- generate_trec_file() // generate TREC file for evaluation by the trec_eval tool 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import sklearn
import numpy as np
import string

import nltk
import os
import glob
import re

# Download the NLTK data used below: the 'punkt' tokenizer models and the
# 'stopwords' corpus (read via stopwords.words('english') in clean()).
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Print the library versions actually loaded, so a run can be checked against
# the versions pinned in requirements.txt.
print(sklearn.__version__)
print(np.__version__)
print(pd.__version__)
print(nltk.__version__)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


1.0
1.19.5
1.1.5
3.4


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def get_all_documents(path="IRTP/Object_statutes/*.txt"):
  """Read every statute file matching ``path``.

  Files are ordered by the number embedded in their file name, so ``S2`` comes
  before ``S10``. Files whose name contains no digit are placed last (ordered
  by name) instead of raising ``ValueError``.

  Args:
    path: glob pattern selecting the statute files. Defaults to the statutes
      folder of the cloned IRTP repository.

  Returns:
    A tuple ``(doc_head, doc_cont)`` of two parallel lists: the file names
    without directory and extension, and the full text of each file.
  """
  def numeric_key(file_path):
    # Only look at the file name: digits in parent directories must not
    # influence the ordering.
    name = os.path.basename(file_path)
    digits = re.sub(r'\D', '', name)
    return (int(digits) if digits else float('inf'), name)

  doc_head = []
  doc_cont = []
  for file_path in sorted(glob.glob(path), key=numeric_key):
    # The context manager closes the handle even if reading fails.
    with open(file_path, "r") as f:
      doc_cont.append(f.read())
    # Document id = file name without directory and extension.
    doc_head.append(os.path.splitext(os.path.basename(file_path))[0])
  return doc_head, doc_cont

In [None]:
def get_all_queries(pathx):
  """Read a query file with one ``<query id>||<query text>`` entry per line.

  Only the first ``||`` on a line separates id from text, so a query text that
  itself contains ``||`` is kept whole. Lines without the separator (e.g. blank
  lines) are skipped instead of raising ``IndexError``.

  Args:
    pathx: path of the query file.

  Returns:
    A tuple ``(quer_vec_head, quer_vec_cont)`` of two parallel lists: the query
    ids (e.g. AILA_Q1) and the query texts. The text keeps the line's trailing
    newline, exactly as read from the file.
  """
  quer_vec_head = []
  quer_vec_cont = []
  # The context manager guarantees the file is closed.
  with open(pathx, "r") as fx:
    for line in fx:
      head, separator, content = line.partition("||")
      if not separator:
        continue  # blank or malformed line: nothing to record
      quer_vec_head.append(head)
      quer_vec_cont.append(content)
  return quer_vec_head, quer_vec_cont


In [None]:
def _split_token(word):
  """Apply the custom splitting/trimming rules to one lower-cased token.

  The first matching rule wins; the result is the list of pieces that replace
  the token (pieces may be empty strings, which the caller filters out).

    1. X w.e.f. Y   -> [X, Y]   (split once on 'w.e.f.')
    2. X w.r.e.f. Y -> [X, Y]   (split once on 'w.r.e.f.')
    3. X.-Y         -> [X, Y]
    4. X.—Y         -> [X, Y]
    5. X- / X—      -> X        (unless the token ends in '/-' or '/—')
    6. -X / —X      -> X
    7. .X           -> X
    8. X.           -> X
    9. 'X' / 'X / X' -> X
    10. X-Y         -> [X, Y]
  """
  if 'w.e.f.' in word:
    return word.split('w.e.f.', 1)
  if 'w.r.e.f.' in word:
    # Split on the full marker (including the final dot) so the second piece
    # does not start with a stray '.'.
    return word.split('w.r.e.f.', 1)
  if '.-' in word:
    return word.split('.-')
  if '.—' in word:
    return word.split('.—')
  if (word.endswith('-') and not word.endswith('/-')) or (word.endswith('—') and not word.endswith('/—')):
    return [word[:-1]]
  if word.startswith('-') or word.startswith('—'):
    return [word[1:]]
  if word.startswith("."):
    return [word[1:]]
  if word.endswith("."):
    return [word[:-1]]
  if word.startswith("'") and word.endswith("'"):
    return [word[1:-1]]
  if word.startswith("'"):
    return [word[1:]]
  if word.endswith("'"):
    return [word[:-1]]
  if '-' in word:
    return word.split('-')
  return [word]


def clean(items):
  """Tokenize each string, then drop punctuation, stop words and noise tokens.

  Args:
    items: iterable of raw text strings (documents or queries).

  Returns:
    A list with one entry per input string; each entry is the list of kept
    tokens, in their original order.
  """
  # These are identical for every item, so build them once.
  stop_words = set(stopwords.words('english'))
  punctuation_symbols = string.punctuation + '‘’“”—``'
  token_pattern = re.compile(r"^[']?[a-z]*[-]{0,1}[a-z]*$")

  cleaned_docs = []
  for item in items:
    pieces = []
    for sentence in sent_tokenize(item):
      for word in word_tokenize(sentence):
        pieces.extend(_split_token(word.lower()))

    kept = [
        word for word in pieces
        # Substring test against the punctuation string: it also rejects the
        # empty string and runs of adjacent punctuation characters.
        if word not in punctuation_symbols
        and word not in stop_words
        # Only alphabetic tokens, optionally with a leading ' and one hyphen.
        and token_pattern.match(word)
        # 'title' and 'desc' are removed explicitly.
        and word not in ['title', 'desc']
        # Keep words of four or more letters only. NOTE: this also drops
        # 3-letter words; the threshold is left unchanged to preserve results.
        and len(word) > 3
    ]
    cleaned_docs.append(kept)

  return cleaned_docs


In [None]:
def preprocessor(items):
  """Run the text pre-processing pipeline over ``items``.

  At present this delegates entirely to ``clean``, so the value returned is
  whatever ``clean`` produces for the same input.
  """
  return clean(items)


## Evaluate Trec File (For Training Data)

In [None]:
class BM25:
    """BM25 ranking over pre-tokenized documents.

    Hyper-parameters:
        k1: term-frequency saturation (default 1.5).
        b: strength of document-length normalisation (default 0.75).

    Attributes set by ``fit``:
        tf_: list of per-document ``{term: count}`` dictionaries.
        df_: ``{term: number of documents containing the term}``.
        idf_: ``{term: log(1 + (N - df + 0.5) / (df + 0.5))}``.
        D_: list of document lengths (number of tokens).
        statutes_: the documents passed to ``fit``.
        N_: number of documents.
        davg_: average document length (0.0 for an empty corpus).
    """

    def __init__(self, k1 = 1.5, b = 0.75):
        """Store the hyper-parameters ``k1`` and ``b``."""
        self.b = b
        self.k1 = k1

    def fit(self, statutes):
        """Compute the corpus statistics needed for scoring.

        Args:
            statutes: iterable of documents, each a sequence of tokens.

        Returns:
            ``self``, so calls can be chained.
        """
        tf = []  # term frequencies, one dict per document
        df = {}  # document frequency per term
        D = []   # number of tokens per document
        for document in statutes:
            D.append(len(document))
            f = {}
            for term in document:
                f[term] = f.get(term, 0) + 1
            tf.append(f)
            # Each distinct term of the document counts once towards df.
            for term in f:
                df[term] = df.get(term, 0) + 1
        N = len(tf)
        idf = {term: np.log(1 + (N - n_t + 0.5) / (n_t + 0.5)) for term, n_t in df.items()}
        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.D_ = D
        self.statutes_ = statutes
        self.N_ = N
        # Guard the empty corpus: the average length is undefined, use 0.0
        # instead of raising ZeroDivisionError.
        self.davg_ = sum(D) / N if N else 0.0
        return self

    def search(self, query):
        """Return the BM25 score of ``query`` against every fitted document.

        Args:
            query: sequence of tokens; a repeated token contributes once per
                occurrence.

        Returns:
            A list of scores, index-aligned with the documents given to ``fit``.
        """
        return [self._score(query, index) for index in range(self.N_)]

    def _score(self, query, index):
        """Score ``query`` against the document at position ``index``."""
        f = self.tf_[index]
        if not f:
            # An empty document matches nothing; returning early also avoids
            # dividing by davg_ == 0 when every document is empty.
            return 0.0
        # The length normalisation depends on the document only, not the term.
        length_norm = self.k1 * (1 - self.b + self.b * self.D_[index] / self.davg_)
        score = 0.0
        for term in query:
            if term not in f:
                continue
            i = f[term]
            upper = self.idf_[term] * i * (self.k1 + 1)
            lower = i + length_norm
            score += (upper / lower)
        return score

In [None]:
import numpy as np
from sklearn import preprocessing
def generate_trec_file_sayan(filename, query_head, queries, docs_head, docs):
  """Rank all documents for every query with BM25 and write a TREC run file.

  For each query the BM25 scores are divided by their Euclidean norm, the
  documents are sorted by descending score, and one line is written per
  document with a positive score:

    <query id> Q0 <doc id> <rank> <score> LEG_STAT_TRIER R4

  A query that matches no document at all writes no lines.

  Args:
    filename: path of the run file to (over)write.
    query_head: query ids, parallel to ``queries``.
    queries: tokenized queries (lists of terms).
    docs_head: document ids, parallel to ``docs``.
    docs: tokenized documents (lists of terms).
  """
  bm25 = BM25()
  bm25.fit(docs)
  with open(filename, "w") as f:
    for i, q in zip(query_head, queries):
      scores = bm25.search(q)
      norm = sum(s * s for s in scores) ** 0.5
      if norm == 0:
        # No document shares a term with this query; normalising would
        # divide by zero and there is nothing to rank anyway.
        continue
      ranked = sorted(
          ((s / norm, doc_id) for doc_id, s in enumerate(scores)),
          key=lambda x: x[0],
          reverse=True,
      )
      rnk = 1
      for s, doc_id in ranked:
        if s > 0:
          # Use the docs_head argument, not a notebook-level global, so the
          # ids always belong to the documents that were actually ranked.
          print(f"{i} Q0 {docs_head[doc_id]} {rnk} {s} LEG_STAT_TRIER R4", file=f)
          rnk += 1

In [None]:
# Training run: load the statutes and the training queries, pass both through
# the same preprocessor, then write the BM25 ranking of the statutes for every
# training query to trec_output_file_train_data.txt.
doc_head, docs = get_all_documents()
query_head, queries = get_all_queries("IRTP/Query_doc_train.txt")
# From here on docs/queries hold token lists instead of raw strings.
docs = preprocessor(docs)
queries = preprocessor(queries)
generate_trec_file_sayan("trec_output_file_train_data.txt", query_head, queries, doc_head, docs)

In [None]:
# Make the bundled trec_eval binary executable, then evaluate the training run
# file against the training relevance judgements.
!chmod a+x IRTP/trec_eval.8.1/trec_eval.8.1/trec_eval
!IRTP/trec_eval.8.1/trec_eval.8.1/trec_eval  IRTP/relevance_judgements_train.txt ./trec_output_file_train_data.txt

num_q          	all	50
num_ret        	all	9153
num_rel        	all	221
num_rel_ret    	all	206
map            	all	0.0908
gm_ap          	all	0.0588
R-prec         	all	0.0707
bpref          	all	0.0470
recip_rank     	all	0.1944
ircl_prn.0.00  	all	0.2096
ircl_prn.0.10  	all	0.2096
ircl_prn.0.20  	all	0.2096
ircl_prn.0.30  	all	0.1135
ircl_prn.0.40  	all	0.1035
ircl_prn.0.50  	all	0.0840
ircl_prn.0.60  	all	0.0603
ircl_prn.0.70  	all	0.0566
ircl_prn.0.80  	all	0.0331
ircl_prn.0.90  	all	0.0280
ircl_prn.1.00  	all	0.0280
P5             	all	0.0760
P10            	all	0.0620
P15            	all	0.0493
P20            	all	0.0450
P30            	all	0.0420
P100           	all	0.0250
P200           	all	0.0206
P500           	all	0.0082
P1000          	all	0.0041


# Generate Trec Test File

In [None]:
# Test run: same pipeline as for the training queries, but reading the test
# queries and writing the ranking to trec_output_file_test_data.txt.
doc_head_te, docs_te = get_all_documents()
query_head_te, queries_te = get_all_queries("IRTP/Query_doc_test.txt")
# From here on docs_te/queries_te hold token lists instead of raw strings.
docs_te = preprocessor(docs_te)
queries_te = preprocessor(queries_te)
generate_trec_file_sayan("trec_output_file_test_data.txt", query_head_te, queries_te, doc_head_te, docs_te)

# References

- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
- https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
- http://www.rafaelglater.com/en/post/learn-how-to-use-trec_eval-to-evaluate-your-information-retrieval-system
- https://radimrehurek.com/gensim/models/tfidfmodel.html


