# Introduction

This notebook contains an implementation of Word2Vec-based retrieval for the LEGSTAT IR Term Project.

There are 197 statutes (documents) and 50 train queries. The task is to generate a TREC file for the 10 test queries.

## Authors
- Sayan Mahapatra
- Mainak Chowdhury
- Upasana Mandal
- Khyati Puhup


# Setup Environment


In [2]:
!rm -rf sample_data/
!rm -rf IRTP/
!git clone https://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@github.com/MeSayan/IRTP.git
!cd IRTP/
!chmod a+x IRTP/trec_eval.8.1/trec_eval.8.1/trec_eval

Cloning into 'IRTP'...
remote: Enumerating objects: 249, done.[K
remote: Counting objects: 100% (249/249), done.[K
remote: Compressing objects: 100% (238/238), done.[K
remote: Total 249 (delta 10), reused 247 (delta 8), pack-reused 0[K
Receiving objects: 100% (249/249), 547.48 KiB | 5.07 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [None]:
!echo -e " scikit-learn==1.0 \n numpy==1.19.5 \n pandas==1.1.5 \n nltk==3.4 \n gensim" > requirements.txt
!pip install -U -r requirements.txt

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 2.8 kB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


# Functions

- get_all_documents() // return list of documents
- get_all_queries() // return list of queries
- clean() // tokenization, stop word, punctuation removal
- preprocessor() // lemmatization, stemming etc
- generate_vectors() // return vectors (embeddings) for query / docs
- evaluate_docs() // compute similarity of doc vector and query vector
- generate_test_trec_file() // generate test trec file
- generate_train_trec_file() // generate train trec file for evaluation by trec tool

In [None]:
import pandas as pd
import sklearn
import numpy as np
import string
import pprint

pp = pprint.PrettyPrinter()

import torch
import logging

import matplotlib.pyplot as plt
% matplotlib inline

import nltk
import os
import glob
import re

nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from sklearn.preprocessing import normalize

print(sklearn.__version__)
print(np.__version__)
print(pd.__version__)
print(nltk.__version__)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


1.0
1.19.5
1.1.5
3.4


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def get_all_documents(path="IRTP/Object_statutes/*.txt"):
  """Read every statute file matched by a glob pattern.

  Files are ordered by the integer formed from the digits of their file name,
  so that e.g. a file numbered 2 comes before a file numbered 10.

  Args:
    path: glob pattern selecting the document files. Defaults to the
      statute directory of the cloned IRTP repository.

  Returns:
    A pair (doc_head, doc_cont) of parallel lists: the file names without
    their extension, and the full text of each file.

  Raises:
    ValueError: if a matched file name contains no digits.
  """
  def numeric_key(file_path):
    # Only the file name is inspected, so digits in directory names cannot
    # disturb the ordering.
    return int(re.sub(r'\D', '', os.path.basename(file_path)))

  doc_head = []
  doc_cont = []
  for file_path in sorted(glob.glob(path), key=numeric_key):
    # The context manager closes each file as soon as it has been read.
    with open(file_path, "r") as f:
      doc_cont.append(f.read())
    doc_head.append(os.path.splitext(os.path.basename(file_path))[0])
  return doc_head, doc_cont

In [None]:
def get_all_queries(pathx):
  """Read a query file with one `<query id>||<query text>` record per line.

  Args:
    pathx: path of the query file.

  Returns:
    A pair (quer_vec_head, quer_vec_cont) of parallel lists: the query ids
    (e.g. AILA_Q1, AILA_Q2, ...) and the query texts. Each text keeps its
    trailing newline, exactly as read from the file.

  Raises:
    IndexError: if a non-blank line does not contain the `||` separator.
  """
  quer_vec_head = []
  quer_vec_cont = []
  with open(pathx, "r") as fx:
    for line in fx:
      if not line.strip():
        continue  # tolerate blank lines, e.g. at the end of the file
      # Split on the first separator only, so a `||` inside the query text
      # does not truncate it.
      parts = line.split("||", 1)
      quer_vec_head.append(parts[0])
      quer_vec_cont.append(parts[1])
  return quer_vec_head, quer_vec_cont


In [None]:
def _split_token(word):
  """Apply the first matching custom rule to one lowercase token.

  Rules, in priority order (only the first match is applied):
    1. w.e.f.<Date>   -> [text before, <Date>]
    2. w.r.e.f.<Date> -> [text before, <Date>]
    3. X.-Y           -> [X, Y]
    4. X.—Y           -> [X, Y]
    5. X- / X—        -> X   (unless the token ends with '/-' or '/—')
    6. -X / —X        -> X
    7. .X             -> X
    8. X.             -> X
    9. 'X', 'X or X'  -> X
    10. X-Y           -> [X, Y]

  Returns:
    The resulting pieces as a list; pieces may be empty strings.
  """
  if 'w.e.f.' in word:
    return word.split('w.e.f.', 1)
  if 'w.r.e.f.' in word:
    # Split on the full marker, trailing dot included, so the remainder does
    # not start with a stray '.'.
    return word.split('w.r.e.f.', 1)
  if '.-' in word:
    return word.split('.-')
  if '.—' in word:
    return word.split('.—')
  if (word.endswith('-') and not word.endswith('/-')) or (word.endswith('—') and not word.endswith('/—')):
    return [word[:-1]]
  if word.startswith('-') or word.startswith('—'):
    return [word[1:]]
  if word.startswith("."):
    return [word[1:]]
  if word.endswith("."):
    return [word[:-1]]
  if word.startswith("'") and word.endswith("'"):
    return [word[1:-1]]
  if word.startswith("'"):
    return [word[1:]]
  if word.endswith("'"):
    return [word[:-1]]
  if '-' in word:
    return word.split('-')
  return [word]


def clean(items, min_word_length=4):
  """Tokenize each text, then remove punctuation, stop words and short tokens.

  Args:
    items: iterable of raw strings (documents or queries).
    min_word_length: minimum number of characters a token must have to be
      kept. The default of 4 drops every token of three characters or fewer.

  Returns:
    A list with one list of cleaned, lowercase tokens per input string.
  """
  stop_words = set(stopwords.words('english'))
  punctuation_chars = set(string.punctuation + '‘’“”—``')
  # Optional leading apostrophe, then lowercase letters with at most one hyphen.
  token_pattern = re.compile(r"^[']?[a-z]*[-]{0,1}[a-z]*$")
  excluded_words = {'title', 'desc'}

  cleaned_docs = []
  for item in items:
    lowercase_words = [
        word.lower()
        for sentence in sent_tokenize(item)
        for word in word_tokenize(sentence)
    ]
    pieces = [piece for word in lowercase_words for piece in _split_token(word)]
    cleaned_docs.append([
        piece for piece in pieces
        # Drops empty pieces and pieces made only of punctuation symbols.
        if not all(ch in punctuation_chars for ch in piece)
        and piece not in stop_words
        and token_pattern.match(piece)
        and piece not in excluded_words
        and len(piece) >= min_word_length
    ])
  return cleaned_docs


In [None]:
def preprocessor(items):
  """Run the preprocessing pipeline over a collection of raw texts.

  The pipeline currently consists of the single clean() step, so the result
  is whatever clean() returns for `items`.
  """
  return clean(items)


# Train Model

In [None]:
# NOTE(review): common_texts is imported but not used in this cell.
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, KeyedVectors
# Train the Word2Vec model on the statute corpus.
doc_head, docs = get_all_documents()
query_head, queries = get_all_queries("IRTP/Query_doc_train.txt")
# The train queries are preprocessed here but are not added to train_data.
queries = preprocessor(queries)
docs = preprocessor(docs)
# Embedding size; generate_vectors() uses the same value as its default n_dim.
n_dim = 1000
# Each preprocessed statute (a list of tokens) is one training "sentence".
train_data = docs

# NOTE(review): no `seed` is passed and training uses 4 worker threads, so the
# embeddings (and the metrics computed from them) presumably differ from run
# to run - confirm whether reproducibility is required.
model = Word2Vec(sentences=train_data, vector_size=n_dim, window=300, min_count=1, workers=4)

In [None]:
from sklearn.preprocessing import normalize
def generate_vectors(word_vectors, items, n_dim=1000):
  """Embed each tokenized text as the L2-normalised mean of its word vectors.

  Args:
    word_vectors: mapping from word to vector that supports `word in
      word_vectors` and `word_vectors[word]`.
    items: sequence of token lists, one per text.
    n_dim: dimensionality of the word vectors.

  Returns:
    A float64 array of shape (len(items), n_dim). A text that is empty, or
    has no token known to `word_vectors`, yields an all-zero row.
  """
  D = np.zeros((len(items), n_dim), dtype=np.float64)
  for row, item in enumerate(items):
    if not item:
      continue  # avoid 0/0 -> NaN for texts that were cleaned down to nothing
    sum_vec = np.zeros(n_dim)
    for word in item:
      if word in word_vectors:
        sum_vec += word_vectors[word]
    D[row] = sum_vec / len(item)
  # Row-wise L2 normalisation; zero rows are left untouched instead of being
  # divided by zero.
  norms = np.linalg.norm(D, axis=1, keepdims=True)
  norms[norms == 0] = 1.0
  return D / norms


In [None]:
def generate_test_trec_file(D, Q, C, queries, file_name, threshold=0, doc_ids=None):
  """Write the ranked retrieval results for the test queries to a TREC file.

  For every query, documents are listed by decreasing similarity, and only
  documents whose similarity is strictly greater than `threshold` are written.
  Query ids are generated as AILA_TQ1, AILA_TQ2, ... in input order.

  Args:
    D: document embeddings; accepted for interface compatibility, not read.
    Q: query embeddings; accepted for interface compatibility, not read.
    C: similarity matrix, indexed as C[query][document].
    queries: the queries; only their number is used.
    file_name: path of the output file (overwritten if it exists).
    threshold: minimum similarity (exclusive) for a document to be written.
    doc_ids: document names, indexed like the columns of C. When omitted,
      the notebook-level `doc_head` list is used.
  """
  if doc_ids is None:
    doc_ids = doc_head  # backward-compatible fallback to the notebook global
  with open(file_name, "w") as f:
    for q in range(len(queries)):
      scores = C[q]
      ranked = np.argsort(scores)[::-1]  # document indices, best first
      rank = 1
      for d in ranked:
        if scores[d] > threshold:
          print(f"AILA_TQ{q+1} Q0 {doc_ids[d]} {rank} {scores[d]} LEG_STAT_TRIER R6", file=f)
          rank += 1

In [None]:
def generate_train_trec_file(D, Q, C, queries, file_name, threshold=0, doc_ids=None):
  """Write the ranked retrieval results for the train queries to a TREC file.

  For every query, documents are listed by decreasing similarity, and only
  documents whose similarity is strictly greater than `threshold` are written.
  Query ids are generated as AILA_Q1, AILA_Q2, ... in input order.

  Args:
    D: document embeddings; accepted for interface compatibility, not read.
    Q: query embeddings; accepted for interface compatibility, not read.
    C: similarity matrix, indexed as C[query][document].
    queries: the queries; only their number is used.
    file_name: path of the output file (overwritten if it exists).
    threshold: minimum similarity (exclusive) for a document to be written.
    doc_ids: document names, indexed like the columns of C. When omitted,
      the notebook-level `doc_head` list is used.
  """
  if doc_ids is None:
    doc_ids = doc_head  # backward-compatible fallback to the notebook global
  with open(file_name, "w") as f:
    for q in range(len(queries)):
      scores = C[q]
      ranked = np.argsort(scores)[::-1]  # document indices, best first
      rank = 1
      for d in ranked:
        if scores[d] > threshold:
          print(f"AILA_Q{q+1} Q0 {doc_ids[d]} {rank} {scores[d]} LEG_STAT_TRIER R6", file=f)
          rank += 1

# Generate Trec & Evaluate Trec File (Training)

In [None]:
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
# Load back with memory-mapping = read-only, shared across processes.
word_vectors = KeyedVectors.load("word2vec.wordvectors", mmap='r')


# Re-read and re-preprocess the statutes (same calls as in the training cell).
doc_head, docs = get_all_documents()
docs = preprocessor(docs)

# Train queries: these have relevance judgements, so the run can be scored.
query_head, queries = get_all_queries("IRTP/Query_doc_train.txt")
queries = preprocessor(queries)

print("Embedding documents")
D_tr = generate_vectors(word_vectors, docs)

print("Embedding Querries")
Q_tr = generate_vectors(word_vectors, queries)

# generate_vectors() L2-normalises every row, so C_tr[q][d] is the cosine
# similarity between query q and document d.
C_tr = Q_tr.dot(D_tr.T) # Q * D^T

print("Generating Trec File (Train)")
generate_train_trec_file(D_tr, Q_tr, C_tr, queries, "trec_output_file_train_data.txt")

print("Evaluating Trec File")
#Evaluate
# Score the generated run file against the train relevance judgements.
!IRTP/trec_eval.8.1/trec_eval.8.1/trec_eval  IRTP/relevance_judgements_train.txt ./trec_output_file_train_data.txt

Embedding documents
Embedding Querries
Generating Trec File (Train)
Evaluating Trec File
num_q          	all	50
num_ret        	all	9850
num_rel        	all	221
num_rel_ret    	all	217
map            	all	0.0944
gm_ap          	all	0.0536
R-prec         	all	0.0660
bpref          	all	0.0542
recip_rank     	all	0.2091
ircl_prn.0.00  	all	0.2242
ircl_prn.0.10  	all	0.2242
ircl_prn.0.20  	all	0.2242
ircl_prn.0.30  	all	0.1032
ircl_prn.0.40  	all	0.0997
ircl_prn.0.50  	all	0.0923
ircl_prn.0.60  	all	0.0636
ircl_prn.0.70  	all	0.0498
ircl_prn.0.80  	all	0.0467
ircl_prn.0.90  	all	0.0366
ircl_prn.1.00  	all	0.0366
P5             	all	0.0640
P10            	all	0.0560
P15            	all	0.0427
P20            	all	0.0340
P30            	all	0.0307
P100           	all	0.0258
P200           	all	0.0217
P500           	all	0.0087
P1000          	all	0.0043


## Generate Trec File (Test)

In [None]:
# Re-read and re-preprocess the statutes, then load the test queries.
doc_head, docs = get_all_documents()
docs = preprocessor(docs)

query_head, queries = get_all_queries("IRTP/Query_doc_test.txt")
queries = preprocessor(queries)

print("Embedding documents")
D_te = generate_vectors(word_vectors, docs)

print("Embedding Querries")
Q_te = generate_vectors(word_vectors, queries)


# Use the document matrix computed in this cell (D_te) rather than D_tr, which
# only exists if the training-evaluation cell has been run beforehand.
C_te = Q_te.dot(D_te.T) # Q * D^T


print("Generating Trec File (Test)")
generate_test_trec_file(D_te, Q_te, C_te, queries, "trec_output_file_test_data.txt")

Embedding documents
Embedding Querries
Generating Trec File (Test)


In [None]:
# Scratch cell: shows that list.extend() appends the inner lists of B to A
# (A is modified in place). Nothing here is used by the retrieval pipeline.
# NOTE(review): leftover experiment - candidate for removal.
A = [['A', 'B'], ['C', 'D']]
print(A)
B = [['E', 'F'], ['G', 'H']]
print(B)
A.extend(B)
A

[['A', 'B'], ['C', 'D']]
[['E', 'F'], ['G', 'H']]


[['A', 'B'], ['C', 'D'], ['E', 'F'], ['G', 'H']]