## Import all packages that will be used

In [3]:
# Numpy for data management
import numpy as np

# Pandas also for data management
import pandas as pd

# Matplotlib for additional customization
#import pyplot as plt
#%matplotlib inline

import matplotlib.pyplot as plt

#For uploading our dataset
import xml.etree.ElementTree as ET #Parse and read XML data
import tarfile #read from tarfile instead of extracting all data

import xml.etree.ElementTree as ET

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

import trec
import pprint as pp
import pickle


## Understanding LMJM algorithm (with example datasets)

For references to the examples please see: https://nlp.stanford.edu/IR-book/pdf/12lmodel.pdf and https://www.elastic.co/blog/language-models-in-elasticsearch 

## This is the formula for Jelinek Mercer smoothing

$$P(q | d) =\prod_{t \in q} ((1 - \lambda) \frac{tf_{t,d}}{L_d} + \lambda \frac{tf_t}{L_c})$$

where $\lambda \in [0, 1]$ is the smoothing parameter, $tf_{t,d}$ is the frequency of term $t$ in document $d$, $tf_t$ is the frequency of term $t$ in the whole corpus, $L_d$ is the number of tokens in document $d$, and $L_c$ is the total number of tokens in all documents of the corpus.

## This is the formula for Jelinek-Mercer smoothing in log form, as used by the Elasticsearch scoring algorithm

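$$ \text{score}(q, d) = \sum_{t \in q} \log\left(1 + \frac{(1- \lambda) M_d}{\lambda M_c}\right) = \sum_{t \in q} \log\left(1 + \frac{(1- \lambda) \frac{tf_{t,d}}{L_d}}{\lambda \frac{tf_t + 1}{L_c + 1}}\right) $$

Here $M_d = \frac{tf_{t,d}}{L_d}$ and $M_c = \frac{tf_t + 1}{L_c + 1}$ are the document-model and collection-model probabilities of term $t$. The result is a ranking score rather than a probability.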

In [5]:
# Worked example: rank three toy documents against a query with the
# Jelinek-Mercer smoothed query-likelihood model (LMJM).
corpus_4 = [
    'This is the desert There are no people in the desert The Earth is large',
    'Where are the people? resumed the little prince at last Its a little lonely in the desert It is lonely when youre among people too said the snake',
    'What makes the desert beautiful said the little prince is that somewhere it hides a well',
]

# Unigram bag-of-words index over the corpus, no stop-word removal.
index_test = CountVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
index_test.fit(corpus_4)

# Document-term count matrix: one row per document, one column per vocabulary term.
corpus_cv = index_test.transform(corpus_4)
corpus_array = corpus_cv.toarray()

df_corpus = pd.DataFrame(data=corpus_array, columns=index_test.get_feature_names_out())

# Smoothing parameter lambda: weight given to the corpus model in the mixture.
l = 0.1

# Compute the query representation (term counts over the corpus vocabulary).
query = ['desert people']
query_cv = index_test.transform(query)
qq = query_cv.toarray()[0]

# Number of documents is the number of rows in the count matrix. Counting
# distinct strings instead would undercount when the corpus has duplicates.
A = corpus_array.shape[0]
aa = np.tile(qq, [A, 1])

# tf_{t,d} / L_d, laid out as (terms, documents). The max() guard keeps a
# document with no indexed tokens at probability 0 instead of dividing by zero.
doc_lengths = np.sum(corpus_array, axis=1)
prob_word_docs = corpus_array.T / np.maximum(doc_lengths, 1)

# tf_t / L_c for every vocabulary term.
prob_word_corpus = np.sum(corpus_array, axis=0) / np.sum(corpus_array)

# Mix the two models first, then raise to the query term count, so that a term
# repeated c times in the query contributes ((1-l)*P(t|d) + l*P(t|C))**c.
# Terms absent from the query have exponent 0 and contribute a factor of 1.
prob_word_docs_query = (1 - l) * prob_word_docs.T
prob_word_corpus_query = l * prob_word_corpus
docs_scores = (prob_word_docs_query + prob_word_corpus_query) ** aa

# Query likelihood per document: product over the vocabulary axis.
final = np.prod(docs_scores, axis=1)

print(final)

#print(np.sum(corpus_array,axis=1))

[0.00413321 0.0036624  0.00330748]


In [5]:
# Worked example: rank two toy documents against a query with the
# Jelinek-Mercer smoothed query-likelihood model (LMJM).
corpus_3 = [
    'Xyzzy this reports profit but revenue is down',
    'Quorus narrows quarter loss but revenue increases further',
]

# Unigram bag-of-words index over the corpus, no stop-word removal.
index_test = CountVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
index_test.fit(corpus_3)

# Document-term count matrix: one row per document, one column per vocabulary term.
corpus_cv = index_test.transform(corpus_3)
corpus_array = corpus_cv.toarray()

df_corpus = pd.DataFrame(data=corpus_array)

# Smoothing parameter lambda: weight given to the corpus model in the mixture.
l = 0.5

# Compute the query representation (term counts over the corpus vocabulary).
query = ['revenue down',]
query_cv = index_test.transform(query)
qq = query_cv.toarray()[0]

# Number of documents is the number of rows in the count matrix. Counting
# distinct strings instead would undercount when the corpus has duplicates.
A = corpus_array.shape[0]
aa = np.tile(qq, [A, 1])

# tf_{t,d} / L_d, laid out as (terms, documents). The max() guard keeps a
# document with no indexed tokens at probability 0 instead of dividing by zero.
doc_lengths = np.sum(corpus_array, axis=1)
prob_word_docs = corpus_array.T / np.maximum(doc_lengths, 1)

# tf_t / L_c for every vocabulary term.
prob_word_corpus = np.sum(corpus_array, axis=0) / np.sum(corpus_array)

# Mix the two models first, then raise to the query term count, so that a term
# repeated c times in the query contributes ((1-l)*P(t|d) + l*P(t|C))**c.
# Terms absent from the query have exponent 0 and contribute a factor of 1.
prob_word_docs_query = (1 - l) * prob_word_docs.T
prob_word_corpus_query = l * prob_word_corpus
docs_scores = (prob_word_docs_query + prob_word_corpus_query) ** aa

# Query likelihood per document: product over the vocabulary axis.
final = np.prod(docs_scores, axis=1)

print(final)

[0.01171875 0.00390625]
