# 3rd part: Image and texts, with EasyCLEF

In [1]:
import os
import ParserEasyClef
import ParserCACM
import modeles
import numpy as np
import query
import evaluation
import clusterings
# Auto reload the imported modules when running cells
%load_ext autoreload 
%autoreload 2

# Constants:

srcFolder = "easyCLEF08/"

# File that contains all documents
srcFile = "easyCLEF08_text.txt"
# File that contains all relevances for queries
relevantsFile = "easyCLEF08_gt.txt"
# File that contains all queries
qryFile = "easyCLEF08_query.txt"

easyClef_txt = os.path.join(srcFolder, srcFile)
easyClef_qry = os.path.join(srcFolder, qryFile)
easyClef_rel = os.path.join(srcFolder, relevantsFile)

In [2]:
import indexation 
import TextRepresenter

parser = ParserEasyClef.ParserEasyClef()
stemmer =  TextRepresenter.PorterStemmer()
idx = indexation.InMemoryIndex(easyClef_txt, 
                               parser, 
                               stemmer)
print("All doc ids:")
all_docs_id = idx.getDocsID()
print(all_docs_id)

All doc ids:
['40261', '17198', '6294', '31512', '13039', '15907', '20356', '31398', '16425', '10344', '6912', '40186', '37533', '31219', '6946', '11210', '8238', '40357', '13135', '3741', '35683', '38928', '1969', '24741', '39687', '40174', '7383', '38914', '8486', '3362', '1583', '34133', '13673', '38247', '40290', '35817', '16904', '15919', '1531', '40661', '12707', '27050', '21256', '31624', '32695', '39040', '12189', '37895', '39039', '19287', '10551', '38144', '5176', '24904', '22199', '10671', '31897', '22734', '3513', '7178', '5087', '14160', '4944', '6379', '39695', '23166', '39052', '2592', '12495', '30353', '27673', '9956', '37226', '8426', '40217', '27435', '3895', '15099', '20484', '2197', '10636', '10733', '19384', '6231', '11258', '2295', '38929', '11629', '39094', '35706', '39186', '37339', '7577', '8323', '39238', '35981', '32767', '31514', '31191', '37416', '9281', '11297', '37393', '31643', '9544', '8697', '37158', '22699', '37772', '38133', '40291', '6500', '11408',

## Test baseline for some queries

In [3]:
modele_vect = modeles.Vectoriel(idx, modeles.TfidfWeighter(idx))
print("\n###### Testing QueryParserCACM: ###### ")
qp = query.QueryParserCACM(easyClef_qry, easyClef_rel)
qry = qp.nextQuery()
wantedId = [5, 3]
print("Searching for query", wantedId)
while qry is not None:
    if int(qry.getID()) in wantedId:
        print("Query")
        print(qry)
        qry_repr = stemmer.getTextRepresentation(qry.getText())
        dict_scores = modele_vect.getScores(qry_repr)
        list_scores = [(docId, score) for docId, score in dict_scores.items()]
        irlist = evaluation.IRList(qry, list_scores)
        eval_precAtN = evaluation.PrecisionNDocuments(irlist)
        eval_CRAtN = evaluation.ClusterRecallNDocuments(irlist)
        n = 20
        print("Precision at %d docs:" % n)
        print(eval_precAtN.eval(n, verbose=True))
        print("Cluster Recall at %d docs:" % n)
        print(eval_CRAtN.eval(n, verbose=True))

        print(20 * '-')
    qry = qp.nextQuery()
    
print("Done.")



###### Testing QueryParserCACM: ###### 
Searching for query [5, 3]
Query
Query {id=3, txt='religious statue in the foreground. Relevant images will show a statue of one (or more) religious figures such as gods, angels, prophets etc. from any kind of religion in the foreground. Non-religious statues like war memorials or monuments are not relevant. Images with statues that are not the focus of the image (like the front view of church with many small statues) are not relevant. The statues of Easter Island are not relevant as they do not have any religious background.
', relevances=[28, 3147, 4531, 4533, 4759, 4815, 4895, 4896, 4900, 6501, 10657, 10999, 14520, 15330, 15973, 16819, 16993, 19135, 19194, 19198, 19211, 19259, 20356, 20357, 22777, 23108, 25966, 30011, 35678, 35780, 35782, 35783, 36035, 36036]}
Precision at 20 docs:
Result:  3407
Result:  3414
Result:  4815
Relevant, found docs = 1
Result:  38217
Result:  38216
Result:  38218
Result:  38222
Result:  38221
Result:  38219
Result

## Test of clustering

In [12]:
kmeans = clusterings.KMeansClustering()
prCluster = modeles.PRClustering(idx, modele_vect, kmeans, nDocs=100)

qp = query.QueryParserCACM(easyClef_qry, easyClef_rel)
qry = qp.nextQuery()
wantedId = [5, 3]
print("Searching for query", wantedId)
while qry is not None:
    if int(qry.getID()) in wantedId:
        print("Query")
        print(qry)
        qry_repr = stemmer.getTextRepresentation(qry.getText())
        ranking = prCluster.getRanking(qry_repr, Nclusters=1)
        irlist = evaluation.IRList(qry, scores=None, ranking=ranking)
        eval_precAtN = evaluation.PrecisionNDocuments(irlist)
        eval_CRAtN = evaluation.ClusterRecallNDocuments(irlist)
        n = 20
        print("Precision at %d docs:" % n)
        print(eval_precAtN.eval(n, verbose=True))
        print("Cluster Recall at %d docs:" % n)
        print(eval_CRAtN.eval(n, verbose=True))

        print(20 * '-')
    qry = qp.nextQuery()
    
print("Done.")


Searching for query [5, 3]
Query
Query {id=3, txt='religious statue in the foreground. Relevant images will show a statue of one (or more) religious figures such as gods, angels, prophets etc. from any kind of religion in the foreground. Non-religious statues like war memorials or monuments are not relevant. Images with statues that are not the focus of the image (like the front view of church with many small statues) are not relevant. The statues of Easter Island are not relevant as they do not have any religious background.
', relevances=[28, 3147, 4531, 4533, 4759, 4815, 4895, 4896, 4900, 6501, 10657, 10999, 14520, 15330, 15973, 16819, 16993, 19135, 19194, 19198, 19211, 19259, 20356, 20357, 22777, 23108, 25966, 30011, 35678, 35780, 35782, 35783, 36035, 36036]}

Clustering: [['3407', '3414', '4815', '38217', '38216', '38218', '38222', '38221', '38219', '35780', '4759', '6246', '21221', '4927', '5109', '38265', '35884', '35885', '25966', '30334', '39169', '38898', '4723', '17947', '15