# 3rd part: Image and texts, with EasyCLEF

In [1]:
import os
import ParserEasyClef
import ParserCACM
import modeles
import numpy as np
import query
import evaluation
import clusterings
import matplotlib.pyplot as plt
# Auto reload the imported modules when running cells
%load_ext autoreload 
%autoreload 2

# Constants:

srcFolder = "easyCLEF08/"

# File that contains all documents
srcFile = "easyCLEF08_text.txt"
# File that contains all relevances for queries
relevantsFile = "easyCLEF08_gt.txt"
# File that contains all queries
qryFile = "easyCLEF08_query.txt"

easyClef_txt = os.path.join(srcFolder, srcFile)
easyClef_qry = os.path.join(srcFolder, qryFile)
easyClef_rel = os.path.join(srcFolder, relevantsFile)

In [2]:
import indexation 
import TextRepresenter

parser = ParserEasyClef.ParserEasyClef()
stemmer =  TextRepresenter.PorterStemmer()
idx = indexation.InMemoryIndex(easyClef_txt, 
                               parser, 
                               stemmer)
all_docs_id = idx.getDocsID()
print("All docs ID (%d):" % len(all_docs_id))
print(all_docs_id)

allQueries = {}
qp = query.QueryParserCACM(easyClef_qry, easyClef_rel)
qry = qp.nextQuery()
while qry is not None:
    allQueries[int(qry.getID())] = qry
    qry = qp.nextQuery()
print("All qry ids (%d):" % len(allQueries))
print(allQueries.keys())


All docs ID (2256):
['6294', '26942', '37420', '15379', '7125', '34130', '37731', '31116', '39051', '6946', '31536', '36036', '31625', '27680', '31337', '31846', '38084', '23650', '9331', '35871', '6923', '38152', '37359', '20124', '8876', '10171', '13126', '31112', '15767', '6261', '9413', '31857', '24320', '10977', '8939', '7986', '11491', '20293', '39260', '21324', '32152', '27624', '10363', '26443', '4302', '38086', '19113', '37204', '30459', '19324', '30560', '37752', '14520', '37177', '37291', '9003', '16574', '40108', '35808', '38161', '24054', '2595', '13290', '10882', '31841', '4709', '37224', '40299', '7910', '8574', '13673', '38935', '32767', '38246', '2814', '32646', '13856', '40328', '35733', '31879', '40106', '13367', '4945', '24055', '40515', '4849', '17620', '10651', '2720', '16709', '35621', '4569', '13238', '39023', '34133', '6821', '1532', '27643', '16091', '35646', '30608', '19101', '6979', '8410', '2590', '39651', '7947', '24972', '32766', '1969', '39536', '3741', 

## Test baseline for some queries

In [24]:
modele_vect = modeles.Vectoriel(idx, modeles.TfidfWeighter(idx))
wantedId = [5, 3]
print("Searching for query", wantedId)
for qryID in wantedId :
    qry = allQueries[qryID]
    print("Query")
    print(qry)
    qry_repr = stemmer.getTextRepresentation(qry.getText())
    dict_scores = modele_vect.getScores(qry_repr)
    list_scores = [(docId, score) for docId, score in dict_scores.items()]
    irlist = evaluation.IRList(qry, list_scores)
    eval_precAtN = evaluation.PrecisionNDocuments(irlist)
    eval_CRAtN = evaluation.ClusterRecallNDocuments(irlist)
    n = 40
    print("Precision at %d docs:" % n)
    print(eval_precAtN.eval(n, verbose=True))
    print("Cluster Recall at %d docs:" % n)
    print(eval_CRAtN.eval(n, verbose=True))

    print(20 * '-')
    
print("Done.")


Searching for query [5, 3]
Query
Query {id=5, txt='animal swimming. Relevant images will show one or more animals (fish, birds, reptiles, etc.) swimming in a body of water. Images of people swimming in water are not relevant. Images of animals that are not swimming are not not relevant.
', relevances=[1515, 2007, 2285, 2764, 2913, 3090, 3166, 3248, 3660, 4218, 4219, 4969, 4975, 4979, 4993, 5011, 5012, 6294, 6297, 6342, 6556, 8458, 9149, 10738, 12608, 13170, 13183, 15027, 15172, 15405, 15645, 16638, 16952, 19108, 26625, 30783, 30817, 30822, 30824, 30825, 30826, 30827, 30828, 30829, 30830, 30831, 30843, 30844, 30845, 30846, 30863, 30865, 30866, 30867, 30868, 30869, 31072, 37447, 37448, 37449, 37450, 38269, 38270, 38271]}
Precision at 40 docs:
Result:  3090
Relevant, found docs = 1
Result:  15405
Relevant, found docs = 2
Result:  30828
Relevant, found docs = 3
Result:  30825
Relevant, found docs = 4
Result:  38271
Relevant, found docs = 5
Result:  30827
Relevant, found docs = 6
Result:  3

## Test of clustering

In [60]:
kmeans = clusterings.KMeansClustering()
meanshift = clusterings.MeanShiftClustering()

prCluster = modeles.PRClustering(idx, modele_vect, meanshift, nDocs=200)

wantedId = [5, 3]
print("Searching for query", wantedId)
for qryID in wantedId:
    qry = allQueries[qryID]
    print("Query")
    print(qry)
    qry_repr = stemmer.getTextRepresentation(qry.getText())
    %time ranking = prCluster.getRanking(qry_repr, Nclusters=None, verbose=True)
    irlist = evaluation.IRList(qry, scores=None, ranking=ranking)
    eval_precAtN = evaluation.PrecisionNDocuments(irlist)
    eval_CRAtN = evaluation.ClusterRecallNDocuments(irlist)
    n = 20
    print("Precision at %d docs:" % n)
    print(eval_precAtN.eval(n, verbose=True))
    print("Cluster Recall at %d docs:" % n)
    print(eval_CRAtN.eval(n, verbose=True))

    print(20 * '-')
    
print("Done.")


Searching for query [5, 3]
Query
Query {id=5, txt='animal swimming. Relevant images will show one or more animals (fish, birds, reptiles, etc.) swimming in a body of water. Images of people swimming in water are not relevant. Images of animals that are not swimming are not not relevant.
', relevances=[1515, 2007, 2285, 2764, 2913, 3090, 3166, 3248, 3660, 4218, 4219, 4969, 4975, 4979, 4993, 5011, 5012, 6294, 6297, 6342, 6556, 8458, 9149, 10738, 12608, 13170, 13183, 15027, 15172, 15405, 15645, 16638, 16952, 19108, 26625, 30783, 30817, 30822, 30824, 30825, 30826, 30827, 30828, 30829, 30830, 30831, 30843, 30844, 30845, 30846, 30863, 30865, 30866, 30867, 30868, 30869, 31072, 37447, 37448, 37449, 37450, 38269, 38270, 38271]}

Base ranking: ['3090', '15405', '30828', '30825', '38271', '30827', '30869', '30866', '30824', '3525', '30822', '30865', '30817', '30830', '3166', '30831', '30826', '26625', '9331', '35644', '19114', '30845', '30863', '30829', '15645', '17836', '23052', '11330', '26709'

## Plot for one query:

In [17]:
randQueries = np.random.choice(list(allQueries.values()), size=10, replace=False)
at = 40
nClusterRange = range(1, at)

for qry in randQueries:
    print("Query: ",qry)
    qry_repr = stemmer.getTextRepresentation(qry.getText())

    # Compute the baseline scores:
    dict_scores = modele_vect.getScores(qry_repr)
    list_scores = [(docId, score) for docId, score in dict_scores.items()]
    irlist = evaluation.IRList(qry, list_scores)
    eval_precAtN = evaluation.PrecisionNDocuments(irlist)
    eval_CRAtN = evaluation.ClusterRecallNDocuments(irlist)
    baseline_prec = eval_precAtN.eval(at, verbose=False)
    baseline_CR   = eval_CRAtN.eval(at, verbose=False)
    #print("Baseline: prec=%f, CR=%f" % (baseline_prec, baseline_CR))

    # Compute the post-retrieval clustering:
    precisions = []
    CRs = []
    for nCluster in nClusterRange:

        ranking = prCluster.getRanking(qry_repr, Nclusters=nCluster)
        irlist = evaluation.IRList(qry, scores=None, ranking=ranking)
        eval_precAtN = evaluation.PrecisionNDocuments(irlist)
        eval_CRAtN = evaluation.ClusterRecallNDocuments(irlist)
        prec = eval_precAtN.eval(at)
        CR = eval_CRAtN.eval(at, verbose=False)
        precisions.append(prec)
        CRs.append(CR)
        #print("%d clusters: prec=%f, CR=%f" % (nCluster, prec, CR))

    plt.figure(figsize=(12,8))
    plt.subplot(1,2,1)
    plt.hlines(baseline_prec, xmin=0, xmax=at-1, label="Baseline")
    plt.plot(precisions, "r", label="PR-Clustering")
    plt.legend()
    plt.title("Post-Recall Clustering")
    plt.xlabel("Number of clusters")
    plt.ylabel("Precision")
    plt.ylim([-0.1, 1.1])

    plt.subplot(1,2,2)
    plt.hlines(baseline_CR, xmin=0, xmax=at-1, label="Baseline")
    plt.plot(CRs, "r", label="PR-Clustering")
    plt.legend()
    plt.title("Post-Recall Clustering")
    plt.xlabel("Number of clusters")
    plt.ylabel("Cluster-Recall")
    plt.ylim([-0.1, 1.1])

    plt.show()

print("Done.")

Query:  Query {id=52, txt='sports people with prizes. Relevant images will show sports people with medals, trophies, cups or other types of sports prizes. Images of sportsmen or women without a prize are not relevant.
', relevances=[31536, 31537, 31538, 31539, 31540, 31541, 31543, 31650, 31653, 31680, 31702, 32151, 32152, 32153, 32248, 32249, 32250, 32293, 32297, 32349, 32351, 32479, 32480, 32481, 37880, 37883, 37884, 37885, 40150]}


KeyboardInterrupt: 

# Benchmark

In [56]:
at = 40
benchmarkQueries = np.random.choice(list(allQueries.values()), size=10, replace=False)

prCluster = modeles.PRClustering(idx, baseModel=modele_vect, cluster=kmeans, nDocs=300)
baseline_prec = []
baseline_CR = []
prCluster_prec = []
prCluster_CR = []

print("Query & Base prec. & PRC prec. & Delta  & Base CR & PRC CR & Delta")
print(70*'=')

for qry in benchmarkQueries:
    qry_repr = stemmer.getTextRepresentation(qry.getText())

    # Compute the baseline scores:
    dict_scores = modele_vect.getScores(qry_repr)
    list_scores = [(docId, score) for docId, score in dict_scores.items()]
    irlist = evaluation.IRList(qry, list_scores)
    basePrec = evaluation.PrecisionNDocuments(irlist).eval(at)
    baseCR = evaluation.ClusterRecallNDocuments(irlist).eval(at)
    baseline_prec.append(basePrec)
    baseline_CR.append(baseCR)
    
    # Compute the post-retrieval clustering:
    ranking = prCluster.getRanking(qry_repr, Nclusters=None, maxClusters=20)
    irlist = evaluation.IRList(qry, scores=None, ranking=ranking)
    prclusterPrec = evaluation.PrecisionNDocuments(irlist).eval(at)
    prclusterCR = evaluation.ClusterRecallNDocuments(irlist).eval(at)
    prCluster_prec.append(prclusterPrec)
    prCluster_CR.append(prclusterCR)
    print("%5s &    %.2f   &   %.2f   & %6.2f &  %.2f  & %.2f  & %6.2f \\\\" % 
          (qry.getID(), basePrec, prclusterPrec, prclusterPrec-basePrec, 
                        baseCR, prclusterCR, prclusterCR-baseCR))

        
print(70*'=')
meanBasePrec = np.mean(baseline_prec)
meanBaseCR = np.mean(baseline_CR)
meanPrcPrec = np.mean(prCluster_prec)
meanPrcCR = np.mean(prCluster_CR)
print("Mean  &    %.2f   &   %.2f   & %6.2f &  %.2f  & %.2f  & %6.2f \\\\" % 
          (meanBasePrec, meanPrcPrec, meanPrcPrec-meanBasePrec,
           meanBaseCR, meanPrcCR, meanPrcCR-meanBaseCR))    
print("Done.")

Query & Base prec. & PRC prec. & Delta  & Base CR & PRC CR & Delta
   18 &    0.42   &   0.42   &   0.00 &  0.56  & 0.56  &   0.00 \\
   28 &    0.47   &   0.47   &   0.00 &  1.00  & 1.00  &   0.00 \\
   60 &    0.86   &   0.55   &  -0.31 &  1.00  & 1.00  &   0.00 \\
   16 &    0.62   &   0.57   &  -0.05 &  0.52  & 0.61  &   0.09 \\
   34 &    0.50   &   0.47   &  -0.03 &  0.57  & 0.57  &   0.00 \\
   39 &    0.57   &   0.42   &  -0.15 &  0.75  & 0.75  &   0.00 \\
   44 &    0.05   &   0.12   &   0.07 &  0.13  & 0.20  &   0.07 \\
   10 &    0.40   &   0.33   &  -0.08 &  0.17  & 0.13  &  -0.04 \\
   52 &    0.69   &   0.55   &  -0.14 &  0.71  & 0.71  &   0.00 \\
   13 &    0.28   &   0.38   &   0.10 &  0.44  & 0.56  &   0.11 \\
Mean  &    0.49   &   0.43   &  -0.06 &  0.59  & 0.61  &   0.02 \\
Done.


## With MeanShift

In [57]:

at = 40

prCluster = modeles.PRClustering(idx, baseModel=modele_vect, cluster=meanshift, nDocs=300)
baseline_prec = []
baseline_CR = []
prCluster_prec = []
prCluster_CR = []

print("Query & Base prec. & PRC prec. & Delta  & Base CR & PRC CR & Delta")
print(70*'=')

for qry in benchmarkQueries:
    qry_repr = stemmer.getTextRepresentation(qry.getText())

    # Compute the baseline scores:
    dict_scores = modele_vect.getScores(qry_repr)
    list_scores = [(docId, score) for docId, score in dict_scores.items()]
    irlist = evaluation.IRList(qry, list_scores)
    basePrec = evaluation.PrecisionNDocuments(irlist).eval(at)
    baseCR = evaluation.ClusterRecallNDocuments(irlist).eval(at)
    baseline_prec.append(basePrec)
    baseline_CR.append(baseCR)
    
    # Compute the post-retrieval clustering:
    ranking = prCluster.getRanking(qry_repr, Nclusters=None, maxClusters=20)
    irlist = evaluation.IRList(qry, scores=None, ranking=ranking)
    prclusterPrec = evaluation.PrecisionNDocuments(irlist).eval(at)
    prclusterCR = evaluation.ClusterRecallNDocuments(irlist).eval(at)
    prCluster_prec.append(prclusterPrec)
    prCluster_CR.append(prclusterCR)
    print("%5s &    %.2f   &   %.2f   & %6.2f &  %.2f  & %.2f  & %6.2f \\\\" % 
          (qry.getID(), basePrec, prclusterPrec, prclusterPrec-basePrec, 
                        baseCR, prclusterCR, prclusterCR-baseCR))

        
print(70*'=')
meanBasePrec = np.mean(baseline_prec)
meanBaseCR = np.mean(baseline_CR)
meanPrcPrec = np.mean(prCluster_prec)
meanPrcCR = np.mean(prCluster_CR)
print("Mean  &    %.2f   &   %.2f   & %6.2f &  %.2f  & %.2f  & %6.2f \\\\" % 
          (meanBasePrec, meanPrcPrec, meanPrcPrec-meanBasePrec,
           meanBaseCR, meanPrcCR, meanPrcCR-meanBaseCR))    
print("Done.")

Query & Base prec. & PRC prec. & Delta  & Base CR & PRC CR & Delta
   18 &    0.42   &   0.42   &   0.00 &  0.56  & 0.56  &   0.00 \\
   28 &    0.47   &   0.47   &   0.00 &  1.00  & 1.00  &   0.00 \\
   60 &    0.86   &   0.86   &   0.00 &  1.00  & 1.00  &   0.00 \\
   16 &    0.62   &   0.68   &   0.05 &  0.52  & 0.57  &   0.04 \\
   34 &    0.50   &   0.50   &   0.00 &  0.57  & 0.43  &  -0.14 \\
   39 &    0.57   &   0.60   &   0.03 &  0.75  & 1.00  &   0.25 \\
   44 &    0.05   &   0.05   &   0.00 &  0.13  & 0.07  &  -0.07 \\
   10 &    0.40   &   0.40   &   0.00 &  0.17  & 0.17  &   0.00 \\
   52 &    0.69   &   0.69   &   0.00 &  0.71  & 0.71  &   0.00 \\
   13 &    0.28   &   0.25   &  -0.03 &  0.44  & 0.44  &   0.00 \\
Mean  &    0.49   &   0.49   &   0.00 &  0.59  & 0.59  &   0.01 \\
Done.
