<center><h1>Informarion Retrival Project</h1></center>
<center><h1>Build an Intelligent Information Retrival System</h1></center>

## Importation des librairies et du dataset

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json
from metric_tools import compute_metrics

In [2]:
fichier_inverse = pd.read_csv("../csv_docs/fichier_inverse.csv")
fichier_inverse = fichier_inverse.drop("Unnamed: 0", axis=1)
fichier_inverse

Unnamed: 0,Word,Document,Frequence,Poid
0,0,D443,1,0.210977
1,0.18,D1090,1,0.527442
2,0.5,D720,1,0.791163
3,0.7%,D691,1,0.395581
4,000,D687,2,1.071347
...,...,...,...,...
77177,zipfian,D329,1,0.527442
77178,zone,D62,2,0.575391
77179,zoolog,D755,1,0.632930
77180,zuckerman,D1291,1,1.431959


In [3]:
info_queries = pd.read_csv("../csv_docs/info_queries.csv")
info_queries = info_queries.drop("Unnamed: 0", axis=1)
info_queries

Unnamed: 0,Query,Word
0,Q1,problem
1,Q1,concern
2,Q1,make
3,Q1,descript
4,Q1,titl
...,...,...
5188,Q112,algorithm
5189,Q112,compar
5190,Q112,previous
5191,Q112,describ


In [4]:
eval_df = pd.read_csv("../csv_docs/evaluation.csv")
eval_df = eval_df.drop("Unnamed: 0", axis=1)
eval_df.loc[eval_df["Query"] == "Q1"]

Unnamed: 0,Query,Document
0,Q1,D28
1,Q1,D35
2,Q1,D38
3,Q1,D42
4,Q1,D43
5,Q1,D52
6,Q1,D65
7,Q1,D76
8,Q1,D86
9,Q1,D150


### SRI basé sur le modèle vectoriel avec la fonction Scalar Product

In [5]:
# SRI Vectoriel (Scalar Product)
def scalar_product(query,fichier_inverse,info_queries):
    words = list(info_queries.loc[info_queries["Query"] == query, "Word"].unique()) # liste des mots de la requête
    sri = fichier_inverse.loc[fichier_inverse["Word"].isin(words),["Document","Poid"]] # on récupère les documents et les poids des mots de la requête
    sri = sri.groupby('Document')['Poid'].sum().to_frame() # on somme les poids des mots de la requête
    sri.reset_index(inplace=True) # on remet les documents en colonne
    sri.rename(columns = {'Poid':'RSV'}, inplace = True) # on renomme la colonne
    sri = sri.sort_values(by=["RSV"]) # on trie les documents par poids
    return sri.reindex(index=sri.index[::-1]) # on inverse l'ordre des documents
sp_result = scalar_product("Q1", fichier_inverse, info_queries)
sp_result

Unnamed: 0,Document,RSV
571,D451,2.291059
894,D814,2.223797
543,D42,2.127047
261,D1364,1.971921
275,D1388,1.949736
...,...,...
30,D1039,0.075661
412,D248,0.075661
597,D480,0.070629
28,D1037,0.068783


In [6]:
# Calcule des métriques
metrics = []
for query in info_queries["Query"].unique():
    sp_result = scalar_product(query, fichier_inverse, info_queries)
    p, p5, p10, r, f = compute_metrics(query, eval_df, sp_result)
    metrics.append([query, p, p5, p10, r, f])
metrics = pd.DataFrame(metrics, columns=["Query", "Precision", "P@5", "P@10", "Rappel", "F-Mesure"])
metrics

Unnamed: 0,Query,Precision,P@5,P@10,Rappel,F-Mesure
0,Q1,0.044444,0.2,0.4,1.000000,0.085106
1,Q2,0.023901,0.0,0.1,0.961538,0.046642
2,Q3,0.048423,0.8,0.7,0.977273,0.092275
3,Q4,0.015066,0.2,0.2,1.000000,0.029685
4,Q5,0.017002,0.4,0.3,0.916667,0.033384
...,...,...,...,...,...,...
107,Q108,0.000000,0.0,0.0,0.000000,0.000000
108,Q109,0.048917,0.6,0.7,0.985915,0.093209
109,Q110,0.000000,0.0,0.0,0.000000,0.000000
110,Q111,0.004651,0.2,0.2,1.000000,0.009259


In [7]:
metrics.to_csv("../csv_docs/sp_metrics.csv", index=False)

### SRI basé sur le modèle vectoriel avec la fonction Cosine Measure

In [8]:
# SRI Vectoriel (Cosine Measure)
def cosine_measure(query,fichier_inverse,info_queries):
    words = list(info_queries.loc[info_queries["Query"] == query, "Word"].unique()) # liste des mots de la requête
    df = fichier_inverse.loc[fichier_inverse["Word"].isin(words),["Document","Word","Poid"]] # on récupère les documents et les poids des mots de la requête
    n = df.groupby('Document')['Word'].count().to_frame() # on compte le nombre de mots par document
    df = df.groupby('Document')['Poid'].sum().to_frame() # on somme les poids des mots de la requête
    df.reset_index(inplace=True) # on remet les documents en colonne
    df.rename(columns = {'Poid':'Scalar weight'}, inplace = True)  # on renomme la colonne

    df["SQRT weight query"] = np.sqrt(n).to_numpy() # on calcule la racine carrée du nombre de mots par document

    sqrt_weights_doc = fichier_inverse.copy() # on copie le fichier inverse
    sqrt_weights_doc["Poid"] = np.power(sqrt_weights_doc["Poid"], 2) # on calcule le carré des poids
    sqrt_weights_doc = sqrt_weights_doc.groupby('Document')['Poid'].sum().to_frame() # on somme les carrés des poids
    sqrt_weights_doc["Poid"] = np.sqrt(sqrt_weights_doc["Poid"]) # on calcule la racine carrée des sommes des carrés des poids
    sqrt_weights_doc.rename(columns = {'Poid':'SQRT weight doc'}, inplace = True) # on renomme la colonne
    df = df.merge(sqrt_weights_doc, on="Document") # on fusionne les deux dataframes

    df["RSV"] = np.divide( # on divise le poids de la requête par le produit des racines carrées des poids de la requête et du document
                            df["Scalar weight"], 
                            np.multiply(df["SQRT weight query"], df["SQRT weight doc"]))

    df = df[["Document", "RSV"]].sort_values(by=["RSV"]) # on trie les documents par poids
    return df.reindex(index=df.index[::-1]) # on inverse l'ordre des documents
cosine_measure("Q1", fichier_inverse, info_queries) 

Unnamed: 0,Document,RSV
568,D449,0.529178
677,D565,0.488123
457,D315,0.475616
563,D444,0.472281
213,D1281,0.452538
...,...,...
30,D1039,0.028411
966,D903,0.024858
28,D1037,0.024400
528,D400,0.020878


In [11]:
# Calcule des métriques
metrics = []
for query in info_queries["Query"].unique():
    cm_result = cosine_measure(query, fichier_inverse, info_queries) 
    p, p5, p10, r, f = compute_metrics(query, eval_df, cm_result)
    metrics.append([query, p, p5, p10, r, f])
metrics = pd.DataFrame(metrics, columns=["Query", "Precision", "P@5", "P@10", "Rappel", "F-Mesure"])
metrics

Unnamed: 0,Query,Precision,P@5,P@10,Rappel,F-Mesure
0,Q1,0.044444,0.2,0.2,1.000000,0.085106
1,Q2,0.023901,0.0,0.0,0.961538,0.046642
2,Q3,0.048423,0.6,0.6,0.977273,0.092275
3,Q4,0.015066,0.2,0.1,1.000000,0.029685
4,Q5,0.017002,0.0,0.0,0.916667,0.033384
...,...,...,...,...,...,...
107,Q108,0.000000,0.0,0.0,0.000000,0.000000
108,Q109,0.048917,0.4,0.5,0.985915,0.093209
109,Q110,0.000000,0.0,0.0,0.000000,0.000000
110,Q111,0.004651,0.0,0.0,1.000000,0.009259


In [None]:
metrics.to_csv("../csv_docs/cm_metrics.csv", index=False)

### SRI basé sur le modèle vectoriel avec la fonction Jaccard Measure

In [12]:
# SRI Vectoriel (Jaccard Measure)
def jaccard_measure(query,fichier_inverse,info_queries):
    words = list(info_queries.loc[info_queries["Query"] == query, "Word"]) # liste des mots de la requête
    df = fichier_inverse.loc[fichier_inverse["Word"].isin(words),["Document","Word","Poid"]] # on récupère les documents et les poids des mots de la requête
    n = df.groupby('Document')['Word'].count().to_frame() # on compte le nombre de mots par document
    df = df.groupby('Document')['Poid'].sum().to_frame() # on somme les poids des mots de la requête
    df.reset_index(inplace=True) # on remet les documents en colonne
    df.rename(columns = {'Poid':'Scalar weight'}, inplace = True) # on renomme la colonne

    df["square weight query"] = n.to_numpy() # on calcule le carré du nombre de mots par document

    square_weights_doc = fichier_inverse.copy() # on copie le fichier inverse
    square_weights_doc["Poid"] = np.power(square_weights_doc["Poid"], 2) # on calcule le carré des poids
    square_weights_doc = square_weights_doc.groupby('Document')['Poid'].sum().to_frame() # on somme les carrés des poids
    square_weights_doc.rename(columns = {'Poid':'square weight doc'}, inplace = True) # on renomme la colonne
    df = df.merge(square_weights_doc, on="Document") # on fusionne les deux dataframes

    df["RSV"] = np.divide(# on divise le poids de la requête par la différence des sommes des carrés des poids de la requête et du document
                            df["Scalar weight"], 
                            np.subtract(np.add(df["square weight query"], df["square weight doc"]), 
                                                df["Scalar weight"]))

    df = df[["Document", "RSV"]].sort_values(by=["RSV"]) # on trie les documents par poids
    return df.reindex(index=df.index[::-1]) # on inverse l'ordre des documents
jaccard_measure("Q1", fichier_inverse, info_queries)

Unnamed: 0,Document,RSV
568,D449,0.339517
457,D315,0.309393
677,D565,0.299543
820,D722,0.276014
213,D1281,0.262324
...,...,...
350,D173,0.004599
179,D1234,0.004437
528,D400,0.004202
412,D248,0.003973


In [13]:
# Calcule des métriques
metrics = []
for query in info_queries["Query"].unique():
    jm_result = jaccard_measure(query, fichier_inverse, info_queries)  
    p, p5, p10, r, f = compute_metrics(query, eval_df, jm_result)
    metrics.append([query, p, p5, p10, r, f])
metrics = pd.DataFrame(metrics, columns=["Query", "Precision", "P@5", "P@10", "Rappel", "F-Mesure"])
metrics

Unnamed: 0,Query,Precision,P@5,P@10,Rappel,F-Mesure
0,Q1,0.044444,0.4,0.4,1.000000,0.085106
1,Q2,0.023901,0.0,0.1,0.961538,0.046642
2,Q3,0.048423,0.6,0.4,0.977273,0.092275
3,Q4,0.015066,0.2,0.1,1.000000,0.029685
4,Q5,0.017002,0.0,0.0,0.916667,0.033384
...,...,...,...,...,...,...
107,Q108,0.000000,0.0,0.0,0.000000,0.000000
108,Q109,0.048917,0.8,0.6,0.985915,0.093209
109,Q110,0.000000,0.0,0.0,0.000000,0.000000
110,Q111,0.004651,0.0,0.2,1.000000,0.009259


In [None]:
metrics.to_csv("../csv_docs/jm_metrics.csv", index=False)