# LOADING THE DATASET

In [None]:
!sudo apt-get install python3.10

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3.10 is already the newest version (3.10.12-1~22.04.7).
python3.10 set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

In [None]:
!python --version

Python 3.11.11


In [1]:
!pip install python-terrier
!pip install --upgrade pytrec-eval-terrier
!pip install ir-measures[cwl_eval]
!pip install ir-measures[ranx]
!pip install --upgrade git+https://github.com/Georgetown-IR-Lab/OpenNIR
!pip install beir

import pyterrier as pt
import os
from google.colab import files
import onir_pt

Collecting python-terrier
  Downloading python_terrier-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting ir-datasets>=0.3.2 (from python-terrier)
  Downloading ir_datasets-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting ir-measures>=0.3.1 (from python-terrier)
  Downloading ir_measures-0.3.6-py3-none-any.whl.metadata (7.0 kB)
Collecting pytrec-eval-terrier>=0.5.3 (from python-terrier)
  Downloading pytrec_eval_terrier-0.5.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (777 bytes)
Collecting dill (from python-terrier)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting chest (from python-terrier)
  Downloading chest-0.2.3.tar.gz (9.6 kB)
  Preparing metadata (setup.py)

In [None]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Download NFCorpus.zip dataset and unzip the dataset
dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path('./').parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where NFCorpus has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

/content/datasets/nfcorpus.zip:   0%|          | 0.00/2.34M [8ms<?, ?iB/s]

  0%|          | 0/3633 [25ms<?, ?it/s]

In [None]:
import pandas as pd
data = pd.DataFrame.from_dict(corpus, orient='index')
data['doc_id'] = data.index
data.reset_index(drop=True, inplace=True)
data.head()

Unnamed: 0,text,title,doc_id
0,"Recent studies have suggested that statins, an...",Statin Use and Breast Cancer Survival: A Natio...,MED-10
1,BACKGROUND: Preclinical studies have shown tha...,Statin use after diagnosis of breast cancer an...,MED-14
2,The aims of this study were to determine the c...,Alkylphenols in human milk and their relations...,MED-118
3,Epilepsy or seizure disorder is one of the mos...,Methylmercury: A Potential Environmental Risk ...,MED-301
4,Hit Reaction Time latencies (HRT) in the Conti...,Sensitivity of Continuous Performance Test (CP...,MED-306


In [None]:
queries = pd.DataFrame.from_dict(queries, orient='index', columns=['query_text'])
queries['query_id'] = queries.index
queries.reset_index(drop=True, inplace=True)
queries.head()

Unnamed: 0,query_text,query_id
0,Do Cholesterol Statin Drugs Cause Breast Cancer?,PLAIN-2
1,Exploiting Autophagy to Live Longer,PLAIN-12
2,How to Reduce Exposure to Alkylphenols Through...,PLAIN-23
3,What’s Driving America’s Obesity Problem?,PLAIN-33
4,Who Should be Careful About Curcumin?,PLAIN-44


In [None]:
flat_qrels = [{'query_id': query_id, 'doc_id': doc_id, 'relevance': relevance}
              for query_id, inner_dict in qrels.items()
              for doc_id, relevance in inner_dict.items()]

qrels_df = pd.DataFrame(flat_qrels)
qrels_df.sample(20)

Unnamed: 0,query_id,doc_id,relevance
12266,PLAIN-3462,MED-5273,2
8586,PLAIN-2061,MED-1280,1
5615,PLAIN-1527,MED-2616,1
11201,PLAIN-2800,MED-4454,1
11772,PLAIN-3271,MED-1613,1
7329,PLAIN-1837,MED-5339,1
1545,PLAIN-531,MED-2279,1
4937,PLAIN-1419,MED-3281,1
8870,PLAIN-2061,MED-4738,1
8362,PLAIN-2051,MED-2008,1


# ADJUSTING THE DATASETS

## DATA

In [None]:
#Adjust the corpus in the rigth format
data.rename(columns={'doc_id': 'docno'}, inplace=True)
data = data[['docno', 'title', 'text']]

## QUERIES

In [None]:
#Firstly, we have to rename the columns of the queries
queries.rename(columns={'query_id': 'qid', 'query_text': 'query'}, inplace=True)

#Secondly, we have to follow this order in the columns: ['qid', 'query']
queries = queries[['qid', 'query']]

#Thirdly we remove special characters that could disturb the query parser
queries['query'] = queries['query'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

## QRLES

In [None]:
qrels_df.rename(columns={'query_id': 'qid', 'doc_id': 'docno', 'relevance': 'label'}, inplace=True)

In [None]:
#Create a function for adjusting the inconsistences between the ranking and the qrles

def adjust_qrels(results_pipeline, qrels_df):

  #We take the unique qid of the queries present in the ranking and in the qrles dataset
  qrels_queries = set(qrels_df['qid'].unique())
  run_queries = set(results_pipeline['qid'].unique())

  #We obtain the qid of the queries present in the qrles dataset but not in the ranking
  missing_in_run = qrels_queries - run_queries

  #We create a dataset for these queries. Each query will be associated with a fake document named "dummy" and will produce a score of zero.
  empty_results = pd.DataFrame({'qid': list(missing_in_run), 'docno': ['dummy'] * len(missing_in_run), 'score': [0] * len(missing_in_run)})

  # We combine the ranking with the new dataset we have just created
  results_pipeline = pd.concat([results_pipeline, empty_results])

  # We control that the query present in the ranking are also present in the qrles
  results_pipeline = results_pipeline[results_pipeline['qid'].isin(qrels_queries)]

  return results_pipeline

# Data Analysis

In [None]:
total_terms = {doc_id: len(doc.split()) for doc_id, doc in data['text'].items()}

In [None]:

total_terms_df = pd.DataFrame.from_dict(total_terms, orient = 'index', columns = ['total_terms'])
round(total_terms_df['total_terms'].mean())

221

In [None]:
total_terms

{0: 251,
 1: 236,
 2: 176,
 3: 179,
 4: 262,
 5: 248,
 6: 189,
 7: 251,
 8: 282,
 9: 274,
 10: 141,
 11: 212,
 12: 224,
 13: 190,
 14: 182,
 15: 285,
 16: 224,
 17: 290,
 18: 157,
 19: 203,
 20: 280,
 21: 261,
 22: 136,
 23: 250,
 24: 189,
 25: 343,
 26: 257,
 27: 124,
 28: 176,
 29: 199,
 30: 57,
 31: 237,
 32: 124,
 33: 239,
 34: 240,
 35: 236,
 36: 226,
 37: 279,
 38: 127,
 39: 112,
 40: 114,
 41: 184,
 42: 202,
 43: 242,
 44: 256,
 45: 203,
 46: 207,
 47: 143,
 48: 125,
 49: 133,
 50: 250,
 51: 312,
 52: 250,
 53: 385,
 54: 280,
 55: 244,
 56: 185,
 57: 247,
 58: 223,
 59: 358,
 60: 217,
 61: 216,
 62: 224,
 63: 295,
 64: 268,
 65: 266,
 66: 192,
 67: 262,
 68: 234,
 69: 301,
 70: 271,
 71: 192,
 72: 251,
 73: 266,
 74: 123,
 75: 260,
 76: 257,
 77: 185,
 78: 248,
 79: 255,
 80: 192,
 81: 320,
 82: 231,
 83: 197,
 84: 256,
 85: 244,
 86: 199,
 87: 171,
 88: 125,
 89: 215,
 90: 273,
 91: 249,
 92: 191,
 93: 223,
 94: 186,
 95: 244,
 96: 617,
 97: 204,
 98: 208,
 99: 244,
 100: 206,


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import matplotlib.pyplot as plt

In [None]:

stemmer = PorterStemmer()

In [None]:
total_terms_stem = {}
for doc_id, doc in data['text'].items():
  tokens = [stemmer.stem(word) for word in doc.lower().split() if word not in tokens]
  total_terms_stem[doc_id] = len(tokens)


In [None]:
total_terms_stem_df = pd.DataFrame.from_dict(total_terms_stem, orient = 'index', columns = ['total_terms_stem'])
round(total_terms_stem_df['total_terms_stem'].mean())

185

In [None]:
total_terms_stem

{0: 219,
 1: 162,
 2: 160,
 3: 156,
 4: 234,
 5: 204,
 6: 153,
 7: 212,
 8: 230,
 9: 166,
 10: 121,
 11: 184,
 12: 184,
 13: 151,
 14: 145,
 15: 228,
 16: 189,
 17: 232,
 18: 140,
 19: 160,
 20: 241,
 21: 209,
 22: 109,
 23: 207,
 24: 142,
 25: 299,
 26: 199,
 27: 104,
 28: 151,
 29: 166,
 30: 51,
 31: 197,
 32: 102,
 33: 200,
 34: 209,
 35: 185,
 36: 186,
 37: 241,
 38: 107,
 39: 84,
 40: 99,
 41: 145,
 42: 186,
 43: 216,
 44: 232,
 45: 152,
 46: 188,
 47: 118,
 48: 112,
 49: 108,
 50: 237,
 51: 246,
 52: 231,
 53: 323,
 54: 236,
 55: 203,
 56: 147,
 57: 210,
 58: 191,
 59: 288,
 60: 179,
 61: 188,
 62: 197,
 63: 236,
 64: 222,
 65: 222,
 66: 168,
 67: 211,
 68: 200,
 69: 236,
 70: 242,
 71: 155,
 72: 219,
 73: 221,
 74: 104,
 75: 210,
 76: 234,
 77: 134,
 78: 220,
 79: 204,
 80: 147,
 81: 256,
 82: 206,
 83: 149,
 84: 225,
 85: 205,
 86: 173,
 87: 139,
 88: 105,
 89: 184,
 90: 209,
 91: 206,
 92: 155,
 93: 178,
 94: 154,
 95: 200,
 96: 515,
 97: 167,
 98: 184,
 99: 202,
 100: 176,
 1

In [None]:
nltk.download('stopwords')

# Initialize tools
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
total_term_no_stop = {}
for doc_id, doc in data['text'].items():
  tokens = [word for word in doc.lower().split() if word not in stop_words]
  total_term_no_stop[doc_id] = len(tokens)


In [None]:
total_term_no_stop_df = pd.DataFrame.from_dict(total_term_no_stop, orient = 'index', columns = ['total_term_no_stop'])
round(total_term_no_stop_df['total_term_no_stop'].mean())

147

In [None]:
total_term_no_stop

{0: 169,
 1: 165,
 2: 126,
 3: 115,
 4: 177,
 5: 167,
 6: 136,
 7: 168,
 8: 189,
 9: 177,
 10: 82,
 11: 136,
 12: 138,
 13: 109,
 14: 117,
 15: 175,
 16: 136,
 17: 174,
 18: 113,
 19: 129,
 20: 188,
 21: 176,
 22: 78,
 23: 147,
 24: 114,
 25: 235,
 26: 174,
 27: 71,
 28: 115,
 29: 128,
 30: 33,
 31: 148,
 32: 71,
 33: 158,
 34: 166,
 35: 146,
 36: 137,
 37: 189,
 38: 72,
 39: 69,
 40: 66,
 41: 123,
 42: 147,
 43: 200,
 44: 169,
 45: 119,
 46: 120,
 47: 89,
 48: 71,
 49: 84,
 50: 180,
 51: 186,
 52: 170,
 53: 291,
 54: 181,
 55: 171,
 56: 127,
 57: 167,
 58: 152,
 59: 212,
 60: 148,
 61: 153,
 62: 146,
 63: 202,
 64: 184,
 65: 194,
 66: 121,
 67: 175,
 68: 159,
 69: 189,
 70: 164,
 71: 115,
 72: 152,
 73: 187,
 74: 83,
 75: 165,
 76: 173,
 77: 120,
 78: 204,
 79: 186,
 80: 117,
 81: 211,
 82: 149,
 83: 145,
 84: 164,
 85: 171,
 86: 123,
 87: 108,
 88: 78,
 89: 145,
 90: 185,
 91: 172,
 92: 115,
 93: 149,
 94: 132,
 95: 165,
 96: 416,
 97: 129,
 98: 146,
 99: 172,
 100: 157,
 101: 149,
 

In [None]:
relevant_counts = qrels_df[qrels_df["label"] > 0].groupby("qid").size()

unique_labels = qrels_df["label"].unique()

# Display Results
print("Relevant Documents per Query:\n", relevant_counts)
print("Relevance Labels:\n", unique_labels)

# Example visualization of relevance distribution
qrels_df[qrels_df["label"] > 0].groupby("label").size().plot(kind="bar")
plt.title("Relevance Label Distribution")
plt.xlabel("Relevance Label")
plt.ylabel("Number of Documents")
plt.show()

Relevant Documents per Query:
 qid
PLAIN-1008      5
PLAIN-1018     60
PLAIN-102      24
PLAIN-1028      1
PLAIN-1039      2
             ... 
PLAIN-956     206
PLAIN-966       3
PLAIN-977       3
PLAIN-987       3
PLAIN-997       2
Length: 323, dtype: int64
Relevance Labels:
 [2 1]


too much to be plotted :(

In [None]:
import numpy as np

Unnamed: 0,qid,query
0,PLAIN-2,Do Cholesterol Statin Drugs Cause Breast Cancer
1,PLAIN-12,Exploiting Autophagy to Live Longer
2,PLAIN-23,How to Reduce Exposure to Alkylphenols Through...
3,PLAIN-33,Whats Driving Americas Obesity Problem
4,PLAIN-44,Who Should be Careful About Curcumin
...,...,...
318,PLAIN-3432,Healthy Chocolate Milkshakes
319,PLAIN-3442,The Healthiest Vegetables
320,PLAIN-3452,Bowel Movement Frequency
321,PLAIN-3462,Olive Oil and Artery Function


In [None]:
token_counts = [len(query.split()) for query in queries['query']]

# Descriptive statistics
mean_tokens = np.mean(token_counts)
median_tokens = np.median(token_counts)
min_tokens = np.min(token_counts)
max_tokens = np.max(token_counts)
std_dev_tokens = np.std(token_counts)

# Print statistics
print(f"Mean tokens: {mean_tokens}")
print(f"Median tokens: {median_tokens}")
print(f"Min tokens: {min_tokens}")
print(f"Max tokens: {max_tokens}")
print(f"Standard deviation: {std_dev_tokens}")

# Plot histogram
plt.hist(token_counts, bins=range(min_tokens, max_tokens + 2), edgecolor='black', align='left')
plt.title("Distribution of Token Counts in Queries")
plt.xlabel("Number of Tokens")
plt.ylabel("Frequency")
plt.show()

Mean tokens: 3.2724458204334366
Median tokens: 2.0
Min tokens: 1
Max tokens: 11
Standard deviation: 2.4074347997767314


### words cloud

In [None]:
!pip install wordcloud



In [None]:

from wordcloud import WordCloud
term_frequencies = {}
for doc in data['text']:
    for word in doc.lower().split():
        term_frequencies[word] = term_frequencies.get(word, 0) + 1

top_n = 10
sorted_terms = sorted(term_frequencies.items(), key=lambda x: x[1], reverse=True)[:top_n]
limited_term_frequencies = dict(sorted_terms)

print(limited_term_frequencies)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(limited_term_frequencies)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Term Distribution Word Cloud")
plt.show()

{'this': 2, 'is': 2, 'a': 1, 'sample': 1, 'text': 1, 'another': 1, 'example': 1}


In [None]:
plt.figure(figsize=(20, 10))
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis('off')
#plt.title("Term Distribution Word Cloud")
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Example data
data = {'text': ["This is a sample text", "This is another example"]}

# Compute term frequencies
term_frequencies = {}
for doc in data['text']:
    for word in doc.lower().split():
        term_frequencies[word] = term_frequencies.get(word, 0) + 1

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(term_frequencies)

# Plot word cloud
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Term Distribution Word Cloud")
plt.show()


# MM copia

## INDEXING THE COLLECTION (TITLE + TEXT) USING THE DEFAULT STEMMER

In [None]:
#We import Pyterrier and initiliaze it
import pyterrier as pt
import os

#Directory to store the index
pt_index_path = './antonio'

#Convert the dataset in a list of dictionaries
corpus = data.to_dict(orient='records')

# Index the collection (title + text) using the default stemmer and save as meta data: text, title and doc_id

if not os.path.exists(pt_index_path + "/data.properties"):
  indexer = pt.index.IterDictIndexer(pt_index_path, meta={'docno':20, 'text':2000, 'title':100}, text_attrs = ['title'])
  index_ref = indexer.index(corpus)
else:
  index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

index = pt.IndexFactory.of(index_ref)

print('Collection Statistics',index.getCollectionStatistics().toString())

terrier-assemblies 5.11 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


Java started (triggered by TerrierIndexer.__init__) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


Collection Statistics Number of documents: 3633
Number of terms: 5075
Number of postings: 33108
Number of fields: 0
Number of tokens: 33787
Field names: []
Positions:   false



## SETTING THE PIPELINE (BM25-TFIDF) AND RUNNING THE EXPERIMENTS DEFAULT

In [None]:
#We set the pipeline
tf_idf = pt.terrier.Retriever(index, wmodel="TF_IDF")
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
pipeline = (bm25 % 100) >> tf_idf

In [None]:
#Now we can run the pipeline
results_pipeline = pipeline.transform(queries)
results_pipeline = adjust_qrels(results_pipeline, qrels_df)

In [None]:
evaluate_pipeline_bm25_tfidf_default = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'])
display(evaluate_pipeline_bm25_tfidf_default)

  scores[i] = _ndcg(qrels[i], run[i], k, rel_lvl, jarvelin)


{'map': 0.3621066039061444,
 'P_10': 0.1687306501547989,
 'recall_25': 0.1384755243657402,
 'ndcg': 0.19014391724343058}

In [None]:
#Evaluate the pipeline performances on each query
evaluate_pipeline_per_query = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'], perquery=True)

In [None]:
#We convert the results of the pipeline_per_query in a Dataframe
evaluate_pipeline_per_query_df = pd.DataFrame(evaluate_pipeline_per_query)

#Transpose the Dataframe
evaluate_pipeline_per_query_df = evaluate_pipeline_per_query_df.T

#Exclude all the queries which do not appear in the queries dataset
evaluate_pipeline_per_query_df = evaluate_pipeline_per_query_df[evaluate_pipeline_per_query_df.index.isin(queries['qid'])]

In [None]:
#We save the results in a CSV and download it
evaluate_pipeline_per_query_df.to_csv('bm25_tfidf_default.csv')
files.download('bm25_tfidf_default.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Neural Reranking of pipeline (BM25 TF-IDF) DEFAULT STEMMER

In [None]:
#We use BERT to neural re-rank the pipeline. We specify that it will recive as input the titles of the documents.
vbert = onir_pt.reranker(
    'vanilla_transformer', 'bert',
    text_field='title',
    vocab_config={'train': True}
)
#We create the neural pipeline. We use the titles of the documents, which we have saved as metadata in the indexing, to re-rank.
neural_pipeline = pipeline >> pt.text.get_text(indexlike=index, metadata='title') >> vbert

config file not found: config


100%|██████████| 231508/231508 [178ms<0ms, 1302533.03B/s]
100%|██████████| 433/433 [1ms<0ms, 436360.80B/s]
100%|██████████| 440473133/440473133 [9.30s<0ms, 47349456.97B/s] 
  state_dict = torch.load(resolved_archive_file, map_location='cpu')


In [None]:
#Now we can run the pipeline
results_pipeline = neural_pipeline.transform(queries)
results_pipeline = adjust_qrels(results_pipeline, qrels_df)

[02;37m[2025-01-15 19:51:19,963][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2025-01-15 19:51:20,489][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/3813 [19ms<?, ?it/s]

[02;37m[2025-01-15 19:52:31,917][onir_pt][DEBUG] [0m[37m[finished] batches: [01:11] [3813it] [53.38it/s][0m


In [None]:
#Evaluates the pipeline overall perfomances
evaluate_pipeline_bm25_tfid_default_neural = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'])
display(evaluate_pipeline_bm25_tfid_default_neural)

{'map': 0.23964601686775233,
 'P_10': 0.10526315789473695,
 'recall_25': 0.10314990251040387,
 'ndcg': 0.1374106119398779}

## INDEXING THE COLLECTION (TITLE + TEXT) USING THE WEAKPORTER STEMMER

In [None]:
#We import Pyterrier and initiliaze it
import pyterrier as pt
import os

#Directory to store the index
pt_index_path = './claudio'

#Convert the dataset in a list of dictionaries
corpus = data.to_dict(orient='records')

# Index the collection (title + text) using the default stemmer and save as meta data: text, title and doc_id
if not os.path.exists(pt_index_path + "/data.properties"):
  indexer = pt.index.IterDictIndexer(pt_index_path, meta={'docno':20, 'text':2000, 'title':100}, text_attrs = ['text', 'title'], stemmer = pt.TerrierStemmer.weakporter)
  index_ref = indexer.index(corpus)
else:
  index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

index = pt.IndexFactory.of(index_ref)

print('Collection Statistics',index.getCollectionStatistics().toString())

Collection Statistics Number of documents: 3633
Number of terms: 21500
Number of postings: 345574
Number of fields: 0
Number of tokens: 567901
Field names: []
Positions:   false



## SETTING THE PIPELINE (BM25-TFIDF) AND RUNNING THE EXPERIMENTS WEAK PORTER

In [None]:
#We set the pipeline
tf_idf = pt.terrier.Retriever(index, wmodel="TF_IDF")
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
pipeline = (bm25 % 100) >> tf_idf

In [None]:
#Now we can run the pipeline
results_pipeline = pipeline.transform(queries)
results_pipeline = adjust_qrels(results_pipeline, qrels_df)

In [None]:
evaluate_pipeline_bm25_tfidf_default = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'])
display(evaluate_pipeline_bm25_tfidf_default)

{'map': 0.39514049569665693,
 'P_10': 0.23219814241486056,
 'recall_25': 0.1852769739839275,
 'ndcg': 0.2565750437414397}

In [None]:
#Evaluate the pipeline performances on each query
evaluate_pipeline_per_query = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'], perquery=True)

In [None]:
#We convert the results of the pipeline_per_query in a Dataframe
evaluate_pipeline_per_query_df = pd.DataFrame(evaluate_pipeline_per_query)

#Transpose the Dataframe
evaluate_pipeline_per_query_df = evaluate_pipeline_per_query_df.T

#Exclude all the queries which do not appear in the queries dataset
evaluate_pipeline_per_query_df = evaluate_pipeline_per_query_df[evaluate_pipeline_per_query_df.index.isin(queries['qid'])]

In [None]:
#We save the results in a CSV and download it
evaluate_pipeline_per_query_df.to_csv('bm25_tfidf_default.csv')
files.download('bm25_tfidf_default.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Neural Reranking of pipeline (BM25 TF-IDF) WEAKPORTER

In [None]:
#We use BERT to neural re-rank the pipeline. We specify that it will recive as input the titles of the documents.
vbert = onir_pt.reranker(
    'vanilla_transformer', 'bert',
    text_field='title',
    vocab_config={'train': True}
)
#We create the neural pipeline. We use the titles of the documents, which we have saved as metadata in the indexing, to re-rank.
neural_pipeline = pipeline >> pt.text.get_text(indexlike=index, metadata='title') >> vbert

  state_dict = torch.load(resolved_archive_file, map_location='cpu')


In [None]:
#Now we can run the pipeline
results_pipeline = neural_pipeline.transform(queries)
results_pipeline = adjust_qrels(results_pipeline, qrels_df)

[02;37m[2025-01-15 19:55:31,645][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2025-01-15 19:55:31,806][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/5274 [7ms<?, ?it/s]

[02;37m[2025-01-15 19:57:02,584][onir_pt][DEBUG] [0m[37m[finished] batches: [01:31] [5274it] [58.10it/s][0m


In [None]:
#Evaluates the pipeline overall perfomances
evaluate_pipeline_bm25_tfid_default_neural = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'])
display(evaluate_pipeline_bm25_tfid_default_neural)

{'map': 0.20763650540672982,
 'P_10': 0.09845201238390103,
 'recall_25': 0.10117473759280424,
 'ndcg': 0.1648878518133511}

## INDEXING THE COLLECTION (TITLE + TEXT) USING NO STEMMER

In [None]:
#We import Pyterrier and initiliaze it
import pyterrier as pt
import os

#Directory to store the index
pt_index_path = './demitrio'

#Convert the dataset in a list of dictionaries
corpus = data.to_dict(orient='records')

# Index the collection (title + text) using the default stemmer and save as meta data: text, title and doc_id
if not os.path.exists(pt_index_path + "/data.properties"):
  indexer = pt.index.IterDictIndexer(pt_index_path, meta={'docno':20, 'text':2000, 'title':100}, text_attrs = ['title'], stemmer = pt.TerrierStemmer.none)
  index_ref = indexer.index(corpus)
else:
  index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

index = pt.IndexFactory.of(index_ref)

print('Collection Statistics',index.getCollectionStatistics().toString())

Collection Statistics Number of documents: 3633
Number of terms: 6453
Number of postings: 33201
Number of fields: 0
Number of tokens: 33787
Field names: []
Positions:   false



## SETTING THE PIPELINE (BM25-TFIDF) AND RUNNING THE EXPERIMENTS NO STEMMER

In [None]:
#We set the pipeline
tf_idf = pt.terrier.Retriever(index, wmodel="TF_IDF")
bm25 = pt.terrier.Retriever(index, wmodel="BM25")
pipeline = (bm25 % 100) >> tf_idf

In [None]:
#Now we can run the pipeline
results_pipeline = pipeline.transform(queries)
results_pipeline = adjust_qrels(results_pipeline, qrels_df)

In [None]:
evaluate_pipeline_bm25_tfidf_default = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'])
display(evaluate_pipeline_bm25_tfidf_default)

{'map': 0.3414209745488222,
 'P_10': 0.1433436532507742,
 'recall_25': 0.12364491866950258,
 'ndcg': 0.1658606787744923}

In [None]:
#Evaluate the pipeline performances on each query
evaluate_pipeline_per_query = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'], perquery=True)

In [None]:
#We convert the results of the pipeline_per_query in a Dataframe
evaluate_pipeline_per_query_df = pd.DataFrame(evaluate_pipeline_per_query)

#Transpose the Dataframe
evaluate_pipeline_per_query_df = evaluate_pipeline_per_query_df.T

#Exclude all the queries which do not appear in the queries dataset
evaluate_pipeline_per_query_df = evaluate_pipeline_per_query_df[evaluate_pipeline_per_query_df.index.isin(queries['qid'])]

In [None]:
#We save the results in a CSV and download it
evaluate_pipeline_per_query_df.to_csv('bm25_tfidf_default.csv')
files.download('bm25_tfidf_default.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Neural Reranking of pipeline (BM25 TF-IDF) NO STEMMER

In [None]:
#We use BERT to neural re-rank the pipeline. We specify that it will recive as input the titles of the documents.
vbert = onir_pt.reranker(
    'vanilla_transformer', 'bert',
    text_field='title',
    vocab_config={'train': True}
)
#We create the neural pipeline. We use the titles of the documents, which we have saved as metadata in the indexing, to re-rank.
neural_pipeline = pipeline >> pt.text.get_text(indexlike=index, metadata='title') >> vbert

  state_dict = torch.load(resolved_archive_file, map_location='cpu')


In [None]:
#Now we can run the pipeline
results_pipeline = neural_pipeline.transform(queries)
results_pipeline = adjust_qrels(results_pipeline, qrels_df)

[02;37m[2025-01-15 19:59:48,701][onir_pt][DEBUG] [0m[37musing GPU (deterministic)[0m
[02;37m[2025-01-15 19:59:48,836][onir_pt][DEBUG] [0m[37m[starting] batches[0m


batches:   0%|          | 0/3270 [7ms<?, ?it/s]

[02;37m[2025-01-15 20:00:46,108][onir_pt][DEBUG] [0m[37m[finished] batches: [57.27s] [3270it] [57.10it/s][0m


In [None]:
#Evaluates the pipeline overall perfomances
evaluate_pipeline_bm25_tfid_default_neural = pt.Evaluate(results_pipeline, qrels_df, metrics=['map', 'P_10', 'recall_25', 'ndcg'])
display(evaluate_pipeline_bm25_tfid_default_neural)

{'map': 0.2412589994035995,
 'P_10': 0.09349845201238399,
 'recall_25': 0.09430946568956919,
 'ndcg': 0.12583565059982063}

# Best/Worst queries

reading all the csv and merging together

In [7]:
import pandas as pd

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## import df bm25 tfidf pipeline

In [33]:
df_bm25_tfidf_1 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/bm25_tfidf_default.csv')
df_bm25_tfidf_1.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_bm25_tfidf_1

Unnamed: 0,qid,P_10,map,ndcg,recall_25
0,PLAIN-2,0.7,0.602967,0.778351,0.458333
1,PLAIN-12,0.1,0.200000,0.060448,0.033333
2,PLAIN-23,0.8,0.551268,0.332545,0.088889
3,PLAIN-33,0.2,0.331759,0.223693,0.062500
4,PLAIN-44,0.2,0.187655,0.210730,0.068966
...,...,...,...,...,...
318,PLAIN-2134,0.0,0.000000,0.000000,0.000000
319,PLAIN-1679,0.0,0.000000,0.000000,0.000000
320,PLAIN-2408,0.0,0.000000,0.000000,0.000000
321,PLAIN-997,0.0,0.000000,0.000000,0.000000


In [34]:
df_bm25_tfidf_2 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/bm25_tfidf_weak_porter.csv')
df_bm25_tfidf_2.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_bm25_tfidf_2

Unnamed: 0,qid,P_10,map,ndcg,recall_25
0,PLAIN-2,0.6,0.564411,0.765618,0.416667
1,PLAIN-12,0.1,0.200000,0.060448,0.033333
2,PLAIN-23,0.5,0.610925,0.297864,0.100000
3,PLAIN-33,0.2,0.280382,0.236305,0.062500
4,PLAIN-44,0.1,0.163297,0.182426,0.051724
...,...,...,...,...,...
318,PLAIN-2134,0.0,0.000000,0.000000,0.000000
319,PLAIN-1679,0.0,0.000000,0.000000,0.000000
320,PLAIN-2408,0.0,0.000000,0.000000,0.000000
321,PLAIN-997,0.0,0.000000,0.000000,0.000000


In [35]:
df_bm25_tfidf_3 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/bm25_tfidf_no_stemmer.csv')
df_bm25_tfidf_3.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_bm25_tfidf_3

Unnamed: 0,qid,P_10,map,ndcg,recall_25
0,PLAIN-2,0.4,0.474648,0.631728,0.291667
1,PLAIN-12,0.0,0.000000,0.000000,0.000000
2,PLAIN-23,0.5,0.311561,0.172310,0.066667
3,PLAIN-33,0.2,0.146941,0.207999,0.093750
4,PLAIN-44,0.0,0.157213,0.158664,0.068966
...,...,...,...,...,...
318,PLAIN-2880,0.0,0.000000,0.000000,0.000000
319,PLAIN-3131,0.0,0.000000,0.000000,0.000000
320,PLAIN-1621,0.0,0.000000,0.000000,0.000000
321,PLAIN-1363,0.0,0.000000,0.000000,0.000000


## import df tfidf bm25 pipeline

In [16]:
df_tfidf_bm25_1 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/tfidf_bm25_default.csv')
df_tfidf_bm25_1.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_tfidf_bm25_1

Unnamed: 0,qid,map,P_10,ndcg,recall_25
0,PLAIN-2,0.629057,0.7,0.766694,0.458333
1,PLAIN-12,0.200000,0.1,0.060448,0.033333
2,PLAIN-23,0.668309,0.8,0.325125,0.088889
3,PLAIN-33,0.239405,0.2,0.256239,0.062500
4,PLAIN-44,0.198861,0.2,0.214485,0.068966
...,...,...,...,...,...
318,PLAIN-2271,0.000000,0.0,0.000000,0.000000
319,PLAIN-2408,0.000000,0.0,0.000000,0.000000
320,PLAIN-551,0.000000,0.0,0.000000,0.000000
321,PLAIN-1621,0.000000,0.0,0.000000,0.000000


In [19]:
df_tfidf_bm25_2 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/tfidf_bm25_weak_porter.csv')
df_tfidf_bm25_2.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_tfidf_bm25_2

Unnamed: 0,qid,P_10,map,ndcg,recall_25
0,PLAIN-2,0.6,0.585980,0.748091,0.416667
1,PLAIN-12,0.1,0.200000,0.060448,0.033333
2,PLAIN-23,0.8,0.680343,0.326287,0.100000
3,PLAIN-33,0.2,0.242724,0.257736,0.062500
4,PLAIN-44,0.1,0.173239,0.187662,0.051724
...,...,...,...,...,...
318,PLAIN-1008,0.0,0.000000,0.000000,0.000000
319,PLAIN-1611,0.0,0.000000,0.000000,0.000000
320,PLAIN-2209,0.0,0.000000,0.000000,0.000000
321,PLAIN-1679,0.0,0.000000,0.000000,0.000000


In [20]:
df_tfidf_bm25_3 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/tfidf_bm25_no_stemmer.csv')
df_tfidf_bm25_3.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_tfidf_bm25_3

Unnamed: 0,qid,P_10,map,ndcg,recall_25
0,PLAIN-2,0.4,0.497106,0.638526,0.333333
1,PLAIN-12,0.0,0.000000,0.000000,0.000000
2,PLAIN-23,0.5,0.312373,0.172373,0.066667
3,PLAIN-33,0.2,0.146941,0.207999,0.093750
4,PLAIN-44,0.0,0.157213,0.158360,0.068966
...,...,...,...,...,...
318,PLAIN-1331,0.0,0.000000,0.000000,0.000000
319,PLAIN-817,0.0,0.000000,0.000000,0.000000
320,PLAIN-1309,0.0,0.000000,0.000000,0.000000
321,PLAIN-583,0.0,0.000000,0.000000,0.000000


## import df tfidf bm25 models

In [21]:
df_tfidf_1 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/tfidf_default.csv')
df_tfidf_1.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_tfidf_1

Unnamed: 0,qid,P_10,map,recall_25,ndcg
0,PLAIN-2,0.7,0.505938,0.458333,0.820251
1,PLAIN-12,0.1,0.079710,0.033333,0.082815
2,PLAIN-23,0.8,0.247876,0.088889,0.468726
3,PLAIN-33,0.2,0.147796,0.062500,0.336384
4,PLAIN-44,0.2,0.128289,0.068966,0.274872
...,...,...,...,...,...
318,PLAIN-997,0.0,0.000000,0.000000,0.000000
319,PLAIN-1679,0.0,0.000000,0.000000,0.000000
320,PLAIN-1309,0.0,0.000000,0.000000,0.000000
321,PLAIN-2209,0.0,0.000000,0.000000,0.000000


In [22]:
df_bm25_1 = pd.read_csv('drive/MyDrive/Colab Notebooks/Magistrale/IR/bm25_default.csv')
df_bm25_1.rename(columns={'Unnamed: 0': 'qid'}, inplace = True)
df_bm25_1

Unnamed: 0,qid,P_10,map,recall_25,ndcg
0,PLAIN-2,0.7,0.488793,0.458333,0.831714
1,PLAIN-12,0.1,0.079762,0.033333,0.082846
2,PLAIN-23,0.8,0.272577,0.088889,0.503403
3,PLAIN-33,0.2,0.147316,0.062500,0.335465
4,PLAIN-44,0.2,0.134542,0.068966,0.278511
...,...,...,...,...,...
318,PLAIN-997,0.0,0.000000,0.000000,0.000000
319,PLAIN-1679,0.0,0.000000,0.000000,0.000000
320,PLAIN-1309,0.0,0.000000,0.000000,0.000000
321,PLAIN-2209,0.0,0.000000,0.000000,0.000000


## Merge

In [82]:
df = pd.merge(df_bm25_tfidf_1, df_bm25_tfidf_2, on = 'qid', how = 'outer', suffixes=('_bm25_tfidf_default', '_bm25_tfidf_weak_porter'))
df = pd.merge(df, df_bm25_tfidf_3, on = 'qid', how = 'outer')
df = pd.merge(df, df_tfidf_bm25_1, on = 'qid', how = 'outer', suffixes = ('_bm25_tfidf_no_stemmer', '_tfidf_bm25_default'))
df = pd.merge(df, df_tfidf_bm25_2, on = 'qid', how = 'outer')
df = pd.merge(df, df_tfidf_bm25_3, on = 'qid', how = 'outer', suffixes = ('_tfidf_bm25_weak_porter', '_tfidf_bm25_no_stemmer'))
df = pd.merge(df, df_tfidf_1, on = 'qid', how = 'outer')
df = pd.merge(df, df_bm25_1, on = 'qid', how = 'outer', suffixes = ('_tfidf_default', '_bm25_default'))
df

Unnamed: 0,qid,P_10_bm25_tfidf_default,map_bm25_tfidf_default,ndcg_bm25_tfidf_default,recall_25_bm25_tfidf_default,P_10_bm25_tfidf_weak_porter,map_bm25_tfidf_weak_porter,ndcg_bm25_tfidf_weak_porter,recall_25_bm25_tfidf_weak_porter,P_10_bm25_tfidf_no_stemmer,...,ndcg_tfidf_bm25_no_stemmer,recall_25_tfidf_bm25_no_stemmer,P_10_tfidf_default,map_tfidf_default,recall_25_tfidf_default,ndcg_tfidf_default,P_10_bm25_default,map_bm25_default,recall_25_bm25_default,ndcg_bm25_default
0,PLAIN-1008,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
1,PLAIN-1018,0.6,0.538758,0.195075,0.133333,0.6,0.538758,0.195075,0.133333,0.0,...,0.000000,0.000000,0.6,0.538758,0.133333,0.195075,0.6,0.538758,0.133333,0.195075
2,PLAIN-102,0.1,0.200000,0.081017,0.041667,0.1,0.200000,0.081017,0.041667,0.0,...,0.033834,0.000000,0.1,0.034496,0.041667,0.194871,0.1,0.033877,0.041667,0.193503
3,PLAIN-1028,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.270238,1.000000,0.0,0.002364,0.000000,0.114575,0.0,0.002278,0.000000,0.113878
4,PLAIN-1039,0.2,1.000000,1.000000,1.000000,0.2,1.000000,1.000000,1.000000,0.1,...,0.613147,0.500000,0.2,1.000000,1.000000,1.000000,0.2,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,PLAIN-956,0.5,0.499266,0.226991,0.072816,0.5,0.499266,0.226991,0.072816,0.4,...,0.089138,0.048544,0.5,0.315086,0.072816,0.408654,0.6,0.324810,0.067961,0.412933
319,PLAIN-966,0.1,0.100000,0.135652,0.333333,0.1,0.100000,0.135652,0.333333,0.0,...,0.000000,0.000000,0.1,0.100000,0.333333,0.135652,0.1,0.100000,0.333333,0.135652
320,PLAIN-977,0.0,0.071429,0.120116,0.333333,0.0,0.071429,0.120116,0.333333,0.0,...,0.123256,0.333333,0.0,0.071429,0.333333,0.120116,0.0,0.071429,0.333333,0.120116
321,PLAIN-987,0.1,0.333333,0.234639,0.333333,0.1,0.333333,0.234639,0.333333,0.0,...,0.000000,0.000000,0.1,0.333333,0.333333,0.234639,0.1,0.333333,0.333333,0.234639


In [47]:
df.columns

Index(['qid', 'P_10_bm25_tfidf_default', 'map_bm25_tfidf_default',
       'ndcg_bm25_tfidf_default', 'recall_25_bm25_tfidf_default',
       'P_10_bm25_tfidf_weak_porter', 'map_bm25_tfidf_weak_porter',
       'ndcg_bm25_tfidf_weak_porter', 'recall_25_bm25_tfidf_weak_porter',
       'P_10_bm25_tfidf_no_stemmer', 'map_bm25_tfidf_no_stemmer',
       'ndcg_bm25_tfidf_no_stemmer', 'recall_25_bm25_tfidf_no_stemmer',
       'map_tfidf_bm25_default', 'P_10_tfidf_bm25_default',
       'ndcg_tfidf_bm25_default', 'recall_25_tfidf_bm25_default',
       'P_10_tfidf_bm25_weak_porter', 'map_tfidf_bm25_weak_porter',
       'ndcg_tfidf_bm25_weak_porter', 'recall_25_tfidf_bm25_weak_porter',
       'P_10_tfidf_bm25_no_stemmer', 'map_tfidf_bm25_no_stemmer',
       'ndcg_tfidf_bm25_no_stemmer', 'recall_25_tfidf_bm25_no_stemmer',
       'P_10_tfidf_default', 'map_tfidf_default', 'recall_25_tfidf_default',
       'ndcg_tfidf_default', 'P_10_bm25_default', 'map_bm25_default',
       'recall_25_bm25_default', '

## best worst queries

In [91]:
df

Unnamed: 0,qid,P_10_bm25_tfidf_default,map_bm25_tfidf_default,ndcg_bm25_tfidf_default,recall_25_bm25_tfidf_default,P_10_bm25_tfidf_weak_porter,map_bm25_tfidf_weak_porter,ndcg_bm25_tfidf_weak_porter,recall_25_bm25_tfidf_weak_porter,P_10_bm25_tfidf_no_stemmer,...,ndcg_tfidf_bm25_no_stemmer,recall_25_tfidf_bm25_no_stemmer,P_10_tfidf_default,map_tfidf_default,recall_25_tfidf_default,ndcg_tfidf_default,P_10_bm25_default,map_bm25_default,recall_25_bm25_default,ndcg_bm25_default
0,PLAIN-1008,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
1,PLAIN-1018,0.6,0.538758,0.195075,0.133333,0.6,0.538758,0.195075,0.133333,0.0,...,0.000000,0.000000,0.6,0.538758,0.133333,0.195075,0.6,0.538758,0.133333,0.195075
2,PLAIN-102,0.1,0.200000,0.081017,0.041667,0.1,0.200000,0.081017,0.041667,0.0,...,0.033834,0.000000,0.1,0.034496,0.041667,0.194871,0.1,0.033877,0.041667,0.193503
3,PLAIN-1028,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.270238,1.000000,0.0,0.002364,0.000000,0.114575,0.0,0.002278,0.000000,0.113878
4,PLAIN-1039,0.2,1.000000,1.000000,1.000000,0.2,1.000000,1.000000,1.000000,0.1,...,0.613147,0.500000,0.2,1.000000,1.000000,1.000000,0.2,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,PLAIN-956,0.5,0.499266,0.226991,0.072816,0.5,0.499266,0.226991,0.072816,0.4,...,0.089138,0.048544,0.5,0.315086,0.072816,0.408654,0.6,0.324810,0.067961,0.412933
319,PLAIN-966,0.1,0.100000,0.135652,0.333333,0.1,0.100000,0.135652,0.333333,0.0,...,0.000000,0.000000,0.1,0.100000,0.333333,0.135652,0.1,0.100000,0.333333,0.135652
320,PLAIN-977,0.0,0.071429,0.120116,0.333333,0.0,0.071429,0.120116,0.333333,0.0,...,0.123256,0.333333,0.0,0.071429,0.333333,0.120116,0.0,0.071429,0.333333,0.120116
321,PLAIN-987,0.1,0.333333,0.234639,0.333333,0.1,0.333333,0.234639,0.333333,0.0,...,0.000000,0.000000,0.1,0.333333,0.333333,0.234639,0.1,0.333333,0.333333,0.234639


In [96]:
# mean df rows
df_mean = pd.DataFrame([df['qid'], df.mean(axis=1, numeric_only = True)])
df_mean = df_mean.T
df_mean.columns = ['qid', 'mean']
df_mean = df_mean.sort_values(ascending = True, by = 'mean')
df_mean.head()


Unnamed: 0,qid,mean
0,PLAIN-1008,0.0
137,PLAIN-2321,0.0
136,PLAIN-2311,0.0
132,PLAIN-2281,0.0
131,PLAIN-2271,0.0


In [97]:
df_mean = df_mean.sort_values(ascending = False, by = 'mean')
df_mean.head()

Unnamed: 0,qid,mean
71,PLAIN-1710,0.909126
271,PLAIN-488,0.847294
295,PLAIN-721,0.835698
307,PLAIN-838,0.7967
300,PLAIN-771,0.783252
