## Import Libraries

In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Data Processing

In [2]:
class DataFrameCreator:
    
    def __init__(self):
        self.qrel_docs = {}
        self.query_doc = {}
        self.df = pd.DataFrame()

        self.score_files = {
            "es": "scores/results_ES.txt",
            "bm": "scores/results_BM25.txt",
            "okapi": "scores/results_OKAPI.txt",
            "tfidf": "scores/results_TFIDF.txt",
            "lml": "scores/results_LMLAPLACE.txt",
            "lmjm": "scores/results_LMJM.txt"
        }

        self.read_qrel()
        self.read_score_files()
        self.get_final_query_doc()
        self.get_data_frame()

    def read_qrel(self):
        with open("qrels.adhoc.51-100.AP89.txt", "r") as f:
            for line in f:
                query_id, _, doc_id, rel = line.split()
                self.qrel_docs.setdefault(query_id, {})[doc_id] = rel

    def read_score_files(self):
        for method, file_path in self.score_files.items():
            scores = {}
            with open(file_path, "r") as f:
                for line in f:
                    query_id, _, doc_id, _, score, _ = line.split()
                    scores.setdefault(query_id, {})[doc_id] = score
            setattr(self, f"{method}_score", scores)

    def get_final_query_doc(self):
        for method in self.score_files.keys():
            for query_id, scores in getattr(self, f"{method}_score").items():
                self.query_doc.setdefault(query_id, set()).update(scores.keys())

    def get_data_frame(self):
        data = []
        for query_id, docs in self.query_doc.items():
            for doc_id in docs:
                row = {
                    "query": query_id,
                    "doc": doc_id,
                    **{method: getattr(self, f"{method}_score")[query_id].get(doc_id, 0) for method in self.score_files.keys()},
                    "label": self.qrel_docs.get(query_id, {}).get(doc_id, 0)
                }
                data.append(row)
        self.df = pd.DataFrame(data)

dfc = DataFrameCreator()
df = dfc.df
df['label'] = df['label'].astype(int)
df.head()

Unnamed: 0,query,doc,es,bm,okapi,tfidf,lml,lmjm,label
0,85,AP891027-0016,1.2539542,1.2306580370117726,0.4129732895994717,0.4280437005860374,-310.70553398005205,-305.2101702492856,0
1,85,AP890216-0271,0.0,0.0,0.0,0.0,-400.0,-400.0,0
2,85,AP890707-0181,0.0,0.0,0.0,0.0,-400.0,-400.0,0
3,85,AP891123-0121,0.0,0.0,0.0,0.0,-400.0,-400.0,0
4,85,AP890502-0155,2.5423493,2.499189552514198,0.592028597572928,0.8161166838548195,-222.22092170174855,-211.3620193411388,0


In [7]:
# randomly split into 20 and 5
queries = df['query'].unique()
np.random.seed(42)

np.random.shuffle(queries)
train_queries = queries[:20]
test_queries = queries[20:]



print(f"Train queries: {train_queries}")
print(f"Test queries: {test_queries}")

train_df = df[df['query'].isin(train_queries)]
test_df = df[df['query'].isin(test_queries)]

train_features = train_df.drop(columns=['query', 'doc', 'label'])
train_labels = train_df['label']

test_features = test_df.drop(columns=['query', 'doc', 'label'])
test_labels = test_df['label']

Train queries: ['58' '95' '85' '63' '87' '77' '100' '59' '80' '62' '56' '94' '61' '71'
 '64' '98' '68' '60' '57' '91']
Test queries: ['99' '54' '89' '97' '93']


In [8]:
def write_score(file_path, df):
    with open(file_path, "w") as f:
        for idx, row in df.iterrows():
            line = "{} Q0 {} 1 {} Exp\n".format(row["query"], row["doc"], row["predicted_label"])
            f.write(line)

## Model Training

In [9]:
# Train a classification and get ranked output based on probability
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

lr = LogisticRegression()
lr.fit(train_features, train_labels)

train_df['predicted_label'] = lr.predict(train_features)
test_df['predicted_label'] = lr.predict(test_features)

# Print classification report
from sklearn.metrics import classification_report
print("Train Classification Report")
print(classification_report(train_labels, train_df['predicted_label']))

print("Test Classification Report")
print(classification_report(test_labels, test_df['predicted_label']))

# Sort based on probability
train_df['predicted_label'] = lr.predict_proba(train_features)[:,1]
test_df['predicted_label'] = lr.predict_proba(test_features)[:,1]

train_df = train_df.sort_values(by=['query', 'predicted_label'], ascending=[True, False])
test_df = test_df.sort_values(by=['query', 'predicted_label'], ascending=[True, False])

# Write the output to file
write_score("outputs/train_scores_lr.txt", train_df)
write_score("outputs/test_scores_lr.txt", test_df)

Train Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1688274
           1       0.50      0.12      0.19      1566

    accuracy                           1.00   1689840
   macro avg       0.75      0.56      0.60   1689840
weighted avg       1.00      1.00      1.00   1689840

Test Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    422195
           1       0.45      0.34      0.39       265

    accuracy                           1.00    422460
   macro avg       0.73      0.67      0.69    422460
weighted avg       1.00      1.00      1.00    422460



In [13]:
!perl trec_eval.pl qrels.adhoc.51-100.AP89.txt outputs/train_scores_lr.txt
!perl trec_eval.pl qrels.adhoc.51-100.AP89.txt outputs/test_scores_lr.txt

Error due to 20

Queryid (Num):       20
Total number of documents over all queries
    Retrieved:    20000
    Relevant:      1566
    Rel_ret:       1050
Interpolated Recall - Precision Averages:
    at 0.00       0.7063
    at 0.10       0.5176
    at 0.20       0.4616
    at 0.30       0.3450
    at 0.40       0.2953
    at 0.50       0.2331
    at 0.60       0.1928
    at 0.70       0.1703
    at 0.80       0.1306
    at 0.90       0.0542
    at 1.00       0.0228
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.2683
Precision:
  At    5 docs:   0.4400
  At   10 docs:   0.4050
  At   15 docs:   0.3867
  At   20 docs:   0.3650
  At   30 docs:   0.3317
  At  100 docs:   0.2175
  At  200 docs:   0.1592
  At  500 docs:   0.0890
  At 1000 docs:   0.0525
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.2930
Error due to 5

Queryid (Num):        5
Total number of documents over all queries
    Retri

In [11]:
# Train a random forest and get ranked output based on probability
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(train_features, train_labels)

train_df['predicted_label'] = rf.predict(train_features)
test_df['predicted_label'] = rf.predict(test_features)

# Print classification report
print("Train Classification Report")
print(classification_report(train_labels, train_df['predicted_label']))

print("Test Classification Report")
print(classification_report(test_labels, test_df['predicted_label']))

# Sort based on probability
train_df['predicted_label'] = rf.predict_proba(train_features)[:,1]
test_df['predicted_label'] = rf.predict_proba(test_features)[:,1]

train_df = train_df.sort_values(by=['query', 'predicted_label'], ascending=[True, False])
test_df = test_df.sort_values(by=['query', 'predicted_label'], ascending=[True, False])

# Write the output to file
write_score("outputs/train_scores_rf.txt", train_df)
write_score("outputs/test_scores_rf.txt", test_df)


Train Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1688274
           1       0.99      0.81      0.89      1566

    accuracy                           1.00   1689840
   macro avg       1.00      0.90      0.94   1689840
weighted avg       1.00      1.00      1.00   1689840

Test Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    422195
           1       0.38      0.08      0.13       265

    accuracy                           1.00    422460
   macro avg       0.69      0.54      0.57    422460
weighted avg       1.00      1.00      1.00    422460



In [12]:
!perl trec_eval.pl qrels.adhoc.51-100.AP89.txt outputs/train_scores_rf.txt
!perl trec_eval.pl qrels.adhoc.51-100.AP89.txt outputs/test_scores_rf.txt

Error due to 20

Queryid (Num):       20
Total number of documents over all queries
    Retrieved:    20000
    Relevant:      1566
    Rel_ret:         17
Interpolated Recall - Precision Averages:
    at 0.00       0.0020
    at 0.10       0.0000
    at 0.20       0.0000
    at 0.30       0.0000
    at 0.40       0.0000
    at 0.50       0.0000
    at 0.60       0.0000
    at 0.70       0.0000
    at 0.80       0.0000
    at 0.90       0.0000
    at 1.00       0.0000
Average precision (non-interpolated) for all rel docs(averaged over queries)
                  0.0000
Precision:
  At    5 docs:   0.0000
  At   10 docs:   0.0000
  At   15 docs:   0.0000
  At   20 docs:   0.0000
  At   30 docs:   0.0000
  At  100 docs:   0.0005
  At  200 docs:   0.0003
  At  500 docs:   0.0005
  At 1000 docs:   0.0009
R-Precision (precision after R (= num_rel for a query) docs retrieved):
    Exact:        0.0004
Error due to 5

Queryid (Num):        5
Total number of documents over all queries
    Retri