## A notebook to evaluate our models

In [None]:
# Path to the labelled development set, relative to the notebook's working directory.
dev_data = '../dev_data.csv'

In [None]:
import pandas as pd

# Load the development set. Later cells read its Query, doc_text, label,
# doc_number and Query_number columns.
df = pd.read_csv(dev_data)

In [None]:
# Preview the first rows of the loaded development set.
df.head()

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Base checkpoint; the scores it produces are stored as "score_base" below.
model_checkpoint = "cross-encoder/stsb-TinyBERT-L-4"
model = CrossEncoder(model_checkpoint)

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login.
# NOTE(review): the base checkpoint above is loaded before this call, so the
# login is presumably only needed for checkpoints loaded later — confirm.
notebook_login()

### Base evaluation (only pretrained model)

In [None]:
# Score each (query, full document text) pair with the base checkpoint.
base_pairs = df[['Query', 'doc_text']].values.tolist()
scores_base = model.predict(base_pairs, show_progress_bar=True)

In [None]:
# Keep the base-model scores next to the labels for the AUC computation below.
df["score_base"] = scores_base

In [None]:
# Check that the score_base column was added.
df.head()

In [None]:
from sklearn import metrics

# ROC AUC of the base checkpoint's scores against the dev labels.
auc_base = metrics.roc_auc_score(y_true=df['label'], y_score=df["score_base"])
print("results on base pretrained model: " , auc_base)

In [None]:
# Load the fine-tuned checkpoint here: at this point in a top-to-bottom run
# `model` is still the base checkpoint, so predicting with it would only
# reproduce the base scores under the "tuned" label.
model_tuned = CrossEncoder("../stsb-TinyBERT-L-4-finetuned_auc_151221-5-001")

# Score each (query, full document text) pair with the fine-tuned checkpoint.
scores_tuned = model_tuned.predict(df[['Query', 'doc_text']].values.tolist(), show_progress_bar=True)
df["score_tuned"] = scores_tuned

print("results on base pretrained model + tune: " , 
      metrics.roc_auc_score(y_true=df['label'], y_score=df["score_tuned"]))

### Chunked evaluation: split long documents into overlapping chunks

In [None]:
# Split long document texts into chunks of up to 479 words, overlapping by 20 words
def get_split480(text1, chunk_size=479, overlap=20):
    """Split ``text1`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunks are
    produced until the whole text is covered; the tail of the document is
    never dropped.

    Args:
        text1: The document text to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The list
        always has at least one element; an empty text yields ``[""]`` so that
        every document keeps one row after chunking.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than ``chunk_size``.
    """
    if overlap < 0 or chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap, and overlap must be >= 0")
    words = text1.split()
    step = chunk_size - overlap
    # A further chunk is needed only while the previous one (which ends
    # `overlap` words past the next start) has not reached the end of the text.
    stop = max(len(words) - overlap, 1)
    return [" ".join(words[start:start + chunk_size]) for start in range(0, stop, step)]

# Split the document text
df['text_split1'] = df['doc_text'].apply(get_split480)
#df_all_docs.head()

# Parallel lists, one entry per (row, chunk); they become the columns of df_chunked.
docs_l = []
label_l = []
index_l =[]
query_l = []
query_n = []
doc_n = []
# score_b / score_t are never filled or read in this notebook.
score_b = []
score_t = []
for idx,row in df.iterrows():
    for l in row['text_split1']:
        # Every chunk inherits the label, query and ids of the row it came from.
        docs_l.append(l)
        label_l.append(row['label'])
        query_l.append(row['Query'])
        doc_n.append(row['doc_number'])
        query_n.append(row['Query_number'])
        
        index_l.append(idx)
# Not the last expression of the cell, so this tuple is evaluated but not displayed.
len(docs_l), len(label_l), len(index_l)

# Create a new dataframe with the splitted documents
# Note: the source column Query_number is stored here as lowercase "query_number".
df_chunked = pd.DataFrame({"doc_text":docs_l, 'label':label_l, 'Query': query_l, 
                       "doc_number": doc_n, "query_number": query_n})


In [None]:
# (rows, columns) of the chunk-level frame: one row per (query, document chunk).
df_chunked.shape

In [None]:
# (rows, columns) of the document-level frame, for comparison with df_chunked.
df.shape

In [None]:
# Fine-tuned checkpoint addressed by a relative path.
# Note: this rebinds `model`, so the base checkpoint is no longer reachable under that name.
model_path = "../stsb-TinyBERT-L-4-finetuned_auc_151221-5-001"
model =  CrossEncoder(model_path)

In [None]:
# Score every (query, chunk) pair with the checkpoint currently bound to `model`
# and report the chunk-level ROC AUC.
chunk_pairs = df_chunked[['Query', 'doc_text']].values.tolist()
scores_tuned = model.predict(chunk_pairs, show_progress_bar=True)
df_chunked["score_tuned"] = scores_tuned

auc_tuned_chunks = metrics.roc_auc_score(y_true=df_chunked['label'], y_score=df_chunked["score_tuned"])
print("results on base pretrained model + tune: " , auc_tuned_chunks)

In [None]:
# "top3" fine-tuned checkpoint, kept in its own variable (model3).
# NOTE(review): this path has no "../" prefix, unlike the same checkpoint name
# loaded in the Final Evaluation section — confirm which location is intended.
model_path = "stsb-TinyBERT-L-4-finetuned_auc_151221-top3"
model3 =  CrossEncoder(model_path)

In [None]:
# Score every (query, chunk) pair with the top3 checkpoint.
scores_top3 = model3.predict(df_chunked[['Query', 'doc_text']].values.tolist(), show_progress_bar=True)


In [None]:
# Attach the top3 scores and report the chunk-level ROC AUC.
df_chunked["score_top3"] = scores_top3

auc_top3_chunks = metrics.roc_auc_score(y_true=df_chunked['label'], y_score=df_chunked["score_top3"])
print("results on top 3 model: " , auc_top3_chunks)

In [None]:
# "top1" checkpoint, kept in its own variable (model1).
# NOTE(review): path has no "../" prefix, unlike the other local checkpoints — confirm location.
model_path = "stsb-TinyBERT-top1"
model1 =  CrossEncoder(model_path)

In [None]:
# Score every (query, chunk) pair with the top1 checkpoint.
scores_top1 = model1.predict(df_chunked[['Query', 'doc_text']].values.tolist(), show_progress_bar=True)

In [None]:
# Attach the top1 scores and report the chunk-level ROC AUC.
df_chunked["score_top1"] = scores_top1

auc_top1_chunks = metrics.roc_auc_score(y_true=df_chunked['label'], y_score=df_chunked["score_top1"])
print("results on top 1 model: " , auc_top1_chunks)

In [None]:
# Preview the chunk-level frame with the score_tuned, score_top3 and score_top1 columns.
df_chunked.head()

In [None]:
# All chunks of one specific (query, document) pair.
# df_chunked stores the query id under the lowercase name "query_number"
# (it is only called "Query_number" in the source frame df).
df_chunked[(df_chunked['query_number'] == 1089071) & (df_chunked['doc_number'] == 29215)]

In [None]:
# Document-level score = best (max) chunk score per (query, document, label),
# computed for each of the three checkpoints. The score columns are selected
# explicitly: GroupBy.max() takes no column list (its first parameter is
# `numeric_only`), so passing the list to max() only worked by accident.
score_columns = ["score_top3", "score_top1", "score_tuned"]
grouped = (
    df_chunked
    .groupby(by=["query_number", "doc_number", "label"])[score_columns]
    .max()
    .reset_index()
)

In [None]:
# Inspect the aggregated frame: one row per (query_number, doc_number, label).
grouped

In [None]:
# ROC AUC per checkpoint after max-aggregating chunk scores per (query, document).
for caption, score_column in (
    ("results on top 1 model: ", "score_top1"),
    ("results on top 3 model: ", "score_top3"),
    ("results on chunked: ", "score_tuned"),
):
    print(caption , metrics.roc_auc_score(y_true=grouped['label'], y_score=grouped[score_column]))

In [None]:
# ROC AUC per checkpoint on the unchunked documents.
# The top-1 / top-3 scores were only ever stored on df_chunked, so df has no
# such columns at this point; score the full (query, document) pairs here so
# the cell works on a fresh top-to-bottom run.
full_pairs = df[['Query', 'doc_text']].values.tolist()
for score_column, scorer in (("score_top1", model1), ("score_top3", model3)):
    if score_column not in df.columns:
        df[score_column] = scorer.predict(full_pairs, show_progress_bar=True)

print("results on top 1 model: " , 
      metrics.roc_auc_score(y_true=df['label'], y_score=df["score_top1"]))
print("results on top 3 model: " , 
      metrics.roc_auc_score(y_true=df['label'], y_score=df["score_top3"]))
print("results on chunked: " , 
      metrics.roc_auc_score(y_true=df['label'], y_score=df["score_tuned"]))

In [None]:
# ROC AUC per checkpoint at chunk level (no aggregation).
for caption, score_column in (
    ("results on top 1 model: ", "score_top1"),
    ("results on top 3 model: ", "score_top3"),
    ("results on chunked: ", "score_tuned"),
):
    print(caption , metrics.roc_auc_score(y_true=df_chunked['label'], y_score=df_chunked[score_column]))

In [None]:
# Distribution of aggregated top3 scores for pairs labelled 1.
grouped.loc[grouped['label'] == 1, 'score_top3'].hist()

In [None]:
# Distribution of aggregated top3 scores for pairs labelled 0.
grouped.loc[grouped['label'] == 0, 'score_top3'].hist()

## Final Evaluation

In [None]:
# Checkpoint used for the final evaluation and for scoring the test set below.
# Note: rebinds both `model_checkpoint` and `model`.
model_checkpoint = "../stsb-TinyBERT-L-4-finetuned_auc_151221-top3"
model = CrossEncoder(model_checkpoint)

In [None]:
# Score every (query, chunk) pair with the final checkpoint.
final_pairs = df_chunked[['Query', 'doc_text']].values.tolist()
scores = model.predict(final_pairs, show_progress_bar=True)
df_chunked["score"] = scores


In [None]:
from sklearn import metrics

# Chunk-level ROC AUC of the final checkpoint, before any per-document aggregation.
auc_final_chunks = metrics.roc_auc_score(y_true=df_chunked['label'], y_score=df_chunked["score"])
print("results withour aggregation: " , auc_final_chunks)

In [None]:
# Preview the chunk-level frame with the new "score" column.
df_chunked.head()

In [None]:
# Document-level score = best (max) chunk score per (query, document, label).
# Note: rebinds `grouped`, which from here on carries only the "score" column.
best_chunk_score = df_chunked.groupby(by=["query_number", "doc_number", "label"])["score"].max()
grouped = best_chunk_score.reset_index()

In [None]:
# Mean aggregated score over all (query, document) pairs.
grouped.score.mean()

In [None]:
# Highest aggregated score.
grouped.score.max()

In [None]:
# Lowest aggregated score.
grouped.score.min()

In [None]:
import numpy as np 

# Binarise the aggregated score: strictly above the threshold -> 1, otherwise 0.
score_threshold = 0.64
is_predicted_positive = grouped['score'] > score_threshold
grouped['pred_label'] = np.where(is_predicted_positive, 1, 0)

In [None]:
# Preview the pair-level frame with the pred_label column.
grouped.head()

In [None]:
# Fraction of pairs predicted positive (mean of a 0/1 column).
grouped['pred_label'].mean()

In [None]:
# Ground truth and thresholded predictions at (query, document) level.
y_test = grouped.label
prediction = grouped['pred_label'] 

In [None]:


from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

# Scalar metrics first, then the multi-line reports.
for caption, scorer in (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
):
    print(caption, scorer(y_test, prediction))
print('\n clasification report:\n', classification_report(y_test,prediction))
print('\n confussion matrix:\n',confusion_matrix(y_test, prediction))


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bin the aggregated scores into 20 equal-width bins on [0, 1] and build
# running totals, once for label-1 pairs only and once for all pairs.
base = np.linspace(0, 1, 21)
bin_ids = range(1, len(base))

data_ones = grouped[grouped['label'] == 1]['score']
digitized_ones = np.digitize(data_ones, base)
values_ones = [int(np.count_nonzero(digitized_ones == i)) for i in bin_ids]

data_all = grouped['score']
digitized_all = np.digitize(data_all, base)
values_all = [int(np.count_nonzero(digitized_all == i)) for i in bin_ids]

cumulative_all = np.cumsum(values_all)
cumulative_ones = np.cumsum(values_ones)


In [None]:
# Overlaid histograms of the aggregated scores for label-1 and label-0 pairs
# (50 bins on [0, 1]).
fig, ax1 = plt.subplots(figsize=(10,5))

ax1.set_xlabel('Score')
ax1.set_ylabel('Number of records')

p1 = ax1.hist([grouped[grouped['label'] == 1]['score'], grouped[grouped['label'] == 0]['score']], bins=50, alpha=0.5, range=(0,1), label=['1', '0'])
ax1.tick_params(axis='y')

fig.legend(loc="upper right")
fig.tight_layout()
plt.show()


### Testing data

In [None]:
# Load the test set; the cells below read its Query, doc_text, doc_number and
# Query_number columns (no label column is used for it).
test_df = pd.read_csv("../test_data.csv")

In [None]:
# Preview the first rows of the test set.
test_df.head()

In [None]:


# Split every test document into overlapping word chunks
test_df['text_split1'] = test_df['doc_text'].apply(get_split480)

# One row per (query, document chunk): explode the per-document chunk lists,
# carrying the query text and ids along, and use the same column names as the
# dev-set chunk frame (doc_text, Query, doc_number, query_number).
test_df_chunked = (
    test_df[['text_split1', 'Query', 'doc_number', 'Query_number']]
    .explode('text_split1')
    .rename(columns={'text_split1': 'doc_text', 'Query_number': 'query_number'})
    .reset_index(drop=True)
)


In [None]:
# Inspect the chunk-level test frame.
test_df_chunked

In [None]:
# Score every test (query, chunk) pair with the checkpoint currently bound to `model`.
test_pairs = test_df_chunked[['Query', 'doc_text']].values.tolist()
scores_test = model.predict(test_pairs, show_progress_bar=True)
test_df_chunked["score"] = scores_test

In [None]:
# Preview the chunk-level test frame with the new "score" column.
test_df_chunked.head()

In [None]:
# Document-level test score = best (max) chunk score per (query, document).
best_test_chunk_score = test_df_chunked.groupby(by=["query_number", "doc_number"])["score"].max()
grouped_test = best_test_chunk_score.reset_index()

In [None]:
# Binarise the test scores with the same 0.64 threshold applied to the dev set above.
grouped_test['pred_label'] = np.where(grouped_test['score'] > 0.64, 1, 0)

In [None]:
# Rank the results: both keys descending, so within each query the
# best-scoring documents come first.
grouped_test = grouped_test.sort_values(['query_number', "score"], ascending=False)

In [None]:
# Write the ranked test predictions to the working directory.
# NOTE(review): the DataFrame index is written as an extra first column
# (to_csv default) — confirm whether consumers of this file expect it.
grouped_test.to_csv('neural model rank.csv')

## Check another model

In [None]:
# Alternative checkpoint to compare against the final one.
# Note: rebinds `model1`, which earlier in the notebook held the "top1" checkpoint.
model_checkpoint = "../stsb-TinyBERT-top3-70000/content/stsb-TinyBERT-L-4-finetuned_auc_161221-top3"
model1 = CrossEncoder(model_checkpoint)

In [None]:
# Score every dev (query, chunk) pair with the alternative checkpoint.
alt_pairs = df_chunked[['Query', 'doc_text']].values.tolist()
scores1 = model1.predict(alt_pairs, show_progress_bar=True)
df_chunked["score1"] = scores1


In [None]:
from sklearn import metrics

# Chunk-level ROC AUC of the alternative checkpoint, before aggregation.
auc_alt_chunks = metrics.roc_auc_score(y_true=df_chunked['label'], y_score=df_chunked["score1"])
print("results withour aggregation: " , auc_alt_chunks)

In [None]:
# Preview the chunk-level frame with the new "score1" column.
df_chunked.head()

In [None]:
# Document-level scores for the previous ("score") and the alternative
# ("score1") checkpoint: best chunk score per (query, document, label).
# Several columns must be selected with a list; selecting them with a bare
# tuple (`["score", "score1"]` without the inner brackets) is not supported
# by pandas 2.x.
grouped = (
    df_chunked
    .groupby(by=["query_number", "doc_number", "label"])[["score", "score1"]]
    .max()
    .reset_index()
)

In [None]:
# Mean aggregated score: previous vs. alternative checkpoint.
print("prev: ", grouped["score"].mean())
print("new: ", grouped["score1"].mean())

In [None]:
# Highest aggregated score: previous vs. alternative checkpoint.
print("prev: ", grouped["score"].max())
print("new: ", grouped["score1"].max())

In [None]:
# Lowest aggregated score: previous vs. alternative checkpoint.
print("prev: ", grouped["score"].min())
print("new: ", grouped["score1"].min())

In [None]:
# Document-level ROC AUC of the alternative checkpoint.
metrics.roc_auc_score(y_true=grouped['label'], y_score=grouped["score1"])

In [None]:
# Document-level ROC AUC of the previous checkpoint, for comparison.
metrics.roc_auc_score(y_true=grouped['label'], y_score=grouped["score"])

In [None]:
import numpy as np 

# Binarise the alternative checkpoint's score: strictly above the threshold -> 1, otherwise 0.
score1_threshold = 0.63
is_predicted_positive1 = grouped['score1'] > score1_threshold
grouped['pred_label1'] = np.where(is_predicted_positive1, 1, 0)

In [None]:
# Preview the pair-level frame with the pred_label1 column.
grouped.head()

In [None]:
# Fraction of pairs the alternative checkpoint predicts positive (mean of a 0/1 column).
grouped['pred_label1'].mean()

In [None]:
# Ground truth and the alternative checkpoint's thresholded predictions.
y_test = grouped.label
prediction1 = grouped['pred_label1'] 

In [None]:


from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

# Scalar metrics first, then the multi-line reports, for the alternative checkpoint.
for caption, scorer in (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
):
    print(caption, scorer(y_test, prediction1))
print('\n clasification report:\n', classification_report(y_test,prediction1))
print('\n confussion matrix:\n',confusion_matrix(y_test, prediction1))


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bin the alternative checkpoint's aggregated scores into 20 equal-width bins
# on [0, 1] and build running totals, for label-1 pairs and for all pairs.
base = np.linspace(0, 1, 21)
bin_ids = range(1, len(base))

data_ones = grouped[grouped['label'] == 1]['score1']
digitized_ones = np.digitize(data_ones, base)
values_ones = [int(np.count_nonzero(digitized_ones == i)) for i in bin_ids]

data_all = grouped['score1']
digitized_all = np.digitize(data_all, base)
values_all = [int(np.count_nonzero(digitized_all == i)) for i in bin_ids]

cumulative_all = np.cumsum(values_all)
cumulative_ones = np.cumsum(values_ones)


In [None]:
# Histogram of the alternative checkpoint's aggregated scores per label, with
# cumulative curves on a second y-axis. Uses cumulative_ones / cumulative_all
# as computed in the preceding cell (20 bins on [0, 1]).
base = np.linspace(0, 1, 21)

fig, ax1 = plt.subplots(figsize=(15,8))

ax1.set_xlabel('Score')
ax1.set_ylabel('Number of values')

p1 = ax1.hist([grouped[grouped['label'] == 1]['score1'], grouped[grouped['label'] == 0]['score1']], bins=20, alpha=0.5, range=(0,1), label=['1', '0'])
ax1.tick_params(axis='y')
ax2 = ax1.twinx()  # second y-axis that shares the same x-axis
ax2.set_ylabel('%')

# Bin centres: the edges in `base` are 0.05 apart.
x0 = base[:-1]+0.025
# Share of label-1 pairs with a score up to and including each bin.
y1 = cumulative_ones / cumulative_ones[-1]
# Share of all pairs with a score above each bin.
y2 = (len(grouped['score1'])-cumulative_all) / cumulative_all[-1]

p2 = ax2.plot(x0, y1, c='blue',marker="o", alpha=0.5, label = 'cumm for ones')
p3 = ax2.plot(x0, y2, c='green',marker="o", alpha=0.5, label = 'cumm for all')

# Annotate every point of both curves with its percentage.
for curve in (y1, y2):
    for x, y in zip(x0, curve):
        ax2.text(x, y, str(round(y * 100)) + "%")

fig.legend(loc="upper right")
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()
