In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import wandb
import logging
from functools import reduce
from statistics import mean

In [2]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [3]:
df = pd.read_csv("jlbasernd5udeltopics.txt", sep="\t", names=["topic", "relevance", "docid", "text", "title", "abstract"], quoting=3, dtype={"docid": object, "text":object, "title":object,"abstract":object}).fillna("")
alltext = df["text"] + " " + df["title"] + " " + df["abstract"]
df["alltext"] = alltext
df = df.groupby("topic").apply(lambda x: x.head(200))
df.index = df.index.droplevel(0)
#df.to_csv("input.txt", sep="\t", header=True, index=False)

#df = pd.read_csv("input.txt", sep="\t", quoting=3).fillna("")

topics_udel = pd.read_csv("topics.covid-round4-udel.tsv", sep="\t", names=["topic", "query_udel"])
topics_queries = pd.read_csv("topics-rnd4-queries.tsv", sep="\t", names=["topic", "query"])
topics_questions = pd.read_csv("topics-rnd4-questions.tsv", sep="\t", names=["topic", "question"])
topics_narrative = pd.read_csv("topics-rnd4-narrative.tsv", sep="\t", names=["topic", "narrative"])

In [4]:
df.shape

(7810, 7)

In [5]:
topics = reduce(lambda x,y: pd.merge(x, y, on="topic"), [topics_udel, topics_queries, topics_questions, topics_narrative])

In [6]:
input_data = pd.merge(topics, df, on="topic", how="outer")
input_data.shape

(7810, 11)

In [7]:
input_lists_title = [list(l) for l in input_data[["question", "title"]].values]
input_lists_text = [list(l) for l in input_data[["question", "text"]].values]
input_lists_abstract = [list(l) for l in input_data[["question", "abstract"]].values]

In [8]:
#[planet for sublist in planets for planet in sublist if len(planet) < 6] 

In [9]:
for i in range(len(input_lists_title)):
    for j in range(len(input_lists_title[i])):
        input_lists_title[i][j] = str(input_lists_title[i][j])
for i in range(len(input_lists_text)):
    for j in range(len(input_lists_text[i])):
        input_lists_text[i][j] = str(input_lists_text[i][j])
for i in range(len(input_lists_abstract)):
    for j in range(len(input_lists_abstract[i])):
        input_lists_abstract[i][j] = str(input_lists_abstract[i][j])

In [10]:
#input_lists_alltext = [list(l) for l in input_data[["question", "alltext"]].values]

In [11]:
model_args = ClassificationArgs()
model_args.eval_batch_size = 16
model_args.dataloader_num_workers = 1
model_args.manual_seed = 4
model_args.dataloader_num_workers = 1
model_args.labels_list = [0, 1]
model_args.max_seq_length = 512
model_args.no_cache = False

In [12]:
#model = ClassificationModel(
#    "roberta",
#    "best_model_distilroberta_cutoff512",
#    args=model_args,
#    cuda_device=1
#)

model = ClassificationModel(
    "bert",
    "best_model_biobert/",
    args=model_args,
    cuda_device=1
)

In [13]:
#predictions_alltext, outputs_alltext = model.predict(input_lists_alltext)

In [14]:
predictions_title, outputs_title = model.predict(input_lists_title)
predictions_text, outputs_text = model.predict(input_lists_text)
predictions_abstract, outputs_abstract = model.predict(input_lists_abstract)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=7810.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=489.0), HTML(value='')))

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.





HBox(children=(FloatProgress(value=0.0, max=7810.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=489.0), HTML(value='')))

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.





HBox(children=(FloatProgress(value=0.0, max=7810.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=489.0), HTML(value='')))




In [15]:
#rel_scores_alltext = [o[1] for o in outputs_alltext]

In [16]:
rel_scores_title = [o[1] for o in outputs_title]
rel_scores_text = [o[1] for o in outputs_text]
rel_scores_abstract = [o[1] for o in outputs_abstract]

In [17]:
#rel_scores = rel_scores_alltext

In [18]:
rel_scores = [sum([x,y,z]) for x,y,z in zip(rel_scores_title, rel_scores_text, rel_scores_abstract)]

In [19]:
result_df = pd.DataFrame({"predicted_label": predictions_title, "rel_score": rel_scores})

In [20]:
#result_df = pd.DataFrame({"predicted_label": predictions_alltext, "rel_score": rel_scores})
#result_df.head(3)

In [21]:
output_data = input_data.copy().reset_index().drop("index", 1)
sorted_output_data = pd.concat([output_data, result_df],  axis=1)
sorted_output_data = sorted_output_data.set_index(["topic", "docid"])
sorted_output_data = sorted_output_data.groupby(["topic", "docid"]).sum()
sorted_output_data = sorted_output_data.groupby("topic").apply(lambda x: x.sort_values("rel_score", ascending=False))
sorted_output_data.index = sorted_output_data.index.droplevel(0)
sorted_output_data["rank"] = sorted_output_data.groupby("topic").cumcount()
sorted_output_data = sorted_output_data.reset_index()

In [22]:
sorted_output_data["Q0"] = 0
trec_eval_df = sorted_output_data[["topic", "Q0", "docid", "rank", "rel_score"]]
trec_eval_df["run"] = "jl5_reranked_biobert"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
trec_eval_df.head()

Unnamed: 0,topic,Q0,docid,rank,rel_score,run
0,1,0,hyjzofps,0,3.511364,jl5_reranked_biobert
1,1,0,8gtnbm1c,1,1.177436,jl5_reranked_biobert
2,1,0,dyhd8p8z,2,1.122417,jl5_reranked_biobert
3,1,0,qi9323yl,3,0.391587,jl5_reranked_biobert
4,1,0,jm18lj5t,4,0.072344,jl5_reranked_biobert


In [24]:
pred_df = trec_eval_df[["topic", "docid", "rel_score"]]
gold_df = input_data[["topic", "docid", "relevance"]]
gold_df["relevance"] = gold_df["relevance"].apply(lambda x: 1 if x>0 else 0)

pred_df.set_index(["topic", "docid"], inplace=True)
gold_df.set_index(["topic", "docid"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
#join = pred_df.join(gold_df)
#join = join[join["relevance"] > 0]
#sklearn.metrics.accuracy_score(join["predicted_label"], join["relevance"])

In [26]:
#join.to_csv("tmp.tsv", sep="\t")

In [27]:
trec_eval_df.to_csv("jl5_reranked_biobert.trec_eval", sep="\t", header=None, index=None)

In [28]:
!~/work/Coding/git/trec-covid/target/lib/trec_eval -c -M1000 -m all_trec ~/work/Coding/git/trec-covid/src/main/resources/gold-standard/qrels-covid_d4_j0.5-4.txt reranked.trec_eval

runid                 	all	reranked
num_q                 	all	45
num_ret               	all	18000
num_rel               	all	15765
num_rel_ret           	all	4891
map                   	all	0.1909
gm_map                	all	0.1549
Rprec                 	all	0.2958
bpref                 	all	0.3054
recip_rank            	all	0.9444
iprec_at_recall_0.00  	all	0.9698
iprec_at_recall_0.10  	all	0.5982
iprec_at_recall_0.20  	all	0.4435
iprec_at_recall_0.30  	all	0.2565
iprec_at_recall_0.40  	all	0.1591
iprec_at_recall_0.50  	all	0.0612
iprec_at_recall_0.60  	all	0.0000
iprec_at_recall_0.70  	all	0.0000
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.8311
P_10                  	all	0.7667
P_15                  	all	0.7096
P_20                  	all	0.6822
P_30                  	all	0.6422
P_100                 	all	0.4987
P_200                 	all	0.3887
P_500                 	all