In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import wandb
import logging
from functools import reduce
from statistics import mean

In [2]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [3]:
df = pd.read_csv("jlbasernd5udeltopics.txt", sep="\t", names=["topic", "relevance", "docid", "text", "title", "abstract"], quoting=3).fillna("")
alltext = df["text"] + " " + df["title"] + " " + df["abstract"]
df["alltext"] = alltext
df = df.groupby("topic").apply(lambda x: x.head(200))
df.index = df.index.droplevel(0)
df.to_csv("input.txt", sep="\t", header=True, index=False)

#df = pd.read_csv("input.txt", sep="\t", quoting=3).fillna("")

topics_udel = pd.read_csv("topics.covid-round4-udel.tsv", sep="\t", names=["topic", "query_udel"])
topics_queries = pd.read_csv("topics-rnd4-queries.tsv", sep="\t", names=["topic", "query"])
topics_questions = pd.read_csv("topics-rnd4-questions.tsv", sep="\t", names=["topic", "question"])
topics_narrative = pd.read_csv("topics-rnd4-narrative.tsv", sep="\t", names=["topic", "narrative"])

In [4]:
df.shape

(7810, 7)

In [5]:
topics = reduce(lambda x,y: pd.merge(x, y, on="topic"), [topics_udel, topics_queries, topics_questions, topics_narrative])

In [6]:
input_data = pd.merge(topics, df, on="topic", how="outer")
input_data.shape

(7810, 11)

In [7]:
input_lists_title = [list(l) for l in input_data[["question", "title"]].values]
input_lists_text = [list(l) for l in input_data[["question", "text"]].values]
input_lists_abstract = [list(l) for l in input_data[["question", "abstract"]].values]

In [8]:
#input_lists_alltext = [list(l) for l in input_data[["question", "alltext"]].values]

In [9]:
model_args = ClassificationArgs()
model_args.eval_batch_size = 16
model_args.dataloader_num_workers = 1
model_args.manual_seed = 4
model_args.dataloader_num_workers = 1
model_args.labels_list = [0, 1]
model_args.max_seq_length = 512
model_args.no_cache = False

In [10]:
model = ClassificationModel(
    "bert",
    "best_model_scibert_cased/",
    args=model_args,
    cuda_device=0
)


In [11]:
#predictions_alltext, outputs_alltext = model.predict(input_lists_alltext)

In [12]:
predictions_title, outputs_title = model.predict(input_lists_title)
predictions_text, outputs_text = model.predict(input_lists_text)
predictions_abstract, outputs_abstract = model.predict(input_lists_abstract)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=7810.0), HTML(value='')))




AttributeError: 'float' object has no attribute 'strip'

In [None]:
#rel_scores_alltext = [o[1] for o in outputs_alltext]

In [None]:
rel_scores_title = [o[1] for o in outputs_title]
rel_scores_text = [o[1] for o in outputs_text]
rel_scores_abstract = [o[1] for o in outputs_abstract]

In [None]:
#rel_scores = rel_scores_alltext

In [None]:
rel_scores = [sum([x,y,z]) for x,y,z in zip(rel_scores_title, rel_scores_text, rel_scores_abstract)]

In [None]:
result_df = pd.DataFrame({"predicted_label": predictions_title, "rel_score": rel_scores})

In [None]:
#result_df = pd.DataFrame({"predicted_label": predictions_alltext, "rel_score": rel_scores})
#result_df.head(3)

In [None]:
output_data = input_data.copy().reset_index().drop("index", 1)
sorted_output_data = pd.concat([output_data, result_df],  axis=1)
sorted_output_data = sorted_output_data.set_index(["topic", "docid"])
sorted_output_data = sorted_output_data.groupby(["topic", "docid"]).sum()
sorted_output_data = sorted_output_data.groupby("topic").apply(lambda x: x.sort_values("rel_score", ascending=False))
sorted_output_data.index = sorted_output_data.index.droplevel(0)
sorted_output_data["rank"] = sorted_output_data.groupby("topic").cumcount()
sorted_output_data = sorted_output_data.reset_index()

In [None]:
sorted_output_data["Q0"] = 0
trec_eval_df = sorted_output_data[["topic", "Q0", "docid", "rank", "rel_score"]]
trec_eval_df["run"] = "jl5_reranked_scibert"

In [None]:
trec_eval_df.head()

In [None]:
pred_df = trec_eval_df[["topic", "docid", "rel_score"]]
gold_df = input_data[["topic", "docid", "relevance"]]
gold_df["relevance"] = gold_df["relevance"].apply(lambda x: 1 if x>0 else 0)

pred_df.set_index(["topic", "docid"], inplace=True)
gold_df.set_index(["topic", "docid"], inplace=True)

In [None]:
#join = pred_df.join(gold_df)
#join = join[join["relevance"] > 0]
#sklearn.metrics.accuracy_score(join["predicted_label"], join["relevance"])

In [None]:
#join.to_csv("tmp.tsv", sep="\t")

In [None]:
trec_eval_df.to_csv("jl5_reranked_scibert.trec_eval", sep="\t", header=None, index=None)

In [None]:
!~/work/Coding/git/trec-covid/target/lib/trec_eval -c -M1000 -m all_trec ~/work/Coding/git/trec-covid/src/main/resources/gold-standard/qrels-covid_d4_j0.5-4.txt reranked.trec_eval