In [None]:
import pandas as pd
import pyterrier as pt
# Initialise PyTerrier only if it has not been started yet, then read the
# qrels, credibility and readability assessment files with its qrels reader.
# Original note on the inputs: the files are header-less with columns
# qid, sep, uuid, score (NOTE(review): confirm against the dataset).
if not pt.started():
    pt.init()
qrels, qcred, qread = map(  # type: ignore
    pt.io.read_qrels,
    (
        "../dataset/assessments/qrels.txt",
        "../dataset/assessments/qcredibility.txt",
        "../dataset/assessments/qreadability.txt",
    ),
)


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path):
    """Load topics from an XML file into a ``qid``/``query`` DataFrame.

    Every ``<topic>`` element directly under the document root contributes
    one row built from its ``<id>`` and ``<query>`` children. Topics whose id
    or query is missing or blank are skipped; if an id occurs more than once,
    the last occurrence wins.

    Queries are normalised by lower-casing, replacing every run of non-word
    characters with a single space, and trimming surrounding whitespace.

    Args:
        path: Path (or open file object) of the topics XML document.

    Returns:
        pandas.DataFrame with string columns ``qid`` and ``query``.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the document is not valid XML.
    """
    # Let ElementTree read the file itself so the encoding declared in the XML
    # prolog is honoured instead of the platform's default text encoding.
    root = ET.parse(path).getroot()

    topic_dict = {}
    for topic in root.findall("topic"):
        # Strip before the emptiness check so whitespace-only values are
        # rejected and ids do not carry stray padding.
        topic_id = (topic.findtext("id") or "").strip()
        topic_query = (topic.findtext("query") or "").strip()
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query.lower()

    topics = pd.DataFrame(list(topic_dict.items()), columns=["qid", "query"])
    # Trim again after the substitution: leading/trailing punctuation would
    # otherwise be turned into leading/trailing spaces.
    topics["query"] = (
        topics["query"].str.replace(r"\W+", " ", regex=True).str.strip()
    )
    return topics

In [None]:
# Load the topics file into a DataFrame of (qid, query) rows.
queries = load_topics("../dataset/topics/topics.txt")

In [None]:
# Load the passage collection from a tab-separated file. Later cells read its
# "docid" and "text" columns.
all_passages = pd.read_csv("../dataset/Webdoc/data/txt_min_length_50.tsv", sep="\t")

In [None]:
import textstat
# Readability scorer used to rank documents by how readable their text is.
def readability_score(text):
    """Return textstat's Flesch Reading Ease score for ``text``."""
    return textstat.flesch_reading_ease(text)

def text_standard(text):
    """Return textstat's ``text_standard`` estimate for ``text``, requested as a float."""
    return textstat.text_standard(text, float_output=True)

In [None]:
# Reference text (a scientific abstract) used to sanity-check the scorers.
sample = "Cytokine dysregulation is a central driver of chronic inflammatory diseases such as multiple sclerosis (MS). Here, we sought to determine the characteristic cellular and cytokine polarization profile in patients with relapsing–remitting multiple sclerosis (RRMS) by high-dimensional single-cell mass cytometry (CyTOF). Using a combination of neural network-based representation learning algorithms, we identified an expanded T helper cell subset in patients with MS, characterized by the expression of granulocyte–macrophage colony-stimulating factor and the C-X-C chemokine receptor type 4. This cellular signature, which includes expression of very late antigen 4 in peripheral blood, was also enriched in the central nervous system of patients with relapsing–remitting multiple sclerosis. In independent validation cohorts, we confirmed that this cell population is increased in patients with MS compared with other inflammatory and non-inflammatory conditions. Lastly, we also found the population to be reduced under effective disease-modifying therapy, suggesting that the identified T cell profile represents a specific therapeutic target in MS."
print(readability_score(sample))
print(text_standard(sample))

# Compare against one passage from the collection, looked up once by docid.
docid = "00bd4ea6-90d6-40ac-93ab-d0be20d6c8e5"
passage_text = all_passages.loc[all_passages["docid"] == docid, "text"].values[0]
print(readability_score(passage_text))
print(text_standard(passage_text))
print(passage_text)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint that provides both the tokenizer and the
# sequence-classification model used for the transformer readability score.
READABILITY_MODEL = "Tymoteusz/distilbert-base-uncased-kaggle-readability"

tokenizer = AutoTokenizer.from_pretrained(READABILITY_MODEL, truncation=True)
model = AutoModelForSequenceClassification.from_pretrained(READABILITY_MODEL)

In [None]:
# Transformer-based readability score for a single text.
def readability_score_transformer(text):
    """Score ``text`` with the module-level readability model.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model``.

    Args:
        text: The text to score.

    Returns:
        float: The first logit of the first (only) sequence in the batch.
    """
    # Local import: the notebook's top-level ``import torch`` lives in a later
    # cell, so it is not guaranteed to have run when this function is called.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping so no gradient graph is built
    # (and kept in memory) for every scored passage.
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits[0][0].item()

In [None]:
# Sanity check: score the sample text with the transformer model.
readability_score_transformer(sample)

In [None]:
# Score every passage with the transformer model. Iterating the "text" column
# directly avoids iterrows(), which builds a full Series for each row just to
# read a single field.
read_scores_transformer = [
    readability_score_transformer(passage_text)
    for passage_text in all_passages["text"]
]
all_passages["readability_score_transformer"] = read_scores_transformer

all_passages.head()

In [None]:
# Add one column per textstat-based metric, each computed from the passage text.
for column_name, scorer in (
    ("flesch_reading_ease", readability_score),
    ("text_standard", text_standard),
):
    all_passages[column_name] = all_passages["text"].apply(scorer)


In [None]:
# Write docid plus the three readability scores to a TSV file (no index column).
readability_columns = ['docid', 'readability_score_transformer', 'flesch_reading_ease', 'text_standard']
all_passages[readability_columns].to_csv('all_passages_readability_scores.tsv', sep='\t', index=False)

In [None]:
from transformers import pipeline
import torch
# Hugging Face checkpoint used as the credibility classifier.
MODEL = "jy46604790/Fake-News-Bert-Detect"

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

clf = pipeline(
    "text-classification",
    model=MODEL,
    tokenizer=MODEL,
    max_length=512,
    device=device,
)

# Label whose score is used directly as the credibility score; for any other
# label the score is inverted.
# NOTE(review): confirm against the model card that LABEL_1 is the "real" class.
truth_label = "LABEL_1"
def credibility_score(text, model=clf, truth_label=truth_label):
    """Return the classifier's confidence that ``text`` carries ``truth_label``.

    Args:
        text: The text to classify.
        model: Callable text-classification pipeline; it must accept the
            ``truncation`` and ``max_length`` keyword arguments and return a
            list of ``{"label": ..., "score": ...}`` dicts. Defaults to the
            module-level ``clf``.
        truth_label: Label whose score counts as credibility. Defaults to the
            module-level ``truth_label``.

    Returns:
        The top prediction's score if its label equals ``truth_label``,
        otherwise ``1 - score``.
    """
    # Request truncation explicitly, with the same arguments as the batched
    # ``clf(...)`` call in this notebook, instead of relying on the
    # tokenizer's implicit behaviour for over-long inputs.
    prediction = model(text, truncation=True, max_length=512)[0]
    if prediction["label"] == truth_label:
        return prediction["score"]
    return 1 - prediction["score"]


In [None]:
# Row-by-row alternative, kept for reference (the batched call below is used instead):
# all_passages["credibility_score_bert"] = all_passages["text"].apply(credibility_score)
# Collect the passage texts as a plain Python list for the pipeline.
texts = all_passages["text"].tolist()
# Classify all texts in batches of 32, truncating each input to 512 tokens.
credibility_scores = clf(texts, truncation=True, max_length=512, verbose=True, batch_size=32)

In [None]:
# Turn each pipeline prediction (a dict with "label" and "score") into a
# single float: the score itself for truth_label, otherwise its complement.
credibility_scores_float = [
    prediction["score"] if prediction["label"] == truth_label else 1 - prediction["score"]
    for prediction in credibility_scores
]


In [None]:
# Attach the credibility scores to the passages; the list is in the same
# order as the "text" column it was computed from.
all_passages["credibility_score_bert"] = credibility_scores_float

In [None]:
# Save docid and credibility score to a TSV file (no index column).
all_passages[["docid", "credibility_score_bert"]].to_csv("all_passages_credibility_scores_bert.tsv", sep="\t", index=False)
# Preview last: a notebook cell only renders its final expression, so a
# head() placed before the export is evaluated but never displayed.
all_passages.head()