In [1]:
import pandas as pd
import pyterrier as pt
# Start PyTerrier only when it is not already running, so re-running this cell is safe.
if not pt.started():
    pt.init()
# Load the three assessment files with PyTerrier's qrels reader, one DataFrame per file.
# Judging by the file names these hold the relevance, credibility and readability
# judgements; later cells use the `docno` and `label` columns of the results.
qrels = pt.io.read_qrels("../data/assessments/qrels.txt") # type: ignore
qcred = pt.io.read_qrels("../data/assessments/qcredibility.txt") # type: ignore
qread = pt.io.read_qrels("../data/assessments/qreadability.txt") # type: ignore


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path):
    """Parse an XML topics file into a DataFrame of cleaned queries.

    Every ``<topic>`` element directly under the root contributes one row,
    built from the text of its ``<id>`` and ``<query>`` children. Queries are
    lower-cased, every run of non-word characters is replaced by a single
    space, and surrounding whitespace is removed.

    Args:
        path: Location of the XML topics file.

    Returns:
        pandas.DataFrame with columns ``qid`` and ``query`` (both strings).
        Topics with a missing/blank id, or whose query is empty after
        cleaning, are left out. When an id occurs more than once, the last
        query seen for it is kept.

    Raises:
        OSError: if the file cannot be opened.
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    # ET.parse reads raw bytes, so the encoding named in the XML declaration is
    # honoured instead of whatever the platform's default text encoding is.
    root = ET.parse(path).getroot()

    topic_dict = {}
    for topic in root.findall("topic"):
        # findtext returns None for a missing child; normalise that to "".
        topic_id = (topic.findtext("id") or "").strip()
        topic_query = (topic.findtext("query") or "").strip().lower()
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query

    topics = pd.DataFrame(list(topic_dict.items()), columns=["qid", "query"])
    # Strip AFTER the replacement: a query ending in punctuation ("...cured?")
    # would otherwise be left with a trailing space.
    topics["query"] = (
        topics["query"].str.replace(r'\W+', ' ', regex=True).str.strip()
    )
    # A query made only of punctuation is empty by now and cannot be searched.
    topics = topics[topics["query"] != ""].reset_index(drop=True)
    return topics

In [3]:
# Parse the topics file into the `queries` DataFrame (columns: qid, query).
queries = load_topics("./topics/topics.txt")

In [4]:
# Preview the first parsed topics.
queries.head()

Unnamed: 0,qid,query
0,1,what are the most common chronic diseases what...
1,8,best apps daily activity exercise diabetes
2,22,my risk for developing type 2 diabetes
3,35,is a ketogenic keto diet suitable for people w...
4,45,can diabetes be cured


In [20]:
# Tab-separated passage dump; later cells read its `docid` and `text` columns.
passages_path = "../CHS-2021/documents/Webdoc/crawl/txt_over_50.tsv"
all_passages = pd.read_csv(passages_path, sep="\t")

In [22]:
import textstat
# rank documents with custom function that evaluates readability of the document
def readability_score(text):
    """Return the value of ``textstat.flesch_reading_ease`` for ``text``."""
    return textstat.flesch_reading_ease(text)

def text_standard(text):
    """Return ``textstat.text_standard`` for ``text``, requested as a float."""
    return textstat.text_standard(text, float_output=True)

In [23]:
sample = "Cytokine dysregulation is a central driver of chronic inflammatory diseases such as multiple sclerosis (MS). Here, we sought to determine the characteristic cellular and cytokine polarization profile in patients with relapsing–remitting multiple sclerosis (RRMS) by high-dimensional single-cell mass cytometry (CyTOF). Using a combination of neural network-based representation learning algorithms, we identified an expanded T helper cell subset in patients with MS, characterized by the expression of granulocyte–macrophage colony-stimulating factor and the C-X-C chemokine receptor type 4. This cellular signature, which includes expression of very late antigen 4 in peripheral blood, was also enriched in the central nervous system of patients with relapsing–remitting multiple sclerosis. In independent validation cohorts, we confirmed that this cell population is increased in patients with MS compared with other inflammatory and non-inflammatory conditions. Lastly, we also found the population to be reduced under effective disease-modifying therapy, suggesting that the identified T cell profile represents a specific therapeutic target in MS."
# Score the hand-picked sample text with both textstat metrics.
for scorer in (readability_score, text_standard):
    print(scorer(sample))

# Look the crawled passage up once, then score and show it.
docid = "00bd4ea6-90d6-40ac-93ab-d0be20d6c8e5"
doc_text = all_passages.loc[all_passages["docid"] == docid, "text"].values[0]
for scorer in (readability_score, text_standard):
    print(scorer(doc_text))
print(doc_text)

3.09
20.0
45.66
14.0
Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles and JavaScript. GM-CSF and CXCR4 define a T helper cell signature in multiple sclerosis   ,   ,   ,   ,     , –() Cytokine dysregulation is a central driver of chronic inflammatory diseases such as multiple sclerosis (MS). Here, we sought to determine the characteristic cellular and cytokine polarization profile in patients with relapsing–remitting multiple sclerosis (RRMS) by high-dimensional single-cell mass cytometry (CyTOF). Using a combination of neural network-based representation learning algorithms, we identified an expanded T helper cell subset in patients with MS, characterized by the expression of granulocyte–macrophage colony-stimulating fac

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the sequence-classification model.
READABILITY_MODEL = "Tymoteusz/distilbert-base-uncased-kaggle-readability"

tokenizer = AutoTokenizer.from_pretrained(READABILITY_MODEL, truncation=True)
model = AutoModelForSequenceClassification.from_pretrained(READABILITY_MODEL)

In [25]:
# get the transformer score 
def readability_score_transformer(text):
    """Score ``text`` with the notebook's readability sequence-classification model.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model``.

    Args:
        text: The passage to score.

    Returns:
        float: the first logit of the first (only) sequence in the batch.
    """
    # Imported locally so the function does not rely on a later cell's import.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: without no_grad every call records an autograd graph,
    # which costs time and memory when scoring the whole passage collection.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits[0][0].item()

In [16]:
# Quick check of the transformer scorer on the `sample` text.
readability_score_transformer(sample)

-2.660423755645752

In [26]:
# Score every passage with the transformer model. Iterating the text column
# directly avoids iterrows(), which builds a full Series per row only to read
# a single field from it.
read_scores_transformer = [
    readability_score_transformer(passage_text)
    for passage_text in all_passages["text"]
]
all_passages["readability_score_transformer"] = read_scores_transformer

all_passages.head()

Unnamed: 0,docid,text,readability_score_transformer
0,44f906a1-f818-4d48-a3fb-1673ebdeff13,Billing & Financial Services Patients & Visito...,-1.882378
1,334ed241-6337-41ce-884c-2755648e14ea,Billing & Financial Services Patients & Visito...,-1.881258
2,391b9de2-26dc-4187-b7ea-461153352e12,Billing & Financial Services Patients & Visito...,-1.762164
3,461b6eb2-2b79-4fec-bf6b-63e302a093d9,Billing & Financial Services Patients & Visito...,-1.762164
4,4e151d9e-a3fc-4e21-b5db-7b8ad4ea9d23,Billing & Financial Services Patients & Visito...,-1.61085


In [27]:
# Add one column per textstat metric, each computed from the passage text.
for column_name, scorer in (
    ("flesch_reading_ease", readability_score),
    ("text_standard", text_standard),
):
    all_passages[column_name] = all_passages["text"].apply(scorer)


In [28]:
# Persist only the document id and the three readability scores (no text, no index).
readability_columns = ['docid', 'readability_score_transformer', 'flesch_reading_ease', 'text_standard']
all_passages[readability_columns].to_csv(
    'all_passages_over_50_readability_scores.tsv',
    sep='\t',
    index=False,
)

In [10]:
from transformers import pipeline
import torch
MODEL = "jy46604790/Fake-News-Bert-Detect"
# Run on the first CUDA device when one is available, otherwise on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
clf = pipeline(
    "text-classification",
    model=MODEL,
    tokenizer=MODEL,
    max_length=512,
    device=device,
)

# Label treated as the "credible" class.
# NOTE(review): presumably the real-news class of the checkpoint — confirm
# against the model card.
truth_label = "LABEL_1"
def credibility_score(text, model=None, truth_label=truth_label):
    """Return the classifier's probability that ``text`` carries ``truth_label``.

    Args:
        text: The passage to classify.
        model: Callable returning a list whose first item is a dict with
            ``"label"`` and ``"score"`` keys. Defaults to the notebook-level
            ``clf`` pipeline, looked up at call time.
        truth_label: Label whose probability is reported.

    Returns:
        The predicted score when the predicted label equals ``truth_label``,
        otherwise ``1 - score`` (the complement, for a two-class model).
    """
    # Resolve the default lazily: a `model=clf` default is frozen when the
    # function is defined, so re-creating `clf` in a re-run cell would leave
    # this function silently using the old pipeline.
    if model is None:
        model = clf
    prediction = model(text)[0]
    if prediction["label"] == truth_label:
        return prediction["score"]
    return 1 - prediction["score"]


In [13]:
# Classify all passage texts in one batched pipeline call rather than row by row.
texts = all_passages["text"].tolist()
inference_options = dict(truncation=True, max_length=512, verbose=True, batch_size=32)
credibility_scores = clf(texts, **inference_options)

In [15]:
# Turn each prediction dict into the probability of `truth_label`:
# the score itself when that label was predicted, its complement otherwise.
# NOTE(review): the TypeError recorded under this cell suggests it was once run
# while `credibility_scores` already held floats (stale kernel state) — re-run
# the batch inference cell first and confirm.
credibility_scores_float = [
    prediction["score"] if prediction["label"] == truth_label else 1 - prediction["score"]
    for prediction in credibility_scores
]


TypeError: 'float' object is not subscriptable

In [18]:
# Attach the credibility scores as a new column. The list is assigned by
# position, so it must be in the same row order as `all_passages`.
all_passages["credibility_score_bert"] = credibility_scores_float

In [23]:
# Save docid and credibility score to a TSV (no index column).
all_passages[["docid", "credibility_score_bert"]].to_csv("all_passages_credibility_scores_bert.tsv", sep="\t", index=False)
# The preview goes last: a cell only renders its final expression, so a
# `head()` placed before `to_csv` was evaluated and thrown away.
all_passages.head()

In [60]:
# Inner join: keep only passages that have a row in `qrels`.
passages_with_qrels = pd.merge(
    left=all_passages,
    right=qrels,
    how="inner",
    left_on="docid",
    right_on="docno",
)

In [29]:
# Inner join: keep only passages that have a row in `qread`.
passages_with_qread = pd.merge(
    left=all_passages,
    right=qread,
    how="inner",
    left_on="docid",
    right_on="docno",
)

In [19]:
# Inner join: keep only passages that have a row in `qcred`.
passages_with_qcred = pd.merge(
    left=all_passages,
    right=qcred,
    how="inner",
    left_on="docid",
    right_on="docno",
)

In [21]:
# Correlation between the classifier's credibility score and the assessed label.
# NOTE(review): unlike the readability frame, this one is not de-duplicated by
# docid before correlating — confirm that is intended.
qcred_columns = ["credibility_score_bert", "label"]
passages_with_qcred[qcred_columns].corr()

Unnamed: 0,credibility_score_bert,label
credibility_score_bert,1.0,0.067204
label,0.067204,1.0


In [33]:
# Keep one row per docid: order by label (highest first), then keep the first
# occurrence of every docid.
qread_by_label_desc = passages_with_qread.sort_values("label", ascending=False)
passages_with_qread = qread_by_label_desc.drop_duplicates("docid", keep="first")

In [34]:

# Pairwise correlation of the three readability scores with the assessed label.
qread_columns = ["flesch_reading_ease", "text_standard", "readability_score_transformer", "label"]
passages_with_qread[qread_columns].corr()

Unnamed: 0,flesch_reading_ease,text_standard,readability_score_transformer,label
flesch_reading_ease,1.0,-0.725921,0.290964,0.181301
text_standard,-0.725921,1.0,-0.222037,-0.149064
readability_score_transformer,0.290964,-0.222037,1.0,0.261095
label,0.181301,-0.149064,0.261095,1.0


In [31]:
# Get the correlation between the readability scores and the qread label.
# NOTE(review): this is the same expression as an earlier cell; the recorded
# numbers differ slightly, presumably because one run predates the
# de-duplication by docid — confirm and keep only one of the two cells.
passages_with_qread[["flesch_reading_ease", "text_standard", "readability_score_transformer", "label"]].corr()

Unnamed: 0,flesch_reading_ease,text_standard,readability_score_transformer,label
flesch_reading_ease,1.0,-0.731399,0.296111,0.176506
text_standard,-0.731399,1.0,-0.228651,-0.147296
readability_score_transformer,0.296111,-0.228651,1.0,0.261609
label,0.176506,-0.147296,0.261609,1.0
