In [None]:
# Mount Google Drive so the notebook can read inputs and write results under
# 'gdrive/My Drive/'. This only works inside a Google Colab runtime.
from google.colab import drive

drive.mount('/content/gdrive')
# NOTE(review): root_path is not referenced by any later cell shown in this
# notebook; the other cells spell out 'gdrive/My Drive/...' paths directly.
root_path = 'gdrive/My Drive/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# =======================================================
#  SARI -- Text Simplification Tunable Evaluation Metric
# =======================================================
#
# Author: Wei Xu (UPenn xwe@cis.upenn.edu)
#
# A Python implementation of the SARI metric for text simplification
# evaluation in the following paper  
#
#     "Optimizing Statistical Machine Translation for Text Simplification"
#     Wei Xu, Courtney Napoles, Ellie Pavlick, Quanze Chen and Chris Callison-Burch
#     In Transactions of the Association for Computational Linguistics (TACL) 2015
# 
# There is also a Java implementation of the SARI metric 
# that is integrated into the Joshua MT Decoder. It can 
# be used for tuning Joshua models for a real end-to-end
# text simplification model. 
#

from __future__ import division
from collections import Counter
import sys



def ReadInFile (filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: path of the file to read (opened with the default encoding).

    Returns:
        A list with one stripped string per line of the file, in file order.
    """
    stripped = []
    with open(filename) as handle:
        for raw_line in handle:
            stripped.append(raw_line.strip())
    return stripped


def SARIngram(sgrams, cgrams, rgramslist, numref):
    """Compute the KEEP, DELETE and ADD components of SARI for one n-gram order.

    Args:
        sgrams: n-grams of the source (original) sentence.
        cgrams: n-grams of the candidate (system output) sentence.
        rgramslist: one list of n-grams per reference sentence.
        numref: number of references. Source and candidate counts are
            multiplied by it so they are comparable with the reference
            counts, which are pooled over all references.

    Returns:
        A tuple ``(keepscore, delscore_precision, addscore)``: the F1 of the
        keep operation, the precision of the delete operation, and the F1 of
        the add operation. Each component is 0 when it cannot be computed.
    """

    def _f1(precision, recall):
        # Harmonic mean, defined as 0 when both inputs are 0 (avoids 0/0).
        if precision > 0 or recall > 0:
            return 2 * precision * recall / (precision + recall)
        return 0

    rgramsall = [rgram for rgrams in rgramslist for rgram in rgrams]
    rgramcounter = Counter(rgramsall)

    sgramcounter = Counter(sgrams)
    sgramcounter_rep = Counter()
    for sgram, scount in sgramcounter.items():
        sgramcounter_rep[sgram] = scount * numref

    cgramcounter = Counter(cgrams)
    cgramcounter_rep = Counter()
    for cgram, ccount in cgramcounter.items():
        cgramcounter_rep[cgram] = ccount * numref

    # KEEP: n-grams present in both source and candidate, judged against
    # the n-grams the references also kept.
    keepgramcounter_rep = sgramcounter_rep & cgramcounter_rep
    keepgramcountergood_rep = keepgramcounter_rep & rgramcounter
    keepgramcounterall_rep = sgramcounter_rep & rgramcounter

    keeptmpscore1 = 0
    keeptmpscore2 = 0
    for keepgram in keepgramcountergood_rep:
        keeptmpscore1 += keepgramcountergood_rep[keepgram] / keepgramcounter_rep[keepgram]
        keeptmpscore2 += keepgramcountergood_rep[keepgram] / keepgramcounterall_rep[keepgram]
    keepscore_precision = 0
    if len(keepgramcounter_rep) > 0:
        keepscore_precision = keeptmpscore1 / len(keepgramcounter_rep)
    keepscore_recall = 0
    if len(keepgramcounterall_rep) > 0:
        keepscore_recall = keeptmpscore2 / len(keepgramcounterall_rep)
    keepscore = _f1(keepscore_precision, keepscore_recall)

    # DELETION: n-grams of the source that the candidate dropped, judged
    # against the n-grams the references dropped.
    delgramcounter_rep = sgramcounter_rep - cgramcounter_rep
    delgramcountergood_rep = delgramcounter_rep - rgramcounter
    delgramcounterall_rep = sgramcounter_rep - rgramcounter

    deltmpscore1 = 0
    deltmpscore2 = 0
    for delgram in delgramcountergood_rep:
        deltmpscore1 += delgramcountergood_rep[delgram] / delgramcounter_rep[delgram]
        deltmpscore2 += delgramcountergood_rep[delgram] / delgramcounterall_rep[delgram]
    delscore_precision = 0
    if len(delgramcounter_rep) > 0:
        delscore_precision = deltmpscore1 / len(delgramcounter_rep)
    delscore_recall = 0
    if len(delgramcounterall_rep) > 0:
        # Recall is built from the recall accumulator (deltmpscore2); the
        # previous code reused the precision accumulator here by mistake.
        delscore_recall = deltmpscore2 / len(delgramcounterall_rep)
    # Kept for completeness only: the function reports deletion *precision*,
    # so this F1 does not influence the returned tuple.
    delscore = _f1(delscore_precision, delscore_recall)

    # ADDITION: n-gram *types* (sets, not counts) introduced by the
    # candidate, judged against the types the references introduced.
    addgramcounter = set(cgramcounter) - set(sgramcounter)
    addgramcountergood = set(addgramcounter) & set(rgramcounter)
    addgramcounterall = set(rgramcounter) - set(sgramcounter)

    addtmpscore = len(addgramcountergood)

    addscore_precision = 0
    addscore_recall = 0
    if len(addgramcounter) > 0:
        addscore_precision = addtmpscore / len(addgramcounter)
    if len(addgramcounterall) > 0:
        addscore_recall = addtmpscore / len(addgramcounterall)
    addscore = _f1(addscore_precision, addscore_recall)

    return (keepscore, delscore_precision, addscore)
    

def _ngrams(tokens, n):
    """Return the space-joined n-grams of ``tokens`` in order ([] if too short)."""
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def SARIsent (ssent, csent, rsents) :
    """Compute the sentence-level SARI score of a candidate simplification.

    Sentences are lower-cased and tokenised by splitting on single spaces.
    The keep/delete/add components from ``SARIngram`` are averaged over
    n-gram orders 1 to 4, and the three averages are then averaged.

    Args:
        ssent: source (original) sentence.
        csent: candidate (system output) sentence.
        rsents: list of reference sentences. A single string is accepted and
            treated as one reference; previously a bare string was iterated
            character by character, turning every character into its own
            "reference".

    Returns:
        The SARI score as a float (0 to 1 scale).
    """
    if isinstance(rsents, str):
        rsents = [rsents]
    numref = len(rsents)

    stokens = ssent.lower().split(" ")
    ctokens = csent.lower().split(" ")
    rtokenslist = [rsent.lower().split(" ") for rsent in rsents]

    keepscores = []
    delscores = []
    addscores = []
    for n in (1, 2, 3, 4):
        (keepscore, delscore, addscore) = SARIngram(
            _ngrams(stokens, n),
            _ngrams(ctokens, n),
            [_ngrams(rtokens, n) for rtokens in rtokenslist],
            numref,
        )
        keepscores.append(keepscore)
        delscores.append(delscore)
        addscores.append(addscore)

    avgkeepscore = sum(keepscores) / 4
    avgdelscore = sum(delscores) / 4
    avgaddscore = sum(addscores) / 4
    finalscore = ( avgkeepscore + avgdelscore + avgaddscore ) / 3

    return finalscore


In [None]:
# Install the third-party packages imported by later cells
# (transformers, sentence_transformers) plus sentencepiece.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve
# different releases than the ones these outputs were produced with.
!pip install transformers
!pip install sentence_transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used to summarise each sentence cluster. Two alternatives were
# previously wired up here as model1/tokenizer1 ('google/pegasus-xsum') and
# model2/tokenizer2 ("csebuetnlp/mT5_multilingual_XLSum") but are disabled.
BART_CHECKPOINT = "facebook/bart-large-cnn"

# `tokenizer` and `model` are the names the summarisation loop relies on.
tokenizer = AutoTokenizer.from_pretrained(BART_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BART_CHECKPOINT)


In [None]:
# bibtexparser is used below to parse the BibTeX record fetched for each DOI.
# NOTE(review): version is not pinned.
!pip install bibtexparser

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bibtexparser
  Downloading bibtexparser-1.4.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bibtexparser
  Building wheel for bibtexparser (setup.py) ... [?25l[?25hdone
  Created wheel for bibtexparser: filename=bibtexparser-1.4.0-py3-none-any.whl size=42442 sha256=8ce6340c32e8ee995df8520f07575a1695c4d24e3895c07a2eb313e47972191e
  Stored in directory: /root/.cache/pip/wheels/83/e1/e3/2311be27728119eefd014e0a6039eee58470560d8ab31fd1fa
Successfully built bibtexparser
Installing collected packages: bibtexparser
Successfully installed bibtexparser-1.4.0


In [None]:
# Step 2 (MedTSS): cluster the selected sentences of each paper and summarise
# every cluster with a pre-trained model (original cell title:
# "MedTSS PTMs Summaries+hallucination").

# Standard library
import csv
import itertools
import re
import sys
import urllib.request
from collections import OrderedDict
from urllib.error import HTTPError

# Third-party
import bibtexparser as bp
import nltk
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

# The stop-word corpus must be downloaded before it can be imported and read.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, used to strip filler words from paper titles.
stop = stopwords.words('english')
def extractDigits(lst):
    """Wrap every element of ``lst`` in its own one-item list.

    Despite the name, nothing digit-related happens here: ``[a, b]`` simply
    becomes ``[[a], [b]]``, in iteration order.
    """
    wrapped = []
    for element in lst:
        wrapped.append([element])
    return wrapped
def check_words(listwords, listsent, listsent2):
    """Select the sentences that contain every word of at least one word group.

    Args:
        listwords: iterable of word groups; each group is an iterable of
            strings that must all occur (as substrings) in a sentence.
        listsent: sentences to return (original form).
        listsent2: the text actually searched, parallel to ``listsent``
            (same index = same sentence).

    Returns:
        Entries of ``listsent`` whose ``listsent2`` counterpart matches a
        group. A sentence is appended once per matching group, so it can
        appear more than once.
    """
    matched = []
    for position in range(len(listsent)):
        for group in listwords:
            for term in group:
                if term not in listsent2[position]:
                    break
            else:
                # No term was missing: the whole group occurs in the sentence.
                matched.append(listsent[position])
    return matched

# Load the test samples (tab-separated, header row taken from the file) and
# keep only the columns needed to select sentences and fetch metadata.
df22 = pd.read_csv("gdrive/My Drive/HTSS-Testing-Samples.csv", sep='\t')
df2 = df22[["Full_Paper_XML", "Paper_Title", "KeyWord", "MeSH", "Paper_DOI"]]

# Normalise the separators so every keyword list is comma-delimited.
df2 = df2.replace(r'\n', ', ', regex=True)
df2 = df2.replace(r',,', ',', regex=True)
df2 = df2.replace(r';', ',', regex=True)

# Title words that are not English stop words, comma-joined.
df2['stopwords'] = df2['Paper_Title'].apply(lambda x: ','.join([word for word in x.split() if word not in (stop)]))

# One comma-separated keyword string per paper. The columns are named
# explicitly: selecting "everything from the third column on" also pulled
# Paper_DOI into the keywords, so the DOI URL was searched for as a term.
df2['mergeo'] = df2[["KeyWord", "MeSH", "stopwords"]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
df2['mergeo'] = df2['mergeo'].apply(lambda x: x.replace('.', ''))

# Parallel per-paper lists consumed by the processing loop.
ccc = df2.Paper_Title.tolist()
xmll = df2.Full_Paper_XML.tolist()
mergo = df2.mergeo.tolist()
linkk = df2.Paper_DOI.tolist()
# Slice these (e.g. [906:907]) to process a subset of the papers.
cccc = ccc
linkkn = linkk
xmlll = xmll
margo = mergo
clust = []
titlo = []
N_CLUSTERS = 5   # sentence clusters (and therefore summaries) per paper
embedder = None  # sentence encoder; created on first use, then reused for every paper

for i, val in enumerate(xmlll):
    linkn = linkkn[i]
    print(linkn)

    # ---- 1. Bibliographic preface -------------------------------------
    # Ask the DOI URL for a BibTeX record via the Accept header.
    req = urllib.request.Request(linkn)
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            bibtex = f.read().decode()
    except HTTPError as e:
        if e.code == 404:
            print('DOI not found.')
        else:
            print('Service unavailable.')
        sys.exit(1)

    # The record is round-tripped through a scratch file on Drive, so the
    # most recently fetched entry stays available as test.bib.
    with open('gdrive/My Drive/test.bib', 'w') as f:
        f.write(bibtex)
    with open('gdrive/My Drive/test.bib') as bibtex_file:
        bd = bp.load(bibtex_file)

    # Reset per paper so a record without entries cannot reuse the preface
    # of the previous paper (or fail on the very first one).
    texth = ''
    for art in bd.entries_dict:
        print("*********")
        ae = bd.entries_dict[art]
        # BibTeX separates authors with the *word* "and". Splitting on the
        # bare substring also cut names that contain it (e.g. "Garland"
        # became "Garl"), so split on whitespace-delimited "and" only.
        exp = [x.strip() for x in re.split(r'\s+and\s+', ae[u'author'], flags=re.IGNORECASE)]
        print(len(exp))
        if len(exp) > 1:
            texth = 'Published by '+ae[u'publisher']+ ' authors '+exp[0]+ ' and '+exp[1]+ ', journal name '+ae[u'journal']+'.'
        else:
            texth = 'Published by '+ae[u'publisher']+ ' authors '+exp[0]+ ', journal name '+ae[u'journal']+'.'
    print(texth, 'mmmmmmmmmmm')

    # ---- 2. Keyword groups for sentence selection ----------------------
    article_path = "gdrive/My Drive/HTSS-master/data/"+val
    prkey = margo[i].lower()
    ls = [item for item in prkey.split(",") if len(item) >= 4]
    ls = [x.lstrip() for x in ls]
    ls = set(filter(None, ls))
    ls = extractDigits(ls)
    print(ls)

    # ---- 3. Read and clean the article text ----------------------------
    with open(article_path, encoding="utf-8") as f:  # closed as soon as it is read
        ax = f.read()
    ax = re.sub(r' \[.*?\]', '', ax)   # drop " [...]" spans
    ax = re.sub(r' \(.*?\)', '', ax)   # drop " (...)" spans
    ax = ax.replace("\t", " ")
    ax = ax.replace("\n", "")
    ax = ax.replace("\r", " ")
    ax = ax.replace("\" \', ", "")

    # Split into sentences on the file's line marker. Pieces containing an
    # underscore are treated as heading lines and dropped. The lower-cased
    # copy is filtered identically, so both lists stay index-aligned.
    char_list = ['_']
    wordss = ax.split('  #@NEW_LINE#@# ')
    wordss = [ele for ele in wordss if all(ch not in ele for ch in char_list)]
    wordlow = ax.lower().split('  #@new_line#@# ')
    wordlow = [ele for ele in wordlow if all(ch not in ele for ch in char_list)]

    # Keep sentences (skipping the first three pieces) that mention a
    # keyword, de-duplicated in first-seen order, without empty strings.
    pri = check_words(ls, wordss[3:], wordlow[3:])
    bvb = list(OrderedDict.fromkeys(pri))
    bvb = list(filter(None, bvb))

    # ---- 4. Cluster the sentences and summarise each cluster -----------
    ccc = cccc[i]
    facebook = []
    if len(bvb) > N_CLUSTERS:
        if embedder is None:
            embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        corpus = bvb
        corpus_embeddings = embedder.encode(corpus)

        # Normalize the embeddings to unit length
        corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

        # Agglomerative (average-linkage, cosine) clustering. Newer
        # scikit-learn names the distance argument `metric`; older releases
        # only know `affinity`, so fall back when `metric` is rejected.
        try:
            clustering_model = AgglomerativeClustering(n_clusters=N_CLUSTERS, metric='cosine', linkage='average')
        except TypeError:
            clustering_model = AgglomerativeClustering(n_clusters=N_CLUSTERS, affinity='cosine', linkage='average')
        clustering_model.fit(corpus_embeddings)
        cluster_assignment = clustering_model.labels_

        clustered_sentences = {}
        for sentence_id, cluster_id in enumerate(cluster_assignment):
            clustered_sentences.setdefault(cluster_id, []).append(corpus[sentence_id])

        print(ccc)
        # A dedicated loop variable: the inner loop used to reuse `i` and
        # silently overwrite the paper index.
        for cluster_id in range(N_CLUSTERS):
            strrr = ' '.join(map(str, clustered_sentences[cluster_id]))
            strr = texth+strrr
            print(strr)

            tokens_input = tokenizer.encode("summarize: "+strr, return_tensors='pt', max_length=512, truncation=True)
            summary_ids = model.generate(tokens_input, min_length=80, max_length=250)
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            facebook.append(summary)

        print('facebook : ', facebook)
    else:
        # Too few sentences to cluster: emit them unsummarised, followed by
        # the bibliographic preface, as a single nested list.
        bvb.append(texth)
        facebook.append(bvb)

    # ---- 5. Append one row per paper to the results file ---------------
    dfhj = pd.DataFrame({'col':ccc, 'textt':[texth],'facebook':[facebook]})
    dfhj.to_csv("gdrive/My Drive/MedLSSBartSummaryTop5halluccc.csv" , sep='\t', encoding='utf-8', doublequote=False, index=False, header=None, mode="a", quoting=csv.QUOTE_NONE)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


http://dx.doi.org/10.1038/ncomms12461
*********
4
Published by Springer Science and Business Media {LLC} authors Jacob E. Allgeier and Abel Valdivia, journal name Nature Communications. mmmmmmmmmmm
[['coral'], ['biomass'], ['coral reefs'], ['nutrients'], ['conservation of natural resources'], ['animals'], ['fishing'], ['food'], ['chemistry'], ['reefs'], ['population dynamics'], ['ecosystem'], ['fisheries'], ['metabolism'], ['anthozoa'], ['http://dxdoiorg/101038/ncomms12461'], ['seawater'], ['fishes'], ['caribbean region'], ['growth & development']]




Fishing down nutrients on coral reefs
Published by Springer Science and Business Media {LLC} authors Jacob E. Allgeier and Abel Valdivia, journal name Nature Communications. 'Fishing is widely considered a leading cause of biodiversity loss in marine environments, but the potential effect on ecosystem processes, such as nutrient fluxes, is less explored.  Here, we test how fishing on Caribbean coral reefs influences biodiversity and ecosystem functions provided by the fish community, that is, fish-mediated nutrient capacity.  Specifically, we modelled five processes of nutrient storage and supply of nutrients, as well as a measure of their multifunctionality, onto 143 species of coral reef fishes across 110 coral reef fish communities.  These communities span a gradient from extreme fishing pressure to protected areas with little to no fishing.  Instead, changes in community size and trophic structure were the primary cause of shifts in ecosystem function.  These findings suggest that 



Cognitive Improvement after Mild Traumatic Brain Injury Measured with Functional Neuroimaging during the Acute Period
Published by Public Library of Science ({PLoS}) authors Glenn R. Wylie and Kalev Freeman, journal name {PLOS} {ONE}. Functional neuroimaging studies in mild traumatic brain injury have been largely limited to patients with persistent post-concussive symptoms, utilizing images obtained months to years after the actual head trauma.  We also hypothesized that increased memory workload at 1 week following injury would expose different cortical activation patterns in mTBI patients with persistent post-concussive symptoms, compared to those with full clinical recovery.  We performed a prospective, cohort study of working memory in emergency department patients with isolated head injury and clinical diagnosis of concussion, compared to control subjects.  The primary outcome of cognitive recovery was defined as resolution of reported cognitive impairment and quantified by scori



Effectiveness of Electronic Reminders to Improve Medication Adherence in Tuberculosis Patients: A Cluster-Randomised Trial
Published by Public Library of Science ({PLoS}) authors Xiaoqiu Liu and James J. Lewis, journal name {PLOS} Medicine. In a pragmatic cluster-randomised trial, 36 districts/counties within the provinces of Heilongjiang, Jiangsu, Hunan, and Chongqing, China, were randomised using stratification and restriction to one of four case-management approaches in which patients received reminders via text messages, a medication monitor, combined, or neither.  Current Controlled Trials, ISRCTN46846388  A pragmatic trial asks whether an intervention works under real-life conditions; a cluster-randomized trial randomly assigns groups of people to receive alternative interventions and compares outcomes in the differently treated clusters.  More information about this trial is available  The funders contributed to design, running of the trial, and preparation of manuscript.  Thus,



Differential Toxicity of Antibodies to the Prion Protein
Published by Public Library of Science ({PLoS}) authors Regina R. Reimann and Tiziana Sonati, journal name {PLOS} Pathogens. The authors pledge to distribute all reagents described in this article, and particularly POM antibodies and derivatives therefrom, with all interested parties at fair market value.  Animal care and all experimental protocols were in accordance with the Swiss Ethical Principles and Guidelines for Experiments on Animals, and approved by the Animal Experimentation Committee of the Canton of Zurich.  Animal care and protocol guidelines were obtained from http://www.blv.admin.ch/themen/tierschutz/index.html?lang=en and strictly adhered to by the experimenters and animal facility at the institution where the experiments were performed.
Published by Public Library of Science ({PLoS}) authors Regina R. Reimann and Tiziana Sonati, journal name {PLOS} Pathogens. Antibodies against the prion protein PrPC can antagoni



Low Cloud Cover-Adjusted Ultraviolet B Irradiance Is Associated with High Incidence Rates of Leukemia: Study of 172 Countries
Published by Public Library of Science ({PLoS}) authors Raphael E. Cuomo and Cedric F. Garl, journal name {PLOS} {ONE}. There are 52,380 cases of leukemia and 24,090 deaths from it in the US annually.  We hypothesized that leukemia is due mainly to vitamin D deficiency, which is due mainly to low solar ultraviolet B irradiance.  To test this hypothesis, we estimated age-standardized cloud-cover-adjusted winter UVB irradiance using cloud cover data from the International Satellite Cloud Climatology Project, latitudes of population centroids, and standard astronomical calculations.  Incidence rates for 172 countries, available from the International Agency for Cancer Research, were plotted according to cloud-adjusted UVB irradiance.  Leukemia incidence rates were inversely associated with cloud-adjusted UVB irradiance in males and females in both hemispheres.  The

In [None]:
import pandas as pd
header_list = ["Paper_Title", "preface", "MedTSS_summary"]

# Load the MedTSS summaries written by the summarisation loop. That loop
# writes the file as UTF-8, so it is read back as UTF-8; decoding it as
# ISO-8859-1 would garble every non-ASCII character.
df = pd.read_csv('gdrive/My Drive/MedLSSBartSummaryTop5halluccc.csv', encoding='utf-8', sep='\t', names=header_list)

In [None]:
import pandas as pd
# Column names assigned to the HTSS baseline summaries file.
header_list =["Full_Paper_XML","Paper_Title","HTSS-Summary"]

# NOTE(review): this rebinds `df`, which the previous cell uses for the MedTSS
# summaries. Only one of the two frames exists at a time, so the later
# MedTSS / HTSS evaluation cells depend on which loader cell ran last.
# skiprows=1 skips the first line of the file — presumably its own header row; confirm.
df = pd.read_csv('gdrive/My Drive/htss-Top5.csv', encoding='ISO-8859-1', sep = '\t', names=header_list, skiprows=1)

In [None]:
import pandas as pd
# Column names for the test samples; "Original-Text" is the paper's original
# text (Abstract and Introduction sections).
# NOTE(review): "Paper_DIO" looks like a typo for "Paper_DOI", the name used
# when this same file is read with its own header earlier in the notebook.
header_list =["Full_Paper_XML","Paper_Title", "KeyWord", "MeSH", "Eureka_Title_Simplified", "Eureka_Text_Simplified", "Paper_DIO","Original-Text" ]
# Re-read the samples with explicit column names; skiprows=1 skips the first line of the file.
df22 = pd.read_csv("gdrive/My Drive/HTSS-Testing-Samples.csv", sep='\t', names=header_list, skiprows=1)

In [None]:
# Build the reference text used for evaluation.
# Keep only the part of the simplified text before the first '###' marker.
df22['text_new'] = df22['Eureka_Text_Simplified'].str.split('###').str[0]
# 'Eura' = simplified title + '. ' + truncated simplified text.
# NOTE(review): '. '.join raises a TypeError for rows where either field is
# missing (NaN); both columns are assumed to be fully populated.
df22['Eura'] = df22[['Eureka_Title_Simplified', 'text_new']].apply(lambda x: '. '.join(x), axis=1)


In [None]:
# Evaluation frame for MedTSS: reference, system summary and source text,
# all lower-cased and aligned on the row index.
# Requires `df` to currently hold the MedTSS summaries.
lowercased_columns = [
    column.str.lower()
    for column in (df22['Eura'], df['MedTSS_summary'], df22['Original-Text'])
]
df3 = pd.concat(lowercased_columns, axis=1)

df3

Unnamed: 0,Eura,MedTSS_summary,Original-Text
0,big fish -- and their pee -- are key parts of ...,['fishing is widely considered a leading cause...,fishing down nutrients on coral reefs 'fishing...
1,researchers use neuroimaging to measure early ...,[' functional neuroimaging studies in mild tra...,cognitive improvement after mild traumatic bra...
2,electronic reminders keep tb patients on track...,"['in a pragmatic cluster-randomised trial, 36 ...",effectiveness of electronic reminders to impro...
3,it's complicated: benefits and toxicity of ant...,['the authors pledge to distribute all reagent...,differential toxicity of antibodies to the pri...
4,uc san diego researchers link higher risk of l...,"['there are 52,380 cases of leukemia and 24,09...",low cloud cover-adjusted ultraviolet b irradia...


In [None]:
# Evaluation frame for the HTSS baseline: reference, system summary and
# source text, all lower-cased and aligned on the row index.
# Requires `df` to currently hold the HTSS summaries.
lowercased_columns = [
    column.str.lower()
    for column in (df22['Eura'], df['HTSS-Summary'], df22['Original-Text'])
]
df3 = pd.concat(lowercased_columns, axis=1)

df3

Unnamed: 0,Eura,HTSS-Summary,Original-Text
0,big fish -- and their pee -- are key parts of ...,a new study led by researchers at the universi...,fishing down nutrients on coral reefs 'fishing...
1,researchers use neuroimaging to measure early ...,new cognitive improvement after mild traumatic...,cognitive improvement after mild traumatic bra...
2,electronic reminders keep tb patients on track...,electronic reminders to prevent tuberculosis t...,effectiveness of electronic reminders to impro...
3,it's complicated: benefits and toxicity of ant...,a new insights into how the prion protein [unk...,differential toxicity of antibodies to the pri...
4,uc san diego researchers link higher risk of l...,leukemia are benefit to reduce ultraviolet b c...,low cloud cover-adjusted ultraviolet b irradia...


In [None]:
# Sanity check: column names, non-null counts and dtypes of the evaluation frame.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Eura            5 non-null      object
 1   MedTSS_summary  5 non-null      object
 2   Original-Text   5 non-null      object
dtypes: object(3)
memory usage: 248.0+ bytes


In [None]:
# Sentence-level SARI of every MedTSS summary:
# source = original paper text, candidate = summary, reference = 'Eura'.
sariBiobert = []
for index, row in df3.iterrows():
    # The references must be passed as a list. A bare string is iterated
    # character by character, which scores the summary against single
    # letters instead of the reference text.
    scores = SARIsent(row['Original-Text'], row['MedTSS_summary'], [row['Eura']])
    print(scores)
    sariBiobert.append(scores)
len(sariBiobert)

0.3347149746323786
0.33340555000338473
0.33453036568943023
0.33624680410685964
0.3346763984148469


'\nssent = "About 95 species are currently accepted ."\ncsent1 = "About 95 you now get in ."\ncsent2 = "About 95 species are now agreed ."\ncsent3 = "About 95 species are currently agreed ."\nrsents = ["About 95 species are currently known .", "About 95 species are now accepted .", "95 species are now accepted ."]\n\nprint(SARIsent(ssent, csent1, rsents))\nprint(SARIsent(ssent, csent2, rsents))\nprint(SARIsent(ssent, csent3, rsents))\n'

In [None]:
# Mean SARI over all scored documents (the divisor was hard-coded to 5).
print(sum(sariBiobert)/len(sariBiobert))





0.33471481856938


In [None]:
# Sentence-level SARI of every HTSS summary:
# source = original paper text, candidate = summary, reference = 'Eura'.
sariBiobert = []
for index, row in df3.iterrows():
    # The references must be passed as a list. A bare string is iterated
    # character by character, which scores the summary against single
    # letters instead of the reference text.
    scores = SARIsent(row['Original-Text'], row['HTSS-Summary'], [row['Eura']])
    print(scores)
    sariBiobert.append(scores)
len(sariBiobert)

0.3341886952693538
0.33364461486657687
0.3375459056247247
0.3337631442123408
0.3346286300596857


'\nssent = "About 95 species are currently accepted ."\ncsent1 = "About 95 you now get in ."\ncsent2 = "About 95 species are now agreed ."\ncsent3 = "About 95 species are currently agreed ."\nrsents = ["About 95 species are currently known .", "About 95 species are now accepted .", "95 species are now accepted ."]\n\nprint(SARIsent(ssent, csent1, rsents))\nprint(SARIsent(ssent, csent2, rsents))\nprint(SARIsent(ssent, csent3, rsents))\n'

In [None]:
# Mean SARI over all scored documents (the divisor was hard-coded to 5).
print(sum(sariBiobert)/len(sariBiobert))





0.3347541980065364


In [None]:
# ROUGE packages; the evaluation cells below import `rouge_score`.
# NOTE(review): no cell shown in this notebook imports the separate `rouge`
# package, and neither version is pinned.
!pip install rouge
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=0ac6bef7686074ef70592abcc0ff23b8cd6f8a6b880cc70eff680b13fc26681e
  Stored in directory: /root/.cache/pip/wheels/24/55/6f/ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer

# ROUGE-1 / ROUGE-2 / ROUGE-L (stemmed) of every MedTSS summary against its
# 'Eura' reference; score() takes the reference first, then the prediction.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
rougeBiobert = []
for reference, candidate in zip(df3['Eura'], df3['MedTSS_summary']):
    rouge_result = scorer.score(reference, candidate)
    print(rouge_result)
    rougeBiobert.append(rouge_result)
len(rougeBiobert)

{'rouge1': Score(precision=0.4940119760479042, recall=0.19320843091334894, fmeasure=0.2777777777777778), 'rouge2': Score(precision=0.08408408408408409, recall=0.032825322391559206, fmeasure=0.04721753794266442), 'rougeL': Score(precision=0.2155688622754491, recall=0.08430913348946135, fmeasure=0.1212121212121212)}
{'rouge1': Score(precision=0.37809187279151946, recall=0.3252279635258359, fmeasure=0.3496732026143791), 'rouge2': Score(precision=0.06028368794326241, recall=0.051829268292682924, fmeasure=0.05573770491803278), 'rougeL': Score(precision=0.14487632508833923, recall=0.12462006079027356, fmeasure=0.13398692810457516)}
{'rouge1': Score(precision=0.5583596214511041, recall=0.3695198329853862, fmeasure=0.4447236180904522), 'rouge2': Score(precision=0.1518987341772152, recall=0.100418410041841, fmeasure=0.12090680100755667), 'rougeL': Score(precision=0.2334384858044164, recall=0.1544885177453027, fmeasure=0.18592964824120603)}
{'rouge1': Score(precision=0.34415584415584416, recall=

5

In [None]:
# Mean ROUGE-1 F-measure over all scored documents. The F-measure is read by
# name rather than by position, and the divisor is the real document count
# instead of a hard-coded 5.
lk = [x['rouge1'].fmeasure for x in rougeBiobert]
print(sum(lk)/len(lk))




0.34005244622209296


In [None]:
from rouge_score import rouge_scorer

# ROUGE-1 / ROUGE-2 / ROUGE-L (stemmed) of every HTSS summary against its
# 'Eura' reference; score() takes the reference first, then the prediction.
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeL'], use_stemmer=True)
rougeBiobert = []
for reference, candidate in zip(df3['Eura'], df3['HTSS-Summary']):
    rouge_result = scorer.score(reference, candidate)
    print(rouge_result)
    rougeBiobert.append(rouge_result)
len(rougeBiobert)

{'rouge1': Score(precision=0.6842105263157895, recall=0.03044496487119438, fmeasure=0.05829596412556053), 'rouge2': Score(precision=0.40540540540540543, recall=0.017584994138335287, fmeasure=0.033707865168539325), 'rougeL': Score(precision=0.631578947368421, recall=0.02810304449648712, fmeasure=0.05381165919282511)}
{'rouge1': Score(precision=0.6170212765957447, recall=0.08814589665653495, fmeasure=0.15425531914893617), 'rouge2': Score(precision=0.2608695652173913, recall=0.036585365853658534, fmeasure=0.0641711229946524), 'rougeL': Score(precision=0.48936170212765956, recall=0.06990881458966565, fmeasure=0.12234042553191489)}
{'rouge1': Score(precision=0.7407407407407407, recall=0.04175365344467641, fmeasure=0.07905138339920949), 'rouge2': Score(precision=0.34615384615384615, recall=0.01882845188284519, fmeasure=0.03571428571428571), 'rougeL': Score(precision=0.5185185185185185, recall=0.029227557411273485, fmeasure=0.05533596837944663)}
{'rouge1': Score(precision=0.5714285714285714, 

5

In [None]:
# Mean ROUGE-1 F-measure over all scored documents. The F-measure is read by
# name rather than by position, and the divisor is the real document count
# instead of a hard-coded 5.
# Values noted next to the original code (their provenance is not shown):
# 0.19234492990007765, 0.3901856292667061, 0.18940115929475698,
# 0.3946713788855009, 0.3919748372753436
lk = [x['rouge1'].fmeasure for x in rougeBiobert]
print(sum(lk)/len(lk))




0.09124453839458362
