In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [76]:
from datasets import load_dataset

# Load the "mystic-leung/medical_cord19" dataset through the `datasets` library
# and take its 'train' split.
t_df = load_dataset("mystic-leung/medical_cord19")
train_dataset = t_df['train']
# Materialise the split as a DataFrame and keep only the first 2000 rows.
train_df_full = pd.DataFrame(train_dataset)
df = train_df_full.head(2000)
# NOTE(review): the slice keeps only the first 'input' entry, so everything
# built from `data` afterwards is based on a single document -- confirm that
# this is intentional and not a leftover from debugging.
data=df['input'][:1]

In [80]:
# Inspect the container type of `data`; the recorded output is numpy.ndarray.
type(data)

numpy.ndarray

In [78]:
# Convert the selected 'input' text(s) into a NumPy array.
data = np.array(data)

In [79]:
# Display the document(s) currently held in `data`.
data

array(['Cardiovascular disease is the leading cause of death globally. While pharmacological advancements have improved the morbidity and mortality associated with cardiovascular disease, non-adherence to prescribed treatment remains a significant barrier to improved patient outcomes. A variety of strategies to improve medication adherence have been tested in clinical trials, and include the following categories: improving patient education, implementing medication reminders, testing cognitive behavioral interventions, reducing medication costs, utilizing healthcare team members, and streamlining medication dosing regimens. In this review, we describe specific trials within each of these categories and highlight the impact of each on medication adherence. We also examine ongoing trials and future lines of inquiry for improving medication adherence in patients with cardiovascular diseases.'],
      dtype=object)

In [None]:
import re
from nltk.tokenize import sent_tokenize

def pre_processing(text):
    """Split ``text`` into sentences and normalise each sentence.

    Every sentence produced by ``sent_tokenize`` has all characters other
    than ASCII letters, digits and whitespace removed, is stripped of leading
    and trailing whitespace, and is lower-cased.

    Sentences that become empty are not dropped, so the result always has
    one entry per sentence returned by ``sent_tokenize``, in the same order.

    Parameters
    ----------
    text : str
        Raw text to split and clean.

    Returns
    -------
    list of str
        The cleaned sentences.
    """
    cleaned_sentences = []
    for raw_sentence in sent_tokenize(text):
        # Drop everything except letters, digits and whitespace ...
        without_punctuation = re.sub(r'[^a-zA-Z0-9\s]', '', raw_sentence)
        # ... then trim the edges and lower-case.
        cleaned_sentences.append(without_punctuation.strip().lower())
    return cleaned_sentences

# Flatten the cleaned sentences of every document in `data` into one list.
corpus = [sentence for doc in data for sentence in pre_processing(doc)]

print("Number of Sentences in Corpus : ", len(corpus))

In [82]:
# Display the document(s) currently held in `data`.
data

array(['Cardiovascular disease is the leading cause of death globally. While pharmacological advancements have improved the morbidity and mortality associated with cardiovascular disease, non-adherence to prescribed treatment remains a significant barrier to improved patient outcomes. A variety of strategies to improve medication adherence have been tested in clinical trials, and include the following categories: improving patient education, implementing medication reminders, testing cognitive behavioral interventions, reducing medication costs, utilizing healthcare team members, and streamlining medication dosing regimens. In this review, we describe specific trials within each of these categories and highlight the impact of each on medication adherence. We also examine ongoing trials and future lines of inquiry for improving medication adherence in patients with cardiovascular diseases.'],
      dtype=object)

In [83]:
# 'output' column of the 2000-row frame -- presumably the reference summaries
# for the 'input' texts; TODO confirm.
out=df['output']

In [84]:
# Abstract that the summarisation cells of this notebook operate on.
input_text = "The legal and illegal trade in wildlife for food, medicine and other products is a globally significant threat to biodiversity that is also responsible for the emergence of pathogens that threaten human and livestock health and our global economy. Trade in wildlife likely played a role in the origin of COVID-19, and viruses closely related to SARS-CoV-2 have been identified in bats and pangolins, both traded widely. To investigate the possible role of pangolins as a source of potential zoonoses, we collected throat and rectal swabs from 334 Sunda pangolins (Manis javanica) confiscated in Peninsular Malaysia and Sabah between August 2009 and March 2019. Total nucleic acid was extracted for viral molecular screening using conventional PCR protocols used to routinely identify known and novel viruses in extensive prior sampling (> 50,000 mammals). No sample yielded a positive PCR result for any of the targeted viral families-Coronaviridae, Filoviridae, Flaviviridae, Orthomyxoviridae and Paramyxoviridae. In the light of recent reports of coronaviruses including a SARS-CoV-2-related virus in Sunda pangolins in China, the lack of any coronavirus detection in our 'upstream' market chain samples suggests that these detections in 'downstream' animals more plausibly reflect exposure to infected humans, wildlife or other animals within the wildlife trade network. While confirmatory serologic studies are needed, it is likely that Sunda pangolins are incidental hosts of coronaviruses. Our findings further support the importance of ending the trade in wildlife globally."


# Replace newlines with spaces before sentence splitting.
input_text = input_text.replace("\n", " ")
# `sentences` holds the original sentences and `input_tok` their cleaned
# counterparts. Both come from sent_tokenize on the same string, so index i
# refers to the same sentence in both lists; summarize() relies on this when
# it maps ranked row indices back to `sentences`.
sentences = sent_tokenize(input_text)
input_tok = pre_processing(input_text)

In [85]:
from rouge_score import rouge_scorer

# Reference summary that rouge_metrics() compares candidate summaries against.
# NOTE(review): this text is about SARS-CoV-2 peptide vaccines, whereas the
# `input_text` abstract summarised in this notebook is about pangolins and the
# wildlife trade -- confirm that this is the intended reference.
expected = "The SARS-CoV-2 pandemic has created a pressing need for vaccines effective against mutated variants. Peptide vaccines offer a promising solution due to their efficient design process. Three potential vaccine regions have been identified, meeting criteria of surface exposure and immune response generation. Crucially, these regions lack mutations seen in prevailing variants, making them viable candidates for combating future strains."

# Replace newlines with spaces and trim surrounding whitespace.
expected = expected.replace("\n", " ").strip()

def rouge_metrics(summary):
    """Score ``summary`` against the module-level ``expected`` text with ROUGE-1.

    Parameters
    ----------
    summary : str
        Candidate (generated) summary to evaluate.

    Returns
    -------
    dict
        The mapping returned by ``RougeScorer.score`` (here it has the single
        key ``'rouge1'``). The same mapping is also printed.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

    # RougeScorer.score expects (target, prediction): the reference comes
    # first. Passing the candidate first swaps the reported precision and
    # recall.
    scores = scorer.score(expected, summary)

    print("Rouge Score : ", scores, end="\n\n")
    return scores

In [89]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

def summarize(input_vec, top_k=3):
    """Print and return an extractive summary of the module-level ``sentences``.

    The sentence vectors are compared pairwise with cosine similarity, the
    similarity matrix is turned into a graph, and PageRank is run on that
    graph. The ``top_k`` highest-scoring sentences are joined with single
    spaces, highest score first.

    Parameters
    ----------
    input_vec : array-like or sparse matrix
        One row per sentence. Row ``i`` must describe ``sentences[i]``,
        because the ranked row indices are used to index ``sentences``.
    top_k : int, default 3
        Number of sentences to keep. Values below zero are treated as zero.

    Returns
    -------
    str
        The summary text (also printed), so that it can be passed on, for
        example to ``rouge_metrics``.
    """
    # Cosine similarity between every pair of sentence vectors
    similarity_matrix = cosine_similarity(input_vec, input_vec)

    # Matrix to Graph
    G = nx.from_numpy_array(similarity_matrix)

    # PageRank Algorithm
    pagerank_scores = nx.pagerank(G)

    # Sentence indices ordered from highest to lowest PageRank score
    ranked_indices = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)

    # Keep the top_k sentences; clamp so that a negative value cannot turn
    # the slice into "everything except the last |top_k| sentences".
    keep = max(top_k, 0)
    summary = " ".join(sentences[i] for i in ranked_indices[:keep])

    print(summary)
    return summary

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation: the vectorizer is fitted on `corpus` only.
bag_of_words = CountVectorizer()

corpus_bow = bag_of_words.fit_transform(corpus)
# Encode the cleaned input sentences with the same fitted vectorizer.
input_bow  = bag_of_words.transform(input_tok)

In [91]:
# Summarise the input text using the bag-of-words sentence vectors.
summarize(input_bow)

To investigate the possible role of pangolins as a source of potential zoonoses, we collected throat and rectal swabs from 334 Sunda pangolins (Manis javanica) confiscated in Peninsular Malaysia and Sabah between August 2009 and March 2019. The legal and illegal trade in wildlife for food, medicine and other products is a globally significant threat to biodiversity that is also responsible for the emergence of pathogens that threaten human and livestock health and our global economy. Trade in wildlife likely played a role in the origin of COVID-19, and viruses closely related to SARS-CoV-2 have been identified in bats and pangolins, both traded widely.


In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation: fitted on `corpus`, then applied to the cleaned
# input sentences with the same fitted vectorizer.
tf_idf = TfidfVectorizer()

corpus_idf = tf_idf.fit_transform(corpus)
input_idf = tf_idf.transform(input_tok)
# Summarise the input text using the TF-IDF sentence vectors.
summarize(input_idf)

The legal and illegal trade in wildlife for food, medicine and other products is a globally significant threat to biodiversity that is also responsible for the emergence of pathogens that threaten human and livestock health and our global economy. To investigate the possible role of pangolins as a source of potential zoonoses, we collected throat and rectal swabs from 334 Sunda pangolins (Manis javanica) confiscated in Peninsular Malaysia and Sabah between August 2009 and March 2019. Trade in wildlife likely played a role in the origin of COVID-19, and viruses closely related to SARS-CoV-2 have been identified in bats and pangolins, both traded widely.


In [93]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Train a Word2Vec model with 200-dimensional vectors on the word-tokenised
# corpus sentences. `sg` is not passed, so gensim's default training
# algorithm (CBOW) is used.
# NOTE(review): `min_count` is left at the library default, which discards
# infrequent words; with a corpus built from very little text most tokens may
# end up outside the vocabulary -- confirm.
g_model = Word2Vec(sentences=[word_tokenize(sent) for sent in corpus], vector_size=200, window=5, workers=5, epochs=500)

In [94]:
def get_embeddings(sent_l):
    """Return the sum of the word vectors of the tokens in ``sent_l``.

    Tokens are looked up in the module-level ``g_model``. A token that is not
    in the model's vocabulary contributes a zero vector.

    Parameters
    ----------
    sent_l : iterable of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``g_model.wv.vector_size``. It is all zeros when
        ``sent_l`` is empty.
    """
    word_vectors = g_model.wv
    # Take the dimensionality from the model instead of hard-coding 200.
    dim = word_vectors.vector_size

    vectors = [word_vectors[word] if word in word_vectors else np.zeros(dim) for word in sent_l]

    # Summing an empty array over axis 0 yields the scalar 0.0, which cannot
    # be stacked with the vectors of the other sentences.
    if not vectors:
        return np.zeros(dim)

    return np.array(vectors).sum(axis=0)

# One summed-embedding row per cleaned input sentence.
input_cbow = np.array([get_embeddings(word_tokenize(sent)) for sent in input_tok])

In [95]:
# Summarise the input text using the sentence vectors stored in `input_cbow`.
summarize(input_cbow)

No sample yielded a positive PCR result for any of the targeted viral families-Coronaviridae, Filoviridae, Flaviviridae, Orthomyxoviridae and Paramyxoviridae. To investigate the possible role of pangolins as a source of potential zoonoses, we collected throat and rectal swabs from 334 Sunda pangolins (Manis javanica) confiscated in Peninsular Malaysia and Sabah between August 2009 and March 2019. Trade in wildlife likely played a role in the origin of COVID-19, and viruses closely related to SARS-CoV-2 have been identified in bats and pangolins, both traded widely.


In [96]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Train a second Word2Vec model with the same settings except `sg=1`
# (skip-gram). This rebinds `g_model`, so get_embeddings() calls made after
# this cell read the skip-gram vectors.
# NOTE(review): `min_count` is left at the library default, which discards
# infrequent words; with a corpus built from very little text most tokens may
# end up outside the vocabulary -- confirm.
g_model = Word2Vec(sentences=[word_tokenize(sent) for sent in corpus], vector_size=200, window=5, workers=5, epochs=500, sg=1)

In [97]:
def get_embeddings(sent_l):
    """Return the sum of the word vectors of the tokens in ``sent_l``.

    Tokens are looked up in the module-level ``g_model``. A token that is not
    in the model's vocabulary contributes a zero vector.

    Parameters
    ----------
    sent_l : iterable of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``g_model.wv.vector_size``. It is all zeros when
        ``sent_l`` is empty.
    """
    word_vectors = g_model.wv
    # Take the dimensionality from the model instead of hard-coding 200.
    dim = word_vectors.vector_size

    vectors = [word_vectors[word] if word in word_vectors else np.zeros(dim) for word in sent_l]

    # Summing an empty array over axis 0 yields the scalar 0.0, which cannot
    # be stacked with the vectors of the other sentences.
    if not vectors:
        return np.zeros(dim)

    return np.array(vectors).sum(axis=0)

# One summed-embedding row per cleaned input sentence.
input_sg = np.array([get_embeddings(word_tokenize(sent)) for sent in input_tok])

In [98]:
# Summarise the input text using the sentence vectors stored in `input_sg`.
summarize(input_sg)

No sample yielded a positive PCR result for any of the targeted viral families-Coronaviridae, Filoviridae, Flaviviridae, Orthomyxoviridae and Paramyxoviridae. To investigate the possible role of pangolins as a source of potential zoonoses, we collected throat and rectal swabs from 334 Sunda pangolins (Manis javanica) confiscated in Peninsular Malaysia and Sabah between August 2009 and March 2019. Trade in wildlife likely played a role in the origin of COVID-19, and viruses closely related to SARS-CoV-2 have been identified in bats and pangolins, both traded widely.


In [99]:
import gensim.downloader as api

# Load the "glove-wiki-gigaword-200" vectors through gensim's downloader API.
# NOTE(review): presumably this fetches the vectors over the network the
# first time it runs -- confirm before relying on it offline.
model = api.load("glove-wiki-gigaword-200")

def get_embeddings(sent_l):
    """Return the sum of the word vectors of the tokens in ``sent_l``.

    Tokens are looked up in the module-level ``model``. A token that is not
    in its vocabulary contributes a zero vector.

    Parameters
    ----------
    sent_l : iterable of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. It is all zeros when
        ``sent_l`` is empty.
    """
    # Take the dimensionality from the loaded vectors instead of hard-coding 200.
    dim = model.vector_size

    vectors = [model[word] if word in model else np.zeros(dim) for word in sent_l]

    # Summing an empty array over axis 0 yields the scalar 0.0, which cannot
    # be stacked with the vectors of the other sentences.
    if not vectors:
        return np.zeros(dim)

    return np.array(vectors).sum(axis=0)

# One summed-embedding row per cleaned input sentence.
input_wv = np.array([get_embeddings(word_tokenize(sent)) for sent in input_tok])

In [None]:
# Summarise the input text using the sentence vectors stored in `input_wv`.
summarize(input_wv)