<a href="https://colab.research.google.com/github/GuptaNavdeep1983/CS688/blob/main/ThemeExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install rake_nltk



In [9]:
import io
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd 
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
from collections import Counter
from pprint import pprint
from rake_nltk import Rake
from nltk.corpus import stopwords 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
df = pd.read_csv("pubmed_results.csv")
df.dropna(inplace=True)
df

Unnamed: 0,year,month,title,day,abstract
1,2012,7,Golimumab for the treatment of psoriatic arthr...,5,This paper presents a summary of the evidence ...
2,2012,7,Liraglutide for the treatment of type 2 diabetes.,5,This paper presents a summary of the evidence ...
3,2012,7,Trabectedin for the treatment of relapsed ovar...,5,The paper presents a summary of the evidence r...
4,2012,7,Ofatumumab for the treatment of chronic lympho...,5,This paper presents a summary of the evidence ...
5,2012,7,Denosumab for the prevention of osteoporotic f...,5,This paper presents a summary of the evidence ...
...,...,...,...,...,...
995,2020,12,Using Whatsapp for the Consultation of Covid-1...,4,"Telephone, internet-connected devices (phablet..."
996,2020,12,A comprehensive study on ozone pollution in a ...,4,Tropospheric ozone (O3) pollution has b
997,2020,12,Long-term exposure to low-level air pollution ...,4,Air pollution has been suggested as a risk fac...
998,2020,12,Radiation-induced lens opacities: Epidemiologi...,4,"In 2011, the International Commission on Radio..."


In [11]:
def extract_words(text):
    temp = text.split() # Split the text on whitespace
    text_words = []

    for word in temp:
        # Remove any punctuation characters present in the beginning of the word
        while word[0] in string.punctuation:
            word = word[1:]

        # Remove any punctuation characters present in the end of the word
        while word[-1] in string.punctuation:
            word = word[:-1]

        # Append this word into our list of words.
        text_words.append(word.lower())
        
    return text_words

In [12]:
all_documents = []
all_sentences = df['title'].to_numpy()

# for line in all_sentences:
#   all_words.extend(line.strip().split())
all_documents = [nltk.word_tokenize(sent) for sent in all_sentences]

In [13]:
from nltk.corpus import stopwords
for i in range(len(all_documents)):
    all_documents[i] = [w for w in all_documents[i] if w not in stopwords.words('english')]

In [15]:
def tf_idf(corpus_words):
    IDF = inv_doc_freq(corpus_words)
    
    TFIDF = []
    
    for document in corpus_words:
        TFIDF.append(Counter(document))
    
    for document in TFIDF:
        for word in document:
            document[word] = document[word]*IDF[word]
            
    return TFIDF

def inv_doc_freq(corpus_words):
    number_docs = len(corpus_words)
    
    document_count = {}

    for document in corpus_words:
        word_set = set(document)

        for word in word_set:
            document_count[word] = document_count.get(word, 0) + 1
    
    IDF = {}
    
    for word in document_count:
        IDF[word] = np.log(number_docs/document_count[word])
        
    
    return IDF

def term_document_matrix(TFIDF, word_list, word_dict):
    vocabulary_size = len(word_dict)
    number_documents = len(TFIDF)
    
    TDM = np.zeros((vocabulary_size, number_documents))
    
    for doc in range(number_documents):
        document = TFIDF[doc]
        
        for word in document.keys():
            pos = word_dict[word]
            
            TDM[pos, doc] = document[word]
            
    return TDM

def build_vocabulary(TFIDF):
    words = set()
    
    for document in TFIDF:
        words |= document.keys()
    
    word_list = list(words)
    word_dict = dict(zip(word_list, range(len(word_list))))
    
    return word_dict, word_list

In [None]:
IDF = inv_doc_freq(all_words)

pprint(IDF)

In [None]:
IDF


In [None]:
TFIDF = tf_idf(all_words)

In [None]:
word_dict, word_list = build_vocabulary(TFIDF)

In [None]:
TDM = term_document_matrix(TFIDF, word_list, word_dict)

In [None]:
TDM

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
def find_related_docs(tweet, TDM):
    new_vector = np.zeros(TDM.shape[1])
    
    for word in tweet:
        if word in word_dict:
          pos = word_dict[word]
          new_vector += TDM[pos, :]
        
    # Now the entries of new_vector tell us which documents are activated by this one.
    # Let's extract the list of documents sorted by activation
    doc_list = sorted(zip(range(TDM.shape[1]), new_vector), key=lambda x:x[1], reverse=True)
    
    return doc_list

In [None]:
research_words = ["Obesity", "Cancer", "Covid-19", "wearable", "mental health", "influenza"]
related = find_related_docs(research_words, TDM)

In [None]:
for tweet, score in related[:100]:
    print(tweet, " ".join(all_documents[tweet]))

In [18]:
all_sentences


array(['Golimumab for the treatment of psoriatic arthritis.',
       'Liraglutide for the treatment of type 2 diabetes.',
       'Trabectedin for the treatment of relapsed ovarian cancer.',
       'Ofatumumab for the treatment of chronic lymphocytic leukaemia in patients who are refractory to fludarabine and alemtuzumab: a critique of the submission from GSK.',
       'Denosumab for the prevention of osteoporotic fractures in postmenopausal women.',
       'Prucalopride for the treatment of women with chronic constipation in whom standard laxative regimens have failed to provide adequate relief.',
       'Trastuzumab for the treatment of HER2-positive metastatic adenocarcinoma of the stomach or gastro-oesophageal junction.',
       'Eltrombopag for the treatment of chronic idiopathic (immune) thrombocytopenic purpura (ITP).',
       'Omalizumab for the treatment of severe persistent allergic asthma in children aged 6-11 years.',
       'Bevacizumab in combination with a taxane for the 

In [20]:
all_words = []
for line in all_sentences:
  line = line.strip().split()
  out = [w for w in line if w not in stopwords.words('english')]
  all_words.extend(out)
listToStr = ' '.join([str(elem) for elem in all_words]) 
listToStr


"Golimumab treatment psoriatic arthritis. Liraglutide treatment type 2 diabetes. Trabectedin treatment relapsed ovarian cancer. Ofatumumab treatment chronic lymphocytic leukaemia patients refractory fludarabine alemtuzumab: critique submission GSK. Denosumab prevention osteoporotic fractures postmenopausal women. Prucalopride treatment women chronic constipation standard laxative regimens failed provide adequate relief. Trastuzumab treatment HER2-positive metastatic adenocarcinoma stomach gastro-oesophageal junction. Eltrombopag treatment chronic idiopathic (immune) thrombocytopenic purpura (ITP). Omalizumab treatment severe persistent allergic asthma children aged 6-11 years. Bevacizumab combination taxane first-line treatment HER2-negative metastatic breast cancer. Reflections knowledge translation Canadian NICUs using EPIQ method. [A study goods supply system hospital outpatients intermittent self-catheterization(ISC)]. [Self purchase catheter supplies hardship associated urinary ho

In [23]:
r = Rake() 

a=r.extract_keywords_from_text(listToStr)
b=r.get_ranked_phrases()
c=r.get_ranked_phrases_with_scores()
print(b[:50])


['case home parenteral nutrition therapy associated problems exchanging popular tpn one bag solution new type one bag tpn solution containing elements essential nutritionals ].', 'chromosome nondisjunction bipolar mitoses binucleated intermediates promote aneuploidy formation along multipolar mitoses rather chromosome loss micronuclei induced asbestos', 'correlation vertebral trabecular attenuation hounsfield units upper instrumented vertebra proximal junctional failure surgical treatment degenerative lumbar disease', 'chimeric perforator flap based descending branch lateral circumflex femoral artery tongue reconstruction advanced tongue cancer resection ].', 'effective aqueous arsenic removal using zero valent iron doped mwcnt synthesized situ cvd method using natural α', 'histone deacetylase inhibitor saha mediates mast cell death epigenetic silencing constitutively active d816v kit systemic mastocytosis', 'staging prognosis oropharyngeal carcinoma according 8th edition american join