

Nevetha NG


MDS202128


In [None]:
# Importing necessary packages
import os, re, json, math, nltk, pickle
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at /content/drive (force_remount=True asks Colab to mount again
# even when a mount is already present).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Paths used by the rest of the notebook. Both are relative, so they are resolved against
# the current working directory (presumably /content on Colab -- TODO confirm).
# path_to_json    : folder that is scanned for the *.json papers
# drive_directory : Drive folder where the corpus, token batches and models are written
path_to_json = 'pdf_json/'
drive_directory = 'drive/MyDrive/NLP/'

Mounted at /content/drive


In [None]:
%%time
# Unzipping the json files from Drive into the current working directory (output silenced)
!unzip drive/My\ Drive/NLP/pdf_json.zip > /dev/null

CPU times: user 409 ms, sys: 58.3 ms, total: 468 ms
Wall time: 57.2 s



#### Defining a function that extracts the text of a JSON file by combining its title, abstract and body_text fields (the paper_id is not part of the returned text)



In [None]:
def extract_text(filename):
    """Return the lower-cased text of one paper stored as a JSON file.

    The result is ``title + ' ' + abstract + ' ' + body_text + ' '`` in lower
    case, where ``abstract`` and ``body_text`` are the concatenated ``text``
    values of the entries under those top-level keys.  A missing key (including
    a missing ``metadata`` block) contributes an empty string.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    str
        The combined lower-cased text; it always ends with a single space.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # `with` closes the handle even when json.load raises
    with open(filename, encoding='utf-8') as file:
        paper_content = json.load(file)

    # The title lives under 'metadata'; tolerate papers without that block
    title = paper_content.get('metadata', {}).get('title', "")

    # str.join avoids the quadratic cost of growing a string inside a loop
    abstract = "".join(part['text'] for part in paper_content.get('abstract', []))
    body_text = "".join(part['text'] for part in paper_content.get('body_text', []))

    return (title + ' ' + abstract + ' ' + body_text + ' ').lower()

In [None]:
%%time
# Reading all filenames of the json files.
# os.listdir returns entries in arbitrary order, so the names are sorted to make the
# document order (and any later slice of this list) reproducible between runs.
json_files = sorted(name for name in os.listdir(path_to_json) if name.endswith('.json'))

CPU times: user 17.4 ms, sys: 31 ms, total: 48.4 ms
Wall time: 50.6 ms


### 1. Corpus Creation

In [None]:
def create_corpus(path_to_json, files, max_docs=50000):
  """Build the corpus text: one document per line, no trailing newline.

  Each of the first `max_docs` files is read exactly once with extract_text().
  Newline characters inside a document are removed so that a newline only ever
  separates two documents.

  Parameters
  ----------
  path_to_json : str
      Directory prefix that is concatenated with each file name.
  files : list of str
      File names to read, in the order they should appear in the corpus.
  max_docs : int, optional
      Upper bound on the number of documents used (default 50000).

  Returns
  -------
  str
      The documents joined by a newline; an empty string when `files` is empty.
  """
  docs = []
  for name in files[:max_docs]:
    # One parse per file (previously each file was parsed up to three times)
    docs.append(extract_text(path_to_json + name).replace('\n', ''))
  # join() already leaves no newline after the last document, so the last file
  # must not be appended a second time.
  return '\n'.join(docs)

# Build the corpus string from the listed JSON files
corpus = create_corpus(path_to_json, json_files)

# Write the corpus to Drive as a plain text file
with open(drive_directory+'corpus.txt','w') as corpus_file:
  corpus_file.write(corpus)

### 2. Corpus Preprocessing

In [None]:
%%time
# Read the saved corpus back from Drive into a single string
with open(drive_directory+'corpus.txt','r') as corpus_file:
  corpus = corpus_file.read()

CPU times: user 5.66 s, sys: 6.2 s, total: 11.9 s
Wall time: 21.5 s


In [None]:
# Preprocessing helpers and the English word list they rely on.
# NOTE: nltk is already imported in the first cell; this import is redundant.
import nltk
nltk.download('words')
# Word list that english_words() checks tokens against; stored as a set
words = set(nltk.corpus.words.words())
# Two words added by hand -- presumably absent from the nltk list (TODO confirm);
# both appear as prompts in the prediction cells further down.
words.add("integrated")
words.add("houses")
def numbers(corpus):
  """Return `corpus` with every run of digits removed."""
  # Raw string: "\d" inside a normal string literal is an invalid escape sequence
  return re.sub(r"\d+", "", corpus)
def spl_chars(corpus):
  """Drop every character that is not a letter, a digit, '.', '?', '!' or a space."""
  disallowed = re.compile(r'[^a-zA-Z0-9.?! ]+')
  return disallowed.sub('', corpus)
def single_letters(corpus):
  """Drop one-character tokens other than 'a' and 'i'; rejoin the rest with spaces."""
  kept = []
  for token in nltk.wordpunct_tokenize(corpus):
    if len(token) > 1 or token in ('a', 'i'):
      kept.append(token)
  return " ".join(kept)
def brackets(corpus):
  """Strip round, square and curly bracket characters from `corpus`."""
  bracket_chars = r"[\([{})\]]"
  return re.sub(bracket_chars, "", corpus)
def english_words(corpus):
  """Keep only the tokens whose lower-cased form is in the global `words` collection."""
  tokens = nltk.wordpunct_tokenize(corpus)
  return " ".join(filter(lambda token: token.lower() in words, tokens))
def tokenization(corpus):
  """Split `corpus` into a list of tokens using nltk's word_tokenize."""
  tokens = word_tokenize(corpus)
  return tokens

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
def preprocess_pipeline(corpus):
  """Apply the cleaning steps to `corpus` in a fixed order and return the cleaned text."""
  # Each step receives the output of the previous one, so the order is significant
  steps = (spl_chars, numbers, single_letters, brackets, english_words)
  for step in steps:
    corpus = step(corpus)
  return corpus

def preprocess(corpus, batch_size=500):
  """Clean and tokenize the corpus batch by batch, pickling each batch's tokens.

  The corpus is split into documents on newlines; every group of `batch_size`
  documents is cleaned with preprocess_pipeline(), tokenized, and written to
  `drive_directory` as token_batch_<n> (n starts at 1). Working in batches
  keeps memory use bounded.
  """
  documents = corpus.split('\n')
  total_batches = math.ceil(len(documents)/batch_size)
  for i in range(total_batches):
    start = batch_size*i
    chunk_text = '\n'.join(documents[start:start+batch_size])
    tokens = tokenization(preprocess_pipeline(chunk_text))
    with open(drive_directory+f'token_batch_{i+1}', 'wb') as f:
      pickle.dump(tokens, f)
    print(f'Pre-processing {i+1}-th batch: Done')

In [None]:
# Run the batched preprocessing over the whole corpus with the default batch size
preprocess(corpus)

Pre-processing 1-th batch: Done
Pre-processing 2-th batch: Done
Pre-processing 3-th batch: Done
Pre-processing 4-th batch: Done
Pre-processing 5-th batch: Done
Pre-processing 6-th batch: Done
Pre-processing 7-th batch: Done
Pre-processing 8-th batch: Done
Pre-processing 9-th batch: Done
Pre-processing 10-th batch: Done
Pre-processing 11-th batch: Done
Pre-processing 12-th batch: Done
Pre-processing 13-th batch: Done
Pre-processing 14-th batch: Done
Pre-processing 15-th batch: Done
Pre-processing 16-th batch: Done
Pre-processing 17-th batch: Done
Pre-processing 18-th batch: Done
Pre-processing 19-th batch: Done
Pre-processing 20-th batch: Done
Pre-processing 21-th batch: Done
Pre-processing 22-th batch: Done
Pre-processing 23-th batch: Done
Pre-processing 24-th batch: Done
Pre-processing 25-th batch: Done
Pre-processing 26-th batch: Done
Pre-processing 27-th batch: Done
Pre-processing 28-th batch: Done
Pre-processing 29-th batch: Done
Pre-processing 30-th batch: Done
Pre-processing 31-t

### 3. Vocabulary Count

In [None]:
# Names of the token batch files in the Drive folder, followed by how many there are
token_files = [name for name in os.listdir(drive_directory) if name.startswith('token_batch')]
len(token_files)

100

In [None]:
from collections import Counter
# Vocabulary: token -> number of occurrences, accumulated over every token batch
vocab = Counter()
token_files = [batch for batch in os.listdir(drive_directory) if batch.startswith('token_batch')]
for i in token_files:
  # The batches were written with pickle in binary mode, so they must be read with
  # 'rb'; text mode ('r') makes pickle.load fail.
  with open(drive_directory+i, 'rb') as f:
    vocab.update(pickle.load(f))
print("The Vocabulary count of the corpus is:", len(vocab))

In [None]:
# Vocabulary size used as V in the trigram smoothing below; hard-coded here.
# NOTE(review): presumably copied from the printed vocabulary count of an earlier run --
# confirm it matches len(vocab) and derive it from there instead.
vocab_count = 60318

### 4. a) Building a Bigram Model

In [None]:
def createBigram(data):
  """Count adjacent token pairs in `data`.

  Parameters
  ----------
  data : list
      Sequence of tokens (indexable).

  Returns
  -------
  dict
      Maps each (token, next_token) tuple to its number of occurrences, in
      order of first appearance. Pairs whose second token is falsy (for
      example an empty string) are not counted.
  """
  bigramCounts = {}
  # The previous version also kept every pair in a list that was never used,
  # which only cost memory on large batches.
  for i in range(len(data)-1):
    following = data[i+1]
    if following:
      pair = (data[i], following)
      bigramCounts[pair] = bigramCounts.get(pair, 0) + 1
  return bigramCounts

In [None]:
# Accumulate bigram counts over every pickled token batch in the Drive folder
bigrams = Counter()
token_files = [name for name in os.listdir(drive_directory) if name.startswith('token_batch')]
for batch_name in token_files:
  with open(drive_directory+batch_name, 'rb') as f:
    data = pickle.load(f)
    bigrams.update(createBigram(data))

In [None]:
def bigramProbGen(bigrams):
  """Turn bigram counts into add-one smoothed probabilities, highest first.

  For a bigram (w1, w2) the value is (count(w1, w2) + 1) / (vocab[w1] + len(vocab)),
  where `vocab` is the global token counter. Keys of the returned dict are the
  two words joined by a single space; items are ordered by descending probability.
  """
  probabilities = {}
  for pair, count in bigrams.items():
    probabilities[" ".join(pair)] = (count+1)/(vocab[pair[0]]+len(vocab))
  ranked = sorted(probabilities.items(), key=lambda item: item[1], reverse=True)
  return dict(ranked)

In [None]:
# Convert the accumulated bigram counts into the probability table used for prediction
sorted_bigram_prob = bigramProbGen(bigrams)

Saving the bigram model

In [None]:
# NOTE: despite the .json extension, the file written here is a pickle, not JSON text
with open(drive_directory+"bigrams.json", "wb") as model_file:
    pickle.dump(sorted_bigram_prob, model_file)

### b) Building a Trigram Model

In [None]:
def createTrigram(data):
  """Count runs of three consecutive tokens in `data`.

  Parameters
  ----------
  data : list
      Sequence of tokens (indexable).

  Returns
  -------
  dict
      Maps each (t1, t2, t3) tuple to its number of occurrences, in order of
      first appearance. Triples whose third token is falsy (for example an
      empty string) are not counted.
  """
  TrigramCounts = {}
  # The previous version also kept every triple in a list that was never used,
  # which only cost memory on large batches.
  for i in range(len(data)-2):
    third = data[i+2]
    if third:
      triple = (data[i], data[i+1], third)
      TrigramCounts[triple] = TrigramCounts.get(triple, 0) + 1
  return TrigramCounts

In [None]:
# Only the first 33 token batches (about one third of the corpus) are used for the trigrams, due to RAM limits.

In [None]:
# Accumulate trigram counts over the first 33 token batches only
trigrams = Counter()
for batch_name in token_files[:33]:
  with open(drive_directory+batch_name, 'rb') as f:
    data = pickle.load(f)
    trigrams.update(createTrigram(data))

In [None]:
# Order the trigram counts from most to least frequent
sorted_trigrams = dict(sorted(trigrams.items(), key=lambda x:x[1], reverse = True))

In [None]:
# Add-one smoothed trigram probabilities:
#   P(w3 | w1 w2) = (count(w1 w2 w3) + 1) / (count(w1 w2) + V),   V = vocab_count
# The denominator needs the COUNT of the two-word context. It is derived here from the
# trigram counts themselves, so it covers exactly the data the trigrams were built from.
# (The previous code used bigram_prob[b], which is a probability below 1 and is only
# loaded in a later cell, so every value collapsed to roughly (count + 1) / V.)
context_counts = Counter()
for trigram, count in sorted_trigrams.items():
  context_counts[trigram[:2]] += count

sorted_trigram_prob = {}
for trigram, count in sorted_trigrams.items():
  # key: the three words joined by single spaces, as expected by the prediction cell
  sorted_trigram_prob[" ".join(trigram)] = (count+1)/(context_counts[trigram[:2]]+vocab_count)

Saving the trigram model

In [None]:
# NOTE: despite the .json extension, the file written here is a pickle, not JSON text
with open(drive_directory+"trigrams.json", "wb") as model_file:
    pickle.dump(sorted_trigram_prob, model_file)

### 5. Predicting the next word

Using Bigram Model

In [None]:
# Load the pickled bigram probability table (only unpickle files you created yourself)
with open(drive_directory+"bigrams.json", "rb") as model_file:
    bigram_prob = pickle.load(model_file)

In [None]:
# The prompts get their own name so that the global `words` set used by english_words()
# is not overwritten; likewise the loop variable no longer replaces the `bigrams` counter.
query_words = ['were', 'integrated', 'treatment', 'health', 'in']
for word in query_words:
  count = 0
  print("Predicted Words for:",word)
  # Print the first 10 keys (in the table's stored order) whose first word is the prompt
  for bigram_key in bigram_prob.keys():
    if bigram_key.split(" ")[0]==word:
      print({bigram_key:bigram_prob[bigram_key]})
      count+=1
      if count == 10:
        break  # 10 predictions shown: no need to scan the rest of the table

Predicted Words for: were
{'were in': 0.050263097984826054}
{'were with': 0.03875427665411051}
{'were by': 0.03095498352307496}
{'were and': 0.028638042079531637}
{'were to': 0.027819443255924347}
{'were not': 0.025694638419894375}
{'were used': 0.024905909573085375}
{'were from': 0.02116166761658575}
{'were the': 0.02052471054377889}
{'were for': 0.019438896522327404}
Predicted Words for: integrated
{'integrated into': 0.022221611444748306}
{'integrated with': 0.012436956312614234}
{'integrated and': 0.00944109280305633}
{'integrated in': 0.007723281157667624}
{'integrated approach': 0.004686190168620391}
{'integrated the': 0.00393035304464936}
{'integrated to': 0.0036005332087347287}
{'integrated system': 0.0026523011804801626}
{'integrated care': 0.002322481344565531}
{'integrated moving': 0.002281253865076202}
Predicted Words for: treatment
{'treatment of': 0.1504473315699393}
{'treatment with': 0.05918391162284114}
{'treatment and': 0.05039287381359888}
{'treatment for': 0.0415279

Using Trigram Model

In [None]:
# Load the pickled trigram probability table (only unpickle files you created yourself)
with open(drive_directory+"trigrams.json", "rb") as model_file:
    trigram_prob = pickle.load(model_file)

In [None]:
# The prompts get their own name so that the global `words` set used by english_words()
# is not overwritten.
query_contexts = ['houses were', 'an integrated', 'and treatment', 'non health', 'work in']
for word in query_contexts:
  count=0
  print("Predictions for:",word)
  context = word.split()  # the two prompt tokens; split once instead of once per key
  # Print the first 10 keys (in the table's stored order) that start with the prompt
  for trigram in trigram_prob.keys():
    if trigram.split()[:2]==context:
      print({trigram:trigram_prob[trigram]})
      count+=1
      if count == 10:
        break  # 10 predictions shown: no need to scan the rest of the table

Predictions for: houses were
{'houses were al': 3.315759772510844e-05}
{'houses were temperature': 3.315759772510844e-05}
{'houses were for': 3.315759772510844e-05}
{'houses were reduced': 3.315759772510844e-05}
{'houses were all': 3.315759772510844e-05}
{'houses were or': 3.315759772510844e-05}
{'houses were the': 3.315759772510844e-05}
{'houses were free': 3.315759772510844e-05}
{'houses were near': 3.315759772510844e-05}
{'houses were not': 3.315759772510844e-05}
Predictions for: an integrated
{'an integrated approach': 0.0010942006932970353}
{'an integrated and': 0.0005636791450318061}
{'an integrated system': 0.0003813123628156335}
{'an integrated analysis': 0.0003647335644323451}
{'an integrated model': 0.0002321031773660378}
{'an integrated health': 0.00021552437898274938}
{'an integrated platform': 0.00018236678221617255}
{'an integrated view': 0.00014920918544959573}
{'an integrated strategy': 0.00014920918544959573}
{'an integrated one': 0.00014920918544959573}
Predictions fo

### 6. Perplexity Score

For Bigram Model

In [None]:
def perplexity_scoreb(sentence):
  """Perplexity of `sentence` under the bigram table `bigram_prob`.

  Computed as exp(-(1/N) * sum(log p)), where N is the number of
  whitespace-separated words and the sum runs over every bigram occurrence
  that has a non-zero entry in `bigram_prob`.

  NOTE: bigrams missing from the table are skipped rather than given a
  smoothed probability, so the score understates the true perplexity when the
  sentence contains unseen bigrams (a result of 1.0 means nothing was found).

  Raises
  ------
  ValueError
      If `sentence` contains no words.
  """
  N = len(sentence.split())
  if N == 0:
    raise ValueError("perplexity is undefined for an empty sentence")
  log_inverse_prob = 0.0
  # .items() keeps the occurrence count, so a repeated bigram is scored once per occurrence
  for bigram, occurrences in createBigram(word_tokenize(sentence)).items():
    # explicit lookup instead of a bare `except: pass`, which also hid real errors
    p = bigram_prob.get(" ".join(bigram))
    if p:
      # summing logs avoids the overflow a long product of 1/p can run into
      log_inverse_prob -= occurrences*math.log(p)
  return math.exp(log_inverse_prob/N)

In [None]:
# Score the first test sentence with the bigram model
sentence1 = 'that the overall code stroke volume has decreased since the covid pandemic'
perplexity_scoreb(sentence1)

29.66642255856188

In [None]:
# Score the second test sentence with the bigram model
sentence2='half a century ago hypertension was not treatable'
perplexity_scoreb(sentence2)

408.35495849352606

In [None]:
# Score the third test sentence with the bigram model
sentence3='sarahs tv is broadcast an advert for private healthcare'
perplexity_scoreb(sentence3)

94.78888092781801

For Trigram Model

In [None]:
def perplexity_scoret(sentence):
  """Perplexity of `sentence` under the trigram table `trigram_prob`.

  Computed as exp(-(1/N) * sum(log p)), where N is the number of
  whitespace-separated words and the sum runs over every trigram occurrence
  that has a non-zero entry in `trigram_prob`.

  NOTE: trigrams missing from the table are skipped rather than given a
  smoothed probability, so the score understates the true perplexity when the
  sentence contains unseen trigrams (a result of 1.0 means nothing was found).

  Raises
  ------
  ValueError
      If `sentence` contains no words.
  """
  N = len(sentence.split())
  if N == 0:
    raise ValueError("perplexity is undefined for an empty sentence")
  log_inverse_prob = 0.0
  # .items() keeps the occurrence count, so a repeated trigram is scored once per occurrence
  for trigram, occurrences in createTrigram(word_tokenize(sentence)).items():
    # explicit lookup instead of a bare `except: pass`, which also hid real errors
    p = trigram_prob.get(" ".join(trigram))
    if p:
      # summing logs avoids the overflow a long product of 1/p can run into
      log_inverse_prob -= occurrences*math.log(p)
  return math.exp(log_inverse_prob/N)

In [None]:
# Same first test sentence as in the bigram section, scored with the trigram model
sentence1 = 'that the overall code stroke volume has decreased since the covid pandemic'
perplexity_scoret(sentence1)

3.227458410316034

In [None]:
# Same second test sentence as in the bigram section, scored with the trigram model
sentence2='half a century ago hypertension was not treatable'
perplexity_scoret(sentence2)

63.514661887080486

In [None]:
# Same third test sentence as in the bigram section, scored with the trigram model
sentence3='sarahs tv is broadcast an advert for private healthcare'
perplexity_scoret(sentence3)

1.0

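We can see that the perplexity scores are lower (better) for the trigram model. Note, however, that both scoring functions skip n-grams that are missing from the model, so a score only reflects the n-grams that were found: the value of exactly 1.0 for the third sentence means that none of its trigrams were found, not that the model predicts it perfectly.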