# Language Models
1. A probabilistic approach to modelling a corpus.
2. Can be used for a variety of tasks such as next-word prediction, ASR, etc.
3. N gram analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings("ignore")

In [None]:
class LanguageModels:
  '''
  Count based n-gram language model over a list of sentences.

  build_ngrams() has to be called before any of the probability, perplexity
  or prediction methods; it fills `ngrams`, `ngrams_size` and `vocab`.
  '''

  START_TOKEN = '<S>'   # left padding, added N-1 times when N != 1
  END_TOKEN = '</S>'    # end of sentence marker, added once when N != 1

  def __init__(self,corpus):
    '''
    Constructor function to initialise the class members
    @params
    :corpus list of sentences (whitespace separated strings)
    @return
    :None
    '''
    self.corpus = corpus
    # dict: token tuple (length 1..N) -> count; filled by build_ngrams
    self.ngrams = None
    # dict: gram length -> number of distinct grams of that length
    self.ngrams_size = None
    # list of 1-tuples, one per distinct token, in first-seen order
    self.vocab = None
    # not read or written by any method of this class
    self.unseen_smooth_prob = None

  def V(self):
    '''
    Function to return the size of vocabulary
    @params
    :None
    @return
    :int number of distinct tokens, 0 before build_ngrams has been called
    '''
    return len(self.vocab) if self.vocab is not None else 0

  def preprocess(self):
    '''
    Placeholder, performs no preprocessing
    @return
    :None
    '''
    return None

  @staticmethod
  def _check_order(N):
    '''
    Reject n-gram orders below 1 (they would build or score empty grams)
    '''
    if N<1:
      raise ValueError("N must be a positive integer")

  def _require_model(self,N):
    '''
    Validate N and make sure build_ngrams has populated the counts
    '''
    self._check_order(N)
    if self.ngrams is None:
      raise ValueError("build_ngrams() must be called before using the model")

  def _pad(self,sentence,N):
    '''
    Split the sentence on whitespace and add the sentence delimiters
    '''
    tokens = sentence.split()
    if N!=1:
      tokens = [self.START_TOKEN]*(N-1) + tokens + [self.END_TOKEN]
    return tokens

  def _unigram_total(self):
    '''
    Total number of token occurrences (sum of all unigram counts)
    '''
    return sum(count for gram,count in self.ngrams.items() if len(gram)==1)

  def build_ngrams(self,N=1):
    '''
    Function to build n grams of the texts
    Counts every gram of length 1..N so lower order models are available
    for back-off; results are stored on the instance
    @params
    :N ngram value by default 1
    @return
    :None
    @raises
    :ValueError if N is smaller than 1
    '''
    self._check_order(N)

    ngrams = {}
    for sentence in self.corpus:
      tokens = self._pad(sentence,N)
      for start in range(len(tokens)):
        gram = tuple(tokens[start:start+N])
        # every prefix of the window is counted, which yields the counts of
        # all lower orders as well (windows near the end are shorter than N)
        for end in range(1,len(gram)+1):
          prefix = gram[:end]
          ngrams[prefix] = ngrams.get(prefix,0) + 1

    self.ngrams = ngrams

    size = {}
    for gram in ngrams:
      size[len(gram)] = size.get(len(gram),0) + 1
    self.ngrams_size = size

    self.vocab = [gram for gram in ngrams if len(gram)==1]

  def perplexity(self,sentence,smoothing=False,display=False,N=1):
    '''
    Function to calculate the perplexity of a sentence from its smoothed
    probability
    @params
    :sentence the sentence to score
    :smoothing kept for backward compatibility, the smoothed probability is always used
    :display print the intermediate values when True
    :N ngram value by default 1
    @return
    :perplexity rounded to 3 decimals, or 10**10 when it is unbounded
    '''
    sentence_prob,n = self.smooth_sentence_probability(sentence,display,N)

    # zero probability (or nothing scored) means unbounded perplexity; a large
    # finite sentinel is returned so averages over sentences stay computable.
    # The previous `10^10` was a bitwise XOR and evaluated to 0.
    if sentence_prob<=0 or n==0:
      if display:
        print("For the given sentence, the perplexity for",N,"gram =","inf")
      return 10**10

    #normalised probability is geometric mean of joint probability
    normalised_prob = (sentence_prob)**(1/n)
    #perplexity is inverse of normalised probability
    perplexity = round(1/normalised_prob,3)
    if display:
      print("For the given sentence, the perplexity for",N,"gram =",perplexity)

    return perplexity

  def sentence_probability(self, sentence,smoothing=False, N=1):
    '''
    Function to calculate the unsmoothed (maximum likelihood) probability of
    the sentence based on n gram analysis; prints every gram it scores
    @params
    :sentence the sentence to which we need to calculate probability
    :smoothing unused, unseen grams always get probability 0 here
    :N ngram value by default 1
    @return
    :tuple (probability value [0-1], number of grams with non-zero probability)
    '''
    self._require_model(N)
    tokens = self._pad(sentence,N)
    ngrams_list = self.ngrams
    total = self._unigram_total()  # loop invariant, computed once

    prob_sentence = 1
    n_tokens = 0

    # only full length windows are scored
    for start in range(len(tokens)-N+1):
      gram = tuple(tokens[start:start+N])
      lower_gram = gram[:-1]
      if gram in ngrams_list and len(lower_gram)==0:
        prob_gram = ngrams_list[gram]/total
        n_tokens += 1
      elif gram in ngrams_list and lower_gram in ngrams_list:
        prob_gram = ngrams_list[gram]/ngrams_list[lower_gram]
        n_tokens += 1
      else:
        # unseen gram: no smoothing in this method
        prob_gram = 0

      print(gram,lower_gram,prob_gram)
      prob_sentence = prob_sentence * prob_gram

    print("\nGiven sentence:",sentence)
    print("The probability of the given sentence is:",prob_sentence)

    return prob_sentence,n_tokens

  def get_backoff(self, lower_gram,D=0.75):
    '''
    Function to compute the back-off weight of a context:
    D * (number of distinct grams extending the context) / (their total count)
    @params
    :lower_gram the context as a sequence of tokens
    :D discount value by default 0.75
    @return
    :back-off weight, 0 when the context has no recorded extension
    '''
    context = tuple(lower_gram)
    order = len(context)+1

    counts = 0
    total_counts = 0
    for gram,count in self.ngrams.items():
      if len(gram)==order and gram[:-1]==context:
        counts += 1
        total_counts += count

    if total_counts==0:
      return 0
    return (D*counts)/total_counts

  def next_word_prediction(self,sentence,N=1):
    '''
    Function to predict next word of sentence based on ngrams
    The last N-1 words are used as context; when no continuation of that
    context was seen the context is shortened until one is found
    @params
    :sentence the sentence to which we need to predict next word
    :N ngram value by default 1
    @return
    :str next word
    @raises
    :ValueError if the model has not been built or has nothing to predict
    '''
    self._require_model(N)

    tokens = sentence.split()
    # tokens[-0:] would be the whole sentence, so N=1 needs an explicit empty context
    context = tuple(tokens[-(N-1):]) if N>1 else ()

    next_word = None
    while True:
      max_vocab = 0
      for word in self.vocab:
        # the start padding can never be a meaningful continuation
        if word[0]==self.START_TOKEN:
          continue
        count = self.ngrams.get(context+word,0)
        if count>max_vocab:
          max_vocab = count
          next_word = word
      if next_word is not None or not context:
        break
      context = context[1:]  # back off to a shorter context

    if next_word is None:
      raise ValueError("the model has no word to predict")

    print("\nGiven sentence:",sentence)
    print("The next word could be:",next_word[0])
    return next_word[0]

  def _backoff_probability(self,gram,D,total):
    '''
    Probability of gram[-1] given gram[:-1] with absolute discounting.
    A seen gram gets (count-D)/count(context); an unseen gram backs off to
    the next shorter gram, scaled by the back-off weight of its context.
    '''
    ngrams = self.ngrams
    weight = 1.0
    while len(gram)>1:
      context = gram[:-1]
      if gram in ngrams and context in ngrams:
        return weight*(ngrams[gram]-D)/ngrams[context]
      if context in ngrams:
        backoff = self.get_backoff(context,D)
        # a context without recorded extensions carries no information and is
        # treated like an unseen context (weight left unchanged)
        if backoff:
          weight *= backoff
      gram = gram[1:]

    if not total:
      return 0.0  # empty model
    if gram in ngrams:
      return weight*ngrams[gram]/total
    # unseen word: keep the original heuristic floor derived from the end of
    # sentence count; count 1 is used when the model has no such marker (N=1).
    # NOTE(review): this floor can exceed the probability of rare seen words.
    floor_count = ngrams.get((self.END_TOKEN,),1)
    return weight*(floor_count-D)/total

  def smooth_sentence_probability(self, sentence, display=False,N=1):
    '''
    Function to calculate the probability of the sentence with absolute
    discounting (D=0.25) and back-off to lower order grams for unseen grams
    @params
    :sentence the sentence to which we need to calculate probability
    :display print every scored gram and the final probability when True
    :N ngram value by default 1
    @return
    :tuple (probability value [0-1], number of grams scored)
    '''
    self._require_model(N)
    tokens = self._pad(sentence,N)
    total = self._unigram_total()

    prob_sentence = 1
    D = 0.25
    n_tokens = 0
    # only full length windows are scored
    for start in range(len(tokens)-N+1):
      gram = tuple(tokens[start:start+N])
      prob_gram = self._backoff_probability(gram,D,total)
      # every factor of the product is counted so the geometric mean used for
      # the perplexity has the right exponent
      n_tokens += 1
      if display:
        print(gram,gram[:-1],prob_gram)
      prob_sentence = prob_sentence * prob_gram
    if display:
      print("\nGiven sentence:",sentence)
      print("The probability of the given sentence is:",prob_sentence)

    return prob_sentence,n_tokens


## Preparing corpus

In [None]:
# Load the report table from a hard-coded absolute path; later cells read its
# 'Features' (report text) and 'Birads' (class label) columns.
df = pd.read_csv("/content/mammoreport - mammoreport.csv")
# Preview the first rows.
df.head()

Unnamed: 0,Features,Birads
0,Soft tissue mass lesion (23 x 20 mm) with spic...,4
1,Parenchyma is predominantly FATTY.No distinctl...,1
2,Parenchyma is predominantly GLANDULAR. No dist...,1
3,Parenchyma is predominantly GLANDULAR. No dist...,2
4,Parenchyma is GLANDULAR and FATTY. Small subce...,2


In [None]:
# Column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Features  107 non-null    object
 1   Birads    107 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


In [None]:
# Number of reports per 'Birads' label (class balance).
df['Birads'].value_counts()

2    37
1    34
4    12
3    12
5     7
0     3
6     2
Name: Birads, dtype: int64

In [None]:
# Build the sentence corpus: every report is split on "." and each fragment is
# stripped of punctuation; fragments of at most two characters are dropped.
# NOTE(review): the [3:] slice skips the first three reports - confirm that this
# is intentional.

# Matches every character that is neither a word character nor whitespace.
# Compiled once instead of once per sentence.
punctuation_pattern = re.compile(r'[^\w\s]')

corpus = []
for record in df['Features'].values[3:]:
  for sentence in record.split("."):
    # Remove punctuation from the fragment
    result = punctuation_pattern.sub('', sentence)

    # Drop the single space that follows the previous sentence's full stop
    if result and result[0]==' ':
      result = result[1:]

    if len(result)>2:
      corpus.append(result)

In [None]:
# Show the first five cleaned sentences as a sanity check.
print(corpus[:5])

['Parenchyma is predominantly GLANDULAR', 'No distinctly identifiable radio opaque mass', 'No parenchymal calcification', 'Vascular pattern appears normal', 'Pectoralis and retromammary space appear normal']


## Sentence Probability Leave one out cross validation

In [None]:
# Example run
print("Sentence Probability")
print("BiGram")
lm = LanguageModels(corpus)
lm.build_ngrams(N=2)
lm.perplexity("Parenchyma is predominantly",display=True,N=2)
print("-"*80)
print("TriGram")
lm = LanguageModels(corpus)
lm.build_ngrams(N=3)
lm.perplexity("Parenchyma is predominantly",display=True,N=3)
print("-"*80)
print("4 Gram")
lm = LanguageModels(corpus)
lm.build_ngrams(N=4)
lm.perplexity("Parenchyma is predominantly",display=True,N=4)
print("-"*80)

Sentence Probability
BiGram
('<S>', 'Parenchyma') ('<S>',) 0.0711344922232388
('Parenchyma', 'is') ('Parenchyma',) 0.9040697674418605
('is', 'predominantly') ('is',) 0.859375
('predominantly', '</S>') ('predominantly',) 1.0328449905482042

Given sentence: Parenchyma is predominantly
The probability of the given sentence is: 0.05708211355513701
For the given sentence, the perplexity for 2 gram = 2.046
--------------------------------------------------------------------------------
TriGram
('<S>', '<S>', 'Parenchyma') ('<S>', '<S>') 0.0711344922232388
('<S>', 'Parenchyma', 'is') ('<S>', 'Parenchyma') 0.9967948717948718
('Parenchyma', 'is', 'predominantly') ('Parenchyma', 'is') 0.8685897435897436
('is', 'predominantly', '</S>') ('is', 'predominantly') 0.10467956700833413

Given sentence: Parenchyma is predominantly
The probability of the given sentence is: 0.006447073852816596
For the given sentence, the perplexity for 3 gram = 5.373
-------------------------------------------------------

In [None]:
# Leave-one-out cross validation: every sentence is scored by models that were
# trained on all the other sentences.
bigram_preplexity = []
trigram_perplexity = []
quadgram_perplixity = []

# (n-gram order, result list) pairs so all orders are evaluated on the same folds
perplexity_by_order = (
  (2, bigram_preplexity),
  (3, trigram_perplexity),
  (4, quadgram_perplixity),
)

# `held_out` replaces the former loop variable `iter`, which rebound the builtin
for held_out in range(len(corpus)):

  train = corpus[:held_out] + corpus[held_out+1:]
  test = corpus[held_out]

  for order, scores in perplexity_by_order:
    # Every order is trained on `train`. The tri- and quad-gram models used to
    # be built from the full `corpus`, which contains the held-out sentence and
    # therefore made their perplexities look better than they are.
    # A separate name is used so the `lm` built in the earlier cell is left
    # untouched for the cells that follow.
    fold_model = LanguageModels(train)
    fold_model.build_ngrams(N=order)
    scores.append(fold_model.perplexity(test,N=order))


In [None]:
# Mean perplexity over all held-out sentences, one line per n-gram order.
for caption, scores in (
  ("Average Perplexity over the sentences for bi gram:", bigram_preplexity),
  ("Average Perplexity over the sentences for tri gram:", trigram_perplexity),
  ("Average Perplexity over the sentences for quad gram:", quadgram_perplixity),
):
  print(caption, sum(scores)/len(scores))

Average Perplexity over the sentences for bi gram: 3.292308325709077
Average Perplexity over the sentences for tri gram: 2.557284537968889
Average Perplexity over the sentences for quad gram: 2.3617557182067737


## Inference:
  <b>The lower the perplexity, the better the model; the quadgram model seems to be a better choice than the bigram and trigram models.</b>

## Next word Prediction

In [None]:
# `lm` is the model left over from the earlier cells (last built with N=4).
# Querying it with N=3 works because build_ngrams also stores the counts of
# all lower-order grams.
lm.next_word_prediction("Parenchyma is",N=3)


Given sentence: Parenchyma is
The next word could be: predominantly


'predominantly'

## Text Generation

In [None]:
# Greedy text generation: repeatedly append the most likely next word until
# the requested number of words is reached or the end marker is predicted.
n_words = int(input("No of words to be generated:"))
start_word = input("Starting word:")

generated = [start_word]
for _ in range(n_words):
  candidate = lm.next_word_prediction(" ".join(generated),N=3)
  if candidate=='</S>':
    # the model predicts the end of the sentence
    break
  generated.append(candidate)

final = " ".join(generated)
print("----------------------------Final Generated Text-------------------------------")
print(final)

No of words to be generated:5
Starting word:Pectoralis

Given sentence: Pectoralis
The next word could be: and

Given sentence: Pectoralis and
The next word could be: retromammary

Given sentence: Pectoralis and retromammary
The next word could be: space

Given sentence: Pectoralis and retromammary space
The next word could be: appear

Given sentence: Pectoralis and retromammary space appear
The next word could be: normal
----------------------------Final Generated Text-------------------------------
Pectoralis and retromammary space appear normal


## Text Classification using Language Models

### Approach:
  1. Divide the datasets based on class labels
  2. Construct a language model for each class label
  3. Given a test text, compute its joint probability under each class's language model.
  4. The class whose language model assigns the highest probability is the predicted class of the test text.

In [None]:
#Train Test Split
train_df = df.sample(frac=1)
test_df = df.iloc[list(set(df.index)-set(train_df.index))]

In [None]:
# Sizes of the full table, the train split and the test split.
print(len(df))
print(len(train_df))
print(len(test_df))

107
107
0


In [None]:
#Corpus generation

train_corpus_class_wise = {}
test_corpus_class_wise = {}
labels = train_corpus_class_wise.keys()

for label in df["Birads"].value_counts().index:
  train_corpus_class_wise[label] = []
  test_corpus_class_wise[label] = []

for label in labels:
  for record in train_df[train_df['Birads']==label]['Features'].values:
    temp = record.split(".")
    sentences = []
    for sentence in temp[:len(temp)-1]:
      # Define a regular expression pattern to match punctuation
      punctuation_pattern = re.compile(r'[^\w\s]')

      # Use the sub() method to replace punctuation with a null space
      result = punctuation_pattern.sub('', sentence)

      if result and result[0]==' ':
        result = result[1:]

      if len(result)>2:
        train_corpus_class_wise[label].append(result)

for label in labels:
  for record in train_df[train_df['Birads']==label]['Features'].values:
    temp = record.split(".")
    sentences = []
    for sentence in temp[len(temp)-1:]:
      # Define a regular expression pattern to match punctuation
      punctuation_pattern = re.compile(r'[^\w\s]')

      # Use the sub() method to replace punctuation with a null space
      result = punctuation_pattern.sub('', sentence)

      if result and result[0]==' ':
        result = result[1:]

      if len(result)>2:
        test_corpus_class_wise[label].append(result)

print(train_corpus_class_wise)
print(test_corpus_class_wise)

{2: ['Parenchyma is predominantly GLANDULAR', 'A small well defined radiopaque shadow seen in the superior and outer quadrant of the right breast', 'No spiculated margins  parenchymal calcifications', 'Vascular pattern appears normal', 'Pectoralis and retromammary space appear normal', 'Nippleareolar outline skin  subcutaneous fat plane are normal', 'No skin thickening or irregularity', 'No enlargement of axillary lymphnodes seen', 'A small well defined radiopaque shadow seen in the superior and outer quadrant of the right breast', 'No spiculated margins  parenchymal calcifications', 'No axillary lymphadenopathy on either side', 'Parenchyma is predominantly dense glandular', 'No distinctly identifiable radio opaque mass', 'No parenchymal calcification', 'Vascular pattern appears normal', 'Pectoralis and retromammary space appear normal', 'Nippleareolar outline skin  subcutaneous fat plane are normal', 'No skin thickening or irregularity', 'No enlargement of axillary lymphnodes seen', '

In [None]:
#Class wise language model generation
language_class_wise = {}

for label in df["Birads"].value_counts().index:
  language_class_wise[label] = LanguageModels(train_corpus_class_wise[label])
  language_class_wise[label].build_ngrams(N=3)

In [None]:
# Flatten the class-wise test corpus into parallel lists:
# test_data[k] is a sentence and y_true[k] is its class label.
test_data = []
y_true = []

for label, held_out_sentences in test_corpus_class_wise.items():
  y_true.extend([label]*len(held_out_sentences))
  test_data.extend(held_out_sentences)

print("length of y_true",len(y_true))
print("length of test data",len(test_data))

length of y_true 8
length of test data 8


In [None]:
# Classify every test sentence: score it with each class's language model and
# predict the class whose model gives the highest probability.
y_pred = []

for i in test_data:
  temp = {}
  for model in language_class_wise:
    # The class models are built with N=3, so they are scored with N=3 as well
    # (the default N=1 only used their unigram counts).
    temp[model]=language_class_wise[model].smooth_sentence_probability(i,N=3)[0]
    print("For class:",model,"Probability is:",temp[model])
  # max() returns the first label on ties, i.e. the first key of language_class_wise
  y_pred.append(max(temp, key=temp.get))

For class: 2 Probability is: 1.5758782873939244e-13
For class: 1 Probability is: 3.4067859981405407e-14
For class: 4 Probability is: 5.491241151993133e-12
For class: 3 Probability is: 1.1659378400017468e-11
For class: 5 Probability is: 2.420542775584304e-12
For class: 0 Probability is: 8.849114532292286e-12
For class: 6 Probability is: 1.2887371665710715e-10
