# Language Modeling using Ngram

In this Exercise, you are going to use NLTK which is a natural language processing library for python to create a bigram language model and its variation. You will build one model for each of the following type and calculate their perplexity:
- Unigram Model
- Bigram Model
- Bigram Model with add one estimation
- Bigram Model with Interpolation
- Bigram Model with Kneser-ney Interpolation
- Neural LM



In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# #download corpus
# import shutil
# shutil.copy("/content/drive/MyDrive/FRA 501 IntroNLP&DL/Dataset/BEST2010.zip", "/content/BEST2010.zip")
# !unzip BEST2010.zip

In [3]:
#First we import necessary library such as math, nltk, bigram, and collections.
import math
import nltk
import io
import random
from random import shuffle
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
random.seed(999)

BEST2010 is a free Thai NLP dataset by NECTEC usually use as a standard benchmark for various NLP tasks includeing language modeling. BEST2010 is separated into 4 domain article, encyclopedia, news and novel. The data is already  tokenized using '|' as a separator.

For example,

ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น|

In [4]:
# We choose news domain as our dataset
best2010=[]
fp= io.open('Dataset/BEST2010/news.txt','r',encoding='utf-8')
for i,line in enumerate(fp):
    best2010.append(line.strip()[:-1])
fp.close()
all_vocabulary =set()
total_word_count =0
for line in best2010:
    for word in line.split('|'):        
        all_vocabulary.add(word)
        total_word_count+=1
print(best2010[:3])

['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง', 'ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น', 'หลัง|เข้า|เยี่ยม|ดู|อาการ|ผู้|ป่วย|แล้ว| |น.พ.จรัล|ประชุม|ร่วม|กับ|เจ้าหน้าที่|ทุก|ฝ่าย| |เพื่อ|สรุป|ผล|การ|ดำเนิน|การ| |รวม|ทั้ง|สอบสวน|โรค|ก่อน|ที่|ผู้|ป่วย|จะ|ถูก|ส่ง|มา|รักษา|ตัว| |จาก|นั้น|ร่วม|กัน|แถลง|ข่าว| |โดย| |น.พ.จรัล|กล่าว|ว่า| |ขณะ|นี้|ผู้|ป่วย|ทั้ง| |3| |ราย| |อาการ|ยัง|ทรง| |โดย|ใน|ราย|ของ| |ด.ช.กิตติพงษ์| |กับ| |ด.ญ.กาญจนา| |ปอด|หาย|เป็น|ปกติ|แล้ว| |คาด|ว่า|จะ|กลับ|บ้าน|ได้|ใน|ไม่|ช้า|นี้| |แต่|ใน|ราย|ของ|นางประนอม|อาการ|ยัง|น่า|เป็นห่วง| |ซึ่ง|ทั้ง| |3| |ราย| |ใน|ชั้น|นี้|ถือ|ว่า|เป็น|ผู้|ป่วย|อยู่|ใน|ขั้น|น่า|สงสัย|อาจ|ติด|เชื้อ|ไข้|หวัด|นก| |เพราะ|ตรวจ|พบ|ผู้|ป่วย|มี|อาการ|ปอด|บวม|ปอด|อักเสบ| |เนื่อง|จาก|ติด|เชื้อ|ไวรัส| |แต่|ยัง|สรุป|ไม่|ได้|ว่า|ติด|เชื้อ|ไข้|หวัด|นก|แน่ชัด|หรือ|ไม่| |ต้อง|รอ|ผล|ตรวจ|จาก|ห้อง|ปฏิบัติการ|ที่|ได้|ส่ง|ตัวอย่าง|เลือด| |ไป|ตรวจ|พิสูจน์|ที่|กรมวิทยาศาส

In [5]:
#For simplicity, we assumes that each line is a sentence.
print ('Total sentences in BEST2010 news dataset :\t'+ str(len(best2010)))
print ('Total word counts in BEST2010 news dataset :\t'+ str(total_word_count))
print ('Total vocabulary in BEST2010 news dataset :\t'+ str(len(all_vocabulary)))

Total sentences in BEST2010 news dataset :	30969
Total word counts in BEST2010 news dataset :	1660190
Total vocabulary in BEST2010 news dataset :	35488


We separate out input into 2 sets, train and test data with 70:30 ratio

In [6]:
sentences = best2010
# The data is separated to train and test set with 70:30 ratio.
train = sentences[:int(len(sentences)*0.7)]
test = sentences[int(len(sentences)*0.7):]

#Training data
train_vocabulary =set()
train_word_count =0
for line in train:
    for word in line.split('|'):        
        train_vocabulary.add(word)
        train_word_count+=1
print ('Total sentences in BEST2010 news training dataset :\t'+ str(len(train)))
print ('Total word counts in BEST2010 news training dataset :\t'+ str(train_word_count))
print ('Total vocabuary in BEST2010 news training dataset :\t'+ str(len(train_vocabulary)))
# We will use 1/vocab_size as a default value for unknown word
unk_value = math.pow(len(train_vocabulary),-1)
print(unk_value)

Total sentences in BEST2010 news training dataset :	21678
Total word counts in BEST2010 news training dataset :	1042797
Total vocabuary in BEST2010 news training dataset :	26240
3.8109756097560976e-05


# Unigram

In this section, we will demonstrate how to build a unigram language model <br>
**Important note:** <br>
**\<s\>** = sentence start symbol <br>
**\</s\>** = sentence end symbol 

In [7]:
def getUnigramModel(data,bool=0):
    if bool == 1:
        model = defaultdict(lambda: 0)
        print("model " + str(model))
        word_count =0
        for sentence in data:
            print("before += " +str(sentence))
            sentence +=  u'|</s>' #for unigram model we can always ignore <s>, since p(w0=<s>)=1
            print("after +=" +str(sentence))
            for w1 in sentence.split('|'):
                model[w1] +=1.0
                print("model[w1] " + str(model))
                word_count+=1
        for w1 in model:
            model[w1] = model[w1]/(word_count)
            print("model[w1]/(wordcount) "+str(model))
        return model
    else :
        model = defaultdict(lambda: 0)
        word_count =0
        for sentence in data:
            sentence +=  u'|</s>' #for unigram model we can always ignore <s>, since p(w0=<s>)=1
            for w1 in sentence.split('|'):
                model[w1] +=1.0
                word_count+=1
        for w1 in model:
            model[w1] = model[w1]/(word_count)
        return model

In [8]:
GeeGee = sentences[:int(len(sentences)*0.00005)]
print(len(GeeGee))
print(GeeGee)
dd = getUnigramModel(GeeGee,1)
dd
# print(dd[u'นายก'])

1
['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง']
model defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {})
before += สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง
after +=สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง|</s>
model[w1] defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {'สงสัย': 1.0})
model[w1] defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {'สงสัย': 1.0, 'ติด': 1.0})
model[w1] defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {'สงสัย': 1.0, 'ติด': 1.0, 'หวัด': 1.0})
model[w1] defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {'สงสัย': 1.0, 'ติด': 1.0, 'หวัด': 1.0, 'นก': 1.0})
model[w1] defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {'สงสัย': 1.0, 'ติด': 1.0, 'หวัด': 1.0, 'นก': 1.0, ' ': 1.0})
model[w1] defaultdict(<function getUnigramModel.<locals>.<lambda> at 0x000001EE8D43F4C0>, {'สงสัย': 1.0, 'ติด': 

defaultdict(<function __main__.getUnigramModel.<locals>.<lambda>()>,
            {'สงสัย': 0.09090909090909091,
             'ติด': 0.09090909090909091,
             'หวัด': 0.09090909090909091,
             'นก': 0.09090909090909091,
             ' ': 0.09090909090909091,
             'อีก': 0.09090909090909091,
             'คน': 0.09090909090909091,
             'ยัง': 0.09090909090909091,
             'น่า': 0.09090909090909091,
             'ห่วง': 0.09090909090909091,
             '</s>': 0.09090909090909091})

In [9]:
model = getUnigramModel(train)

In [10]:
def getLnValue(x):
    if x >0.0:
        return math.log(x)
    else:
        return math.log(unk_value)

In [11]:
#problability of 'นายก'
print(getLnValue(model[u'นายก']))
#for example, problability of 'นายกรัฐมนตรี' which is an unknown word is equal to
print(getLnValue(model[u'นายกรัฐมนตรี']))
#problability of 'นายก' 'ได้' 'ให้' 'สัมภาษณ์' 'กับ' 'สื่อ'
prob = getLnValue(model[u'นายก'])+getLnValue(model[u'ได้'])+ getLnValue(model[u'ให้'])+getLnValue(model[u'สัมภาษณ์'])+getLnValue(model[u'กับ'])+getLnValue(model[u'สื่อ'])+getLnValue(model['</s>'])
print ('Problability of a sentence', math.exp(prob))


-6.551526663995246
-10.175040243058024
Problability of a sentence 5.617210748667918e-18


## TODO #1 **Calculate perplexity**

In order to compare language model we need to calculate perplexity. In this task you should write a perplexity calculation code for the unigram model. The result perplexity should be around 556.39 and
476.07 on train and test data.

In [12]:
Unigram_model = getUnigramModel(train)

In [13]:
def calculate_sentence_ln_prob(sentence, model):
    word = sentence.split('|')
    ln_prob = 0
    # for ไล่เเต่ละคำใน sentence เพื่อคำนวณ LnValue --> sum ln_prob ทุกคำ
    for i in word: 
        ln_prob += getLnValue(model[i])
    # return ln_prob
    return ln_prob

def perplexity_unigram(test,model):
    ln_prob = 0
    word_count = 0
    # for ไล่เเต่ละ sentence --> คำนวณ calculate_sentence_ln_prob ของเเต่ละ sentence --> sum ln_prob ทุก sentence
    for sentence in test:
        sentence += u'|</s>'
        ln_prob += calculate_sentence_ln_prob(sentence,model)
        word_count += len(sentence.split('|'))
    # return exp(-ln_prob/word_count)
    return math.exp(-ln_prob/word_count)

In [14]:
print(perplexity_unigram(train,Unigram_model))
print(perplexity_unigram(test,Unigram_model))

556.3925994212195
476.0687892303532


# Bigram

Next, you will create a better language model than a unigram (which is not much to compare with). But first, it is very tedious to count every pair of words that occur in our corpus by ourselves. In this case, nltk provide us a simple library which will do it for us.

In [15]:
#example of nltk usage for bigram
sentence = 'I always search google for an answer .'

print('This is how nltk generate bigram.')
for w1,w2 in bigrams(sentence.split(), pad_right=True, pad_left=True):
    print (w1,w2)
print('None is used as a start and end of sentence symbol.')

This is how nltk generate bigram.
None I
I always
always search
search google
google for
for an
an answer
answer .
. None
None is used as a start and end of sentence symbol.


Now, you should be able to implement a bigram model by yourself. Also, you must create a new perplexity calculation for bigram. The result perplexity should be around 58.78 and 146.26 on train and test data.

## TODO #2 **Create a Bigram Model**

In [16]:
def getBigramModel(data):
    ###FILL YOUR CODE HERE###
    unigram_count = defaultdict(lambda: 0.0)
    bigram_count = defaultdict(lambda: 0.0)
    model = defaultdict(lambda: 0.0)
    # for เเต่ละ sentence
    #   for เเต่ละ token ในรูปเเบบ bigram ที่ generate ขึ้นมา
    #     bigram_count[?] = ?
    #     unigram_count[?] = ?
    for sentence in data:
        for w1,w2 in bigrams(sentence.split('|'), pad_right=True, pad_left=True):
            bigram_count[w1,w2] += 1.0
            unigram_count[w1] += 1

    # for ไล่เเต่ละ token ใน bigram ทั้งหมด
    #   model[?] = ?

    for i in bigram_count:
        model[i] = bigram_count[i]/unigram_count[i[0]]
    return model

Bigram_model = getBigramModel(train)
# Bigram_model = getBigramModel(['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง','สงสัย|ติด|หวัด|นก'])
# Bigram_model

## TODO #3 **Calculate Perplexity for Bigram Model**



In [17]:
def calculate_sentence_ln_prob(sentence, model):
    # คำนวณจาก getBigramModel อย่าเติม <s> หรือ </s> เอง !!!
    # for ไล่เเต่ละคำใน sentence เพื่อคำนวณ LnValue --> sum ln_prob ทุกคำ
    ln_prob = 0
    for w1,w2 in bigrams(sentence.split('|'), pad_right=True, pad_left=True):
        ln_prob += getLnValue(model[w1,w2])
        # return ln_prob
    return ln_prob

def perplexity_bigram(test,model):
    ln_prob = 0
    word_count = 0
    # for ไล่เเต่ละ sentence --> คำนวณ calculate_sentence_ln_prob ของเเต่ละ sentence --> sum ln_prob ทุก sentence
    for sentence in test:
        word_count += len(sentence.split('|')) + 1 # The number of words that are even
        ln_prob += calculate_sentence_ln_prob(sentence,model)
    
    return math.exp(-ln_prob/word_count)

In [18]:
def calculate_sentence_ln_prob(sentence, model):
    # คำนวณจาก getBigramModel อย่าเติม <s> หรือ </s> เอง !!!
    # for ไล่เเต่ละคำใน sentence เพื่อคำนวณ LnValue --> sum ln_prob ทุกคำ
    ln_prob = 0
    for w1,w2 in bigrams(sentence.split('|'), pad_right=True, pad_left=True):
        ln_prob += getLnValue(model[w1,w2])
        # return ln_prob
    return ln_prob

def perplexity_bigram(test,model):
    ln_prob = 0
    word_count = 0
    # for ไล่เเต่ละ sentence --> คำนวณ calculate_sentence_ln_prob ของเเต่ละ sentence --> sum ln_prob ทุก sentence
    for sentence in test:
        word_count += len(sentence.split('|')) + 1 # The number of words that are even
        ln_prob += calculate_sentence_ln_prob(sentence,model)
    
    return math.exp(-ln_prob/word_count)

In [19]:
print (perplexity_bigram(train,Bigram_model) )
print (perplexity_bigram(test, Bigram_model))

# 58.78942889767147
# 146.26539331038614

58.78942889767147
146.26539331038614


# Smoothing

Usually any ngram models have a sparsity problem, which means it does not have every possible ngram of words in the dataset. Smoothing techniques can alleviate this problem. In this section, you will implement two basic smoothing methods laplace smoothing and interpolation for bigram.

## TODO #4 **Bigram with add-one estimation**

In [20]:
#Laplace Smoothing
def getBigramWithAddOneEstimation(data):
    ###FILL YOUR CODE HERE###
    unigram_count = defaultdict(lambda: 0.0)
    bigram_count = defaultdict(lambda: 0.0)
    model = defaultdict(lambda: 0.0)
    # for เเต่ละ sentence
    #   for เเต่ละ token ในรูปเเบบ bigram ที่ generate ขึ้นมา
    #     bigram_count[?] = ?
    #     unigram_count[?] = ?
    for sentence in data:
        for w1,w2 in bigrams(sentence.split('|'), pad_right=True, pad_left=True):
            bigram_count[w1,w2] += 1.0
            unigram_count[w1] += 1
  
    # for ไล่เเต่ละ token ใน bigram ทั้งหมด
    #   model[?] = ?
    for i in bigram_count:
        model[i] = (bigram_count[i]+1)/(unigram_count[i[0]]+len(unigram_count))
    return model

AddOne_model = getBigramWithAddOneEstimation(train)
print (perplexity_bigram(train,AddOne_model) )
print (perplexity_bigram(test, AddOne_model))

# 974.8134581679766
# 1098.1622194979489

974.8134581679766
1098.1622194979489


## TODO #5 **Bigram with Interpolation**
lambda value is 0.7 for bigram, 0.25 for unigram, and 0.05 for unknown word

In [21]:
#interpolation
def getBigramWithInterpolation(data):
    #Fill code here
    unigram_count = defaultdict(lambda: 0.0)
    bigram_count = defaultdict(lambda: 0.0)
    model = defaultdict(lambda: 0.0)
    word_count = 0
    lambda2 = 0.7
    lambda1 = 0.25
    lambda0 = 0.05
    # for เเต่ละ sentence
    #   for เเต่ละ token ใน bigram ที่ generate ขึ้นมา
    #     bigram_count[?] = ?
    #     unigram_count[?] = ?
    for sentence in data:
        for w1,w2 in bigrams(sentence.split('|'), pad_right=True, pad_left=True):
            bigram_count[w1,w2] += 1.0
            unigram_count[w1] += 1
            if w1 != None:
                word_count +=1
    print('word_count '+'\t : '+str(word_count))
    print("Unigram_count "+'\t : '+str(len(unigram_count)))
    print("Bigram_count "+'\t : '+str(len(bigram_count)))
    print("Unknow_Word "+'\t : '+str(1/len(unigram_count)))
    # for เเต่ละ key ใน bigrams
    #   bigram_prob
    #   unigram_prob
    #   model[key] = สูตร bigram, unigram, unk_value (1/vocab)
    for i in bigram_count:
        bigram_prob = bigram_count[i]/unigram_count[i[0]]
        unigram_prob = unigram_count[i[1]]/(word_count)
        model[i] = (lambda2*bigram_prob)+(lambda1*unigram_prob)+(lambda0*(1/(len(unigram_count)-1)))
    return model
    
inter_model = getBigramWithInterpolation(train)
print (perplexity_bigram(train,inter_model))        
print (perplexity_bigram(test,inter_model))

# 73.38409869825665
# 172.67485908813356
# 3.8109756097560976e-05

word_count 	 : 1042797
Unigram_count 	 : 26241
Bigram_count 	 : 224393
Unknow_Word 	 : 3.810830379939789e-05
73.38409869825665
172.67485908813356


# Language modeling on multiple domains

Sometimes, we do not have enough data to create a language model for a new domain. In that case, we can improvised by combining several models to improve result on the new domain.

In this exercise you will try to merge two language models from news and article domains to create a language model for the encyclopedia domain.

In [22]:
# create article data
encyclo_data=[]
fp= io.open('Dataset/BEST2010/encyclopedia.txt','r',encoding='utf-8')
for i,line in enumerate(fp):
    encyclo_data.append(line.strip()[:-1])
fp.close()
print("len encyclo "+"\t : "+str(len(encyclo_data)))
print(encyclo_data[:3])

len encyclo 	 : 50631
['\ufeff|เครื่องมือ|ทุ่น|แรง|และ|เครื่อง|จักรกล', 'เครื่องมือ|ทุ่น|แรง', 'หมาย|ถึง| |เครื่องมือ|ที่|เกษตรกร|ใช้|ใน|การ|ทำ|งาน|เกษตร|เพื่อ|ช่วย|ลด|ความ|ยากลำบาก|ใน|การ|ทำ|งาน']


First, you should try to calculate perplexity of your bigram with interpolation using "news data" (train) on "encyclopedia data" (test). The result perplexity should be around 727.35.

For your information, a bigram model with interpolation using "ariticle data" (train) to test on "encyclopedia data" (test) has a perplexity of 505.79.

In [23]:
article_data=[]
fp= io.open('Dataset/BEST2010/article.txt','r',encoding='utf-8')
for i,line in enumerate(fp):
    article_data.append(line.strip()[:-1])
fp.close()
news_data=[]
fp= io.open('Dataset/BEST2010/news.txt','r',encoding='utf-8')
for i,line in enumerate(fp):
    news_data.append(line.strip()[:-1])
fp.close()
print("len(article)"+"\t : "+str(len(article_data)))
print(article_data[:3])
print("len(news)"+"\t : "+str(len(news_data)))
print(news_data[:2])

len(article)	 : 16990
['กฎหมาย|กับ|การ|เบียดบัง|คน|จน', 'จาก|ต้นฉบับ|เรื่อง| |"|บท|นำ|:| |คน|จน|ภาย|ใต้|ความ|สัมพันธ์|ทาง|กฎหมาย|"', 'ไพสิฐ พาณิชย์กุล']
len(news)	 : 30969
['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง', 'ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น']


In [24]:
sentences = article_data
# The data is separated to train and test set with 70:30 ratio.
article_train = sentences[:int(len(sentences)*0.7)]
article_test = sentences[int(len(sentences)*0.7):]
sentences = news_data
News_train = sentences[:int(len(sentences)*0.7)]
News_test = sentences[int(len(sentences)*0.7):]

#Training data
train_vocabulary =set()
train_word_count =0
for line in article_train:
    for word in line.split('|'):        
        train_vocabulary.add(word)
        train_word_count+=1
print ('Total sentences in article_train news training dataset :\t'+ str(len(article_train)))
print ('Total word counts in word_count news training dataset : \t'+ str(train_word_count))
print ('Total vocabuary in vocabulary_train news training dataset :\t'+ str(len(train_vocabulary)))

Total sentences in article_train news training dataset :	11893
Total word counts in word_count news training dataset : 	821549
Total vocabuary in vocabulary_train news training dataset :	20669


In [25]:
inter_article = getBigramWithInterpolation(article_train)           #article train 70%
inter_news = getBigramWithInterpolation(News_train)                 #News train 70%

word_count 	 : 821549
Unigram_count 	 : 20670
Bigram_count 	 : 172841
Unknow_Word 	 : 4.837929366231253e-05
word_count 	 : 1042797
Unigram_count 	 : 26241
Bigram_count 	 : 224393
Unknow_Word 	 : 3.810830379939789e-05


In [26]:
# print perplexity of bigram with interpolation on article data        
# 727.3502637212223
print (perplexity_bigram(encyclo_data,inter_news))
print (perplexity_bigram(encyclo_data,inter_article))

727.3502637212223
567.3844103654618


## TODO #6 
Write a model that produce 450.0 or less perplexity on encyclopedia data without using data from the encyclopedia as training data. (Hint : Try to combine a model with news data and a model with article data together.)

In [27]:
def combine_ditionary(dict1,dict2):
    sum_model = defaultdict(lambda: 0.0)
    for i in dict1:
        if i in dict1 and i in dict2:
            sum_model[i] += ((dict1[i] + dict2[i])/2)
            # dict3 = {**dict1, **dict2, i : sum_b}
    dict3 = {**dict1,**dict2,**sum_model,}
    return dict3,sum_model
combined_dit,sum_model = combine_ditionary(inter_model,inter_article)
print ('inter_article'+'\t'+ str(len(inter_article)))
print ('inter_modal'+'\t'+ str(len(inter_model)))
print ('combined_model'+'\t'+ str(len(combined_dit)))
print ('sum_medel'+'\t'+ str(len(sum_model)))

inter_article	355570
inter_modal	303756
combined_model	572321
sum_medel	87005


In [28]:
def combine_dataset(data1,data2):
    Data = data1+data2
    if data1 == data2 : 
        print(1)
    model = getBigramWithInterpolation(Data)
    return model
combined_data = combine_dataset(article_train,News_train)
print ('combined_data'+'\t'+ str(len(combined_data)))

word_count 	 : 1864346
Unigram_count 	 : 40135
Bigram_count 	 : 344834
Unknow_Word 	 : 2.4915908807773765e-05
combined_data	344834


In [29]:
# 428.85251789073953 (on combined data)
print('Perplexity of combine Bigram model with interpolation smoothing on encyclopedia test data',perplexity_bigram(encyclo_data, combined_dit))
print('Perplexity of combine Bigram model with interpolation smoothing on encyclopedia test data',perplexity_bigram(encyclo_data, combined_data))

Perplexity of combine Bigram model with interpolation smoothing on encyclopedia test data 447.21136425151093
Perplexity of combine Bigram model with interpolation smoothing on encyclopedia test data 461.2407768214586


## TODO #7 
## Kneser-ney on "News"

<!-- Reimplement equation 4.33 in SLP textbook (https://lagunita.stanford.edu/c4x/Engineering/CS-224N/asset/slp4.pdf) -->

Implement Bigram Knerser-ney LM. The result perplexity should be around 71.14054002208687 and 174.02464248000433 on train and test data. 


In [30]:
# Fill codehere
#-------------------------------------------
# Create unigram and bigram counting table
def getKnerser_ney_LM(data):
    unigram_count = defaultdict(lambda: 0.0)
    bigram_count = defaultdict(lambda: 0.0)
    model = defaultdict(lambda: 0.0)
    word_count = 0
    lambda1 = 0.75
    d = 0.75
    for sentence in data:
        for w1,w2 in bigrams(sentence.split('|'), pad_right=True, pad_left=True):
            bigram_count[w1,w2] += 1.0
            unigram_count[w1] += 1
            if w1 != None:
                word_count +=1
    print('word_count '+'\t : '+str(word_count))
    print("Unigram_count "+'\t : '+str(len(unigram_count)))
    print("Bigram_count "+'\t : '+str(len(bigram_count)))
    print("Unknow_Word "+'\t : '+str(1/len(unigram_count)))
    
    Dist_PT_of_Wi = defaultdict(lambda: 0.0)
    Dist_PT_of_W_i = defaultdict(lambda: 0.0)
    for k in bigram_count:
        Dist_PT_of_Wi[k[1]] += 1 
        Dist_PT_of_W_i[k[0]] += 1
    
    print("Dist_PT_of_Wi "+'\t : '+str(len(Dist_PT_of_Wi)))
    print("Dist_PT_of_W_i "+'\t : '+str(len(Dist_PT_of_Wi)))

    for i in bigram_count:


        K =     max([bigram_count[i]-d,0])/unigram_count[i[0]]
        N =     (lambda1*(Dist_PT_of_W_i[i[0]]))/unigram_count[i[0]]
        S =     (Dist_PT_of_Wi[i[1]])/len(bigram_count)

        model[i] = K+(N*S)
    print("Finish")
    return model
Knerser_ney_LM = getKnerser_ney_LM(train)
print (perplexity_bigram(train,Knerser_ney_LM))
print (perplexity_bigram(test,Knerser_ney_LM))

# 71.14054002208687
# 174.02464248000433 Wrong

word_count 	 : 1042797
Unigram_count 	 : 26241
Bigram_count 	 : 224393
Unknow_Word 	 : 3.810830379939789e-05
Dist_PT_of_Wi 	 : 26241
Dist_PT_of_W_i 	 : 26241
Finish
71.14054002208687
155.09274968738495


## TODO #8
## Neural LM 
do it on news corpus that we splitted into train and test sets at the beginning of this exercise. 

In [31]:
#find the perplexity of the model
# คำนวนจาก loss

#there are many ways to do this. e.g.:
#https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/


In [32]:
# We choose news domain as our dataset
best2010=[]
fp= io.open('Dataset/BEST2010/news.txt','r',encoding='utf-8')
for i,line in enumerate(fp):
    best2010.append(line.strip()[:-1])
fp.close()
all_vocabulary =set()
total_word_count =0
for line in best2010:
    for word in line.split('|'):        
        all_vocabulary.add(word)
        total_word_count+=1
print(best2010[:3])

['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง', 'ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น', 'หลัง|เข้า|เยี่ยม|ดู|อาการ|ผู้|ป่วย|แล้ว| |น.พ.จรัล|ประชุม|ร่วม|กับ|เจ้าหน้าที่|ทุก|ฝ่าย| |เพื่อ|สรุป|ผล|การ|ดำเนิน|การ| |รวม|ทั้ง|สอบสวน|โรค|ก่อน|ที่|ผู้|ป่วย|จะ|ถูก|ส่ง|มา|รักษา|ตัว| |จาก|นั้น|ร่วม|กัน|แถลง|ข่าว| |โดย| |น.พ.จรัล|กล่าว|ว่า| |ขณะ|นี้|ผู้|ป่วย|ทั้ง| |3| |ราย| |อาการ|ยัง|ทรง| |โดย|ใน|ราย|ของ| |ด.ช.กิตติพงษ์| |กับ| |ด.ญ.กาญจนา| |ปอด|หาย|เป็น|ปกติ|แล้ว| |คาด|ว่า|จะ|กลับ|บ้าน|ได้|ใน|ไม่|ช้า|นี้| |แต่|ใน|ราย|ของ|นางประนอม|อาการ|ยัง|น่า|เป็นห่วง| |ซึ่ง|ทั้ง| |3| |ราย| |ใน|ชั้น|นี้|ถือ|ว่า|เป็น|ผู้|ป่วย|อยู่|ใน|ขั้น|น่า|สงสัย|อาจ|ติด|เชื้อ|ไข้|หวัด|นก| |เพราะ|ตรวจ|พบ|ผู้|ป่วย|มี|อาการ|ปอด|บวม|ปอด|อักเสบ| |เนื่อง|จาก|ติด|เชื้อ|ไวรัส| |แต่|ยัง|สรุป|ไม่|ได้|ว่า|ติด|เชื้อ|ไข้|หวัด|นก|แน่ชัด|หรือ|ไม่| |ต้อง|รอ|ผล|ตรวจ|จาก|ห้อง|ปฏิบัติการ|ที่|ได้|ส่ง|ตัวอย่าง|เลือด| |ไป|ตรวจ|พิสูจน์|ที่|กรมวิทยาศาส

In [33]:
sentences = best2010
# The data is separated to train and test set with 70:30 ratio.
news_train = sentences[:int(len(sentences)*0.7)]
news_test = sentences[int(len(sentences)*0.7):]

#Training data
train_vocabulary =set()
train_word_count =0
for line in news_train:
    for word in line.split('|'):        
        train_vocabulary.add(word)
        train_word_count+=1
vocab_size  = len(train_vocabulary)
print ('Total sentences in BEST2010 news training dataset :\t'+ str(len(news_train)))
print ('Total word counts in BEST2010 news training dataset :\t'+ str(train_word_count))
print ('Total vocabuary in BEST2010 news training dataset :\t'+ str(vocab_size))
# We will use 1/vocab_size as a default value for unknown word
unk_value = math.pow(len(train_vocabulary),-1)
print(unk_value)
print(news_train[:2])


Total sentences in BEST2010 news training dataset :	21678
Total word counts in BEST2010 news training dataset :	1042797
Total vocabuary in BEST2010 news training dataset :	26240
3.8109756097560976e-05
['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง', 'ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น']


In [34]:
import keras 
print(keras.__version__)



2.11.0


model 1

In [35]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense , Conv1D , Embedding , Flatten , LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

model 2

In [36]:

from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
 

### ////////////////////////////////////////////////////////////Model 1: One-Word-In, One-Word-Out Sequences//////////////////////////////////////////////////////////////////

In [37]:
# source text
# train = " Jack and Jill went up the hill\nTo fetch a pail of water\nJack fell down and broke his crown\nAnd Jill came tumbling after\n "

# train = ['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง']
# train0 = ['ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น']
# train = ['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง', 'สงสัย|ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น']
print(len(news_train))
train0 = news_train[:int(len(news_train)*0.1)]
test0 = news_test[:int(len(news_test)*0.1)]
print(len(train0))
print(len(test0))
train1 = '\n'.join(train0)
test1 = '\n'.join(test0)
print(len(train1))
print(len(test1))

21678
2167
929
504597
223243


In [38]:
def encoded_X_y(train,tokenizer = None):
    # one hot encode outputs
    if tokenizer == None:
        tokenizer = Tokenizer(split=' ')
        tokenizer.fit_on_texts([train])

    encoded = tokenizer.texts_to_sequences([train])[0]  # text to number
    # print('Encoded :', encoded)
    print('Encoded Size: %d' % len(encoded))
    print('Encoded[]:' ,encoded[:])

    # determine the vocabulary size
    vocab_size = len(tokenizer.word_index) + 1          # vocab +1 for 1....22 Not 0.....21
    print('Vocabulary Size: %d' % vocab_size)

    # create word -> word sequences
    sequences = list()
    for i in range(1, len(encoded)):
        sequence = encoded[i-1:i+1]
        sequences.append(sequence)                         # bigram
    print('Total Sequences: %d' % len(sequences))
    print('Sequences[]: ', sequences[0:2])


    # split into X and y elements
    sequences = np.array(sequences)
    X, y = sequences[:,0],sequences[:,1]                # Work1 and Work2
    print('X: ', X,len(X))
    print('y: ', y,len(y))

    # one hot encode outputs
    y = to_categorical(y, num_classes=vocab_size)       # row24 column22
    print('y one hot: ', y.shape)
    return X,y,vocab_size,tokenizer
# X,y = encoded_X_y(news_train[:int(len(news_train)*0.1)])
X,y,vocab_size,tokenizer = encoded_X_y(train1)

Encoded Size: 95844
Encoded[]: [143, 71, 25, 22, 55, 19, 30, 167, 543, 89, 1, 2289, 3054, 35, 308, 350, 2290, 3055, 7, 308, 863, 2291, 3056, 72, 143, 71, 28, 29, 56, 16, 30, 6, 106, 46, 83, 45, 749, 216, 93, 17, 72, 33, 69, 61, 658, 152, 110, 35, 99, 84, 353, 53, 640, 73, 2, 301, 2, 102, 66, 309, 36, 122, 1, 17, 72, 8, 117, 107, 14, 131, 40, 15, 34, 110, 41, 267, 77, 27, 69, 61, 658, 31, 3, 56, 16, 17, 72, 66, 90, 57, 93, 30, 790, 27, 5, 57, 12, 308, 350, 2290, 35, 308, 863, 2291, 314, 395, 11, 298, 33, 361, 3, 8, 180, 80, 9, 5, 6, 1492, 16, 26, 5, 57, 12, 2289, 93, 30, 167, 601, 32, 66, 90, 57, 5, 502, 16, 175, 3, 11, 17, 72, 39, 5, 864, 167, 143, 140, 71, 28, 29, 25, 22, 47, 115, 50, 17, 72, 4, 93, 314, 535, 314, 750, 231, 15, 71, 28, 196, 26, 30, 640, 6, 9, 3, 71, 28, 29, 25, 22, 1071, 44, 6, 20, 273, 73, 115, 15, 222, 503, 1, 9, 107, 362, 504, 13, 115, 618, 1, 3057, 7, 254, 906, 32, 8, 161, 73, 5, 38, 516, 38, 16, 69, 61, 658, 31, 59, 3, 15, 2, 309, 36, 50, 3, 17, 1, 114, 91, 483, 

In [39]:
# define model
model_1 = Sequential()
model_1.add(Embedding(vocab_size, 10, input_length=1))
model_1.add(LSTM(50))
model_1.add(Dense(vocab_size, activation='softmax'))
print(model_1.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             52900     
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 5290)              269790    
                                                                 
Total params: 334,890
Trainable params: 334,890
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
# compile network
model_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model_1.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1eec3fae3a0>

In [41]:

# evaluate
in_text = 'ที่'
# in_text = 'jill'
# print(in_text)
encoded = tokenizer.texts_to_sequences([in_text])[0]
# print('encode1 token \t:',encoded)
encoded = np.array(encoded)
# print('encode array \t: ',encoded)
predict_x = model_1.predict(encoded) 
# print('predict_x \t: ',predict_x)
classes_x= np.argmax(predict_x,axis=1)
yhat = int(classes_x[0])
# print(yhat,type(yhat))
# yhat = model.predict_classes(encoded, verbose=0)
for word, index in tokenizer.word_index.items():
    #  print(type(index),type(yhat))
    # print(index,yhat)
    if index+1 == yhat:
        print(word)
    if index == yhat:
        print(word)

ว่า
มี


### Perplexity Model 1

In [42]:
model_1.evaluate(X, y)



[4.762932777404785, 0.18666987121105194]

In [43]:
def Perplexity_Model_1(train,model,tokenizer):
    X,y,vocab_size,token = encoded_X_y(train,tokenizer)
    loss_value,accuracy = model.evaluate(X, y)
    perplexity = math.exp(loss_value)
    return perplexity

In [46]:
print('Perplexity_Train \t:',Perplexity_Model_1(train1,model_1,tokenizer))

Encoded Size: 95844
Encoded[]: [143, 71, 25, 22, 55, 19, 30, 167, 543, 89, 1, 2289, 3054, 35, 308, 350, 2290, 3055, 7, 308, 863, 2291, 3056, 72, 143, 71, 28, 29, 56, 16, 30, 6, 106, 46, 83, 45, 749, 216, 93, 17, 72, 33, 69, 61, 658, 152, 110, 35, 99, 84, 353, 53, 640, 73, 2, 301, 2, 102, 66, 309, 36, 122, 1, 17, 72, 8, 117, 107, 14, 131, 40, 15, 34, 110, 41, 267, 77, 27, 69, 61, 658, 31, 3, 56, 16, 17, 72, 66, 90, 57, 93, 30, 790, 27, 5, 57, 12, 308, 350, 2290, 35, 308, 863, 2291, 314, 395, 11, 298, 33, 361, 3, 8, 180, 80, 9, 5, 6, 1492, 16, 26, 5, 57, 12, 2289, 93, 30, 167, 601, 32, 66, 90, 57, 5, 502, 16, 175, 3, 11, 17, 72, 39, 5, 864, 167, 143, 140, 71, 28, 29, 25, 22, 47, 115, 50, 17, 72, 4, 93, 314, 535, 314, 750, 231, 15, 71, 28, 196, 26, 30, 640, 6, 9, 3, 71, 28, 29, 25, 22, 1071, 44, 6, 20, 273, 73, 115, 15, 222, 503, 1, 9, 107, 362, 504, 13, 115, 618, 1, 3057, 7, 254, 906, 32, 8, 161, 73, 5, 38, 516, 38, 16, 69, 61, 658, 31, 59, 3, 15, 2, 309, 36, 50, 3, 17, 1, 114, 91, 483, 

In [47]:
print('Perplexity_Test \t:',Perplexity_Model_1(test1,model_1,tokenizer))

Encoded Size: 36096
Encoded[]: [69, 61, 227, 321, 1274, 148, 58, 69, 61, 17, 115, 463, 293, 267, 522, 17, 72, 228, 187, 1, 50, 5, 100, 2518, 3, 4, 94, 296, 57, 11, 124, 317, 205, 57, 7, 124, 682, 65, 57, 7, 4, 17, 682, 55, 54, 57, 32, 5, 94, 16, 4, 73, 115, 133, 9, 73, 90, 57, 276, 596, 10, 180, 80, 9, 42, 55, 205, 57, 211, 273, 73, 115, 133, 55, 130, 232, 42, 2, 10, 24, 400, 96, 1, 9, 43, 73, 234, 27, 398, 94, 57, 27, 192, 17, 324, 992, 123, 991, 3694, 17, 2052, 217, 2518, 9, 345, 152, 277, 42, 463, 1, 329, 84, 353, 53, 640, 241, 7, 774, 73, 2, 301, 2, 12, 624, 1, 116, 14, 148, 332, 282, 400, 474, 17, 1, 9, 43, 73, 234, 551, 323, 8, 169, 60, 489, 35, 1568, 53, 95, 482, 670, 932, 138, 122, 1530, 8, 343, 180, 14, 123, 18, 9, 37, 303, 148, 169, 635, 145, 826, 1936, 797, 10, 53, 11, 588, 5, 2, 324, 728, 58, 119, 77, 239, 3, 127, 2, 725, 147, 75, 129, 111, 90, 669, 40, 5, 100, 2518, 537, 9, 1234, 211, 213, 55, 111, 54, 845, 19, 104, 81, 725, 147, 75, 129, 10, 1990, 27, 415, 69, 61, 632, 27

In [45]:
def

SyntaxError: invalid syntax (1102136943.py, line 1)

### ////////////////////////////////////////////////////////////Model 2: Line-by-Line Sequence//////////////////////////////////////////////////////////////////

In [None]:
# source text
train = " Jack and Jill went up the hill\nTo fetch a pail of water\nJack fell down and broke his crown\nAnd Jill came tumbling after\n "

# train = ['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง']
# train = ['ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น']
# train = ['สงสัย|ติด|หวัด|นก| |อีก|คน|ยัง|น่า|ห่วง', 'สงสัย|ตาม|ที่|นางประนอม ทองจันทร์| |กับ| |ด.ช.กิตติพงษ์ แหลมผักแว่น| |และ| |ด.ญ.กาญจนา กรองแก้ว| |ป่วย|สงสัย|ติด|เชื้อ|ไข้|ขณะ|นี้|ยัง|ไม่|ดี|ขึ้น']
# train = '\n'.join(train)

# print(type(train))
# print(train)
# print([train])

In [None]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([train])
# create line-based sequences
sequences = list()
for line in train.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

In [None]:

# pad input sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

In [None]:
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [None]:

# define model
model_2 = Sequential()
model_2.add(Embedding(vocab_size, 10, input_length=max_length-1))
model_2.add(LSTM(50))
model_2.add(Dense(vocab_size, activation='softmax'))
print(model_2.summary())
# compile network
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model_2.fit(X, y, epochs=500, verbose=2)

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text

    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word
        predict_x=model.predict(encoded)
        classes_x=np.argmax(predict_x,axis=1)
        yhat = classes_x
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
            break
        # append to input
        in_text += ' ' + out_word
    return in_text

In [None]:

# prepare the tokenizer on the source text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([train])

# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# create line-based sequences
sequences = list()
for line in train.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad input sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(X, y, epochs=500, verbose=2)

In [None]:
# evaluate model
print(generate_seq(model_2, tokenizer, max_length-1, 'Jack', 4))
print(generate_seq(model_2, tokenizer, max_length-1, 'Jill', 4))