## Language Modelling
### Kaustav Vats | 2016048

In [40]:
import codecs
import string
from nltk.tokenize import sent_tokenize, word_tokenize
import re, os, operator
import pickle
from matplotlib import pyplot as plt
from random import randint as randi
from math import log10
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [42]:
def GetUniGramTokens(folderName):
    """Return every word token found in the files of one newsgroup folder.

    Each file under ``20_newsgroups/<folderName>/`` is read, passed through
    ``pre_processing`` and split with ``word_tokenize``; the per-file token
    lists are concatenated in the order ``os.listdir`` yields the files.

    Args:
        folderName: name of the sub-directory inside ``20_newsgroups``.

    Returns:
        A flat list of token strings.
    """
    folder = os.path.join("20_newsgroups", folderName)
    tokens = []
    for name in os.listdir(folder):
        # ``with`` guarantees the handle is closed even if reading or
        # tokenizing raises, which the manual open/close pair did not.
        # NOTE(review): the platform default encoding is used, as before;
        # confirm the corpus decodes cleanly with it.
        with open(os.path.join(folder, name), 'r') as handle:
            text = pre_processing(handle.read())
        tokens.extend(word_tokenize(text))
    return tokens

def GetBiGramTokens(folderName):
    """Return every bigram found in the files of one newsgroup folder.

    Each file under ``20_newsgroups/<folderName>/`` is read, passed through
    ``pre_processing`` and split with ``word_tokenize``. Adjacent token pairs
    are joined with a single space. Bigrams never span two files.

    Args:
        folderName: name of the sub-directory inside ``20_newsgroups``.

    Returns:
        A flat list of ``"w1 w2"`` strings.
    """
    folder = os.path.join("20_newsgroups", folderName)
    tokens = []
    for name in os.listdir(folder):
        # ``with`` guarantees the handle is closed even if reading or
        # tokenizing raises, which the manual open/close pair did not.
        # NOTE(review): the platform default encoding is used, as before;
        # confirm the corpus decodes cleanly with it.
        with open(os.path.join(folder, name), 'r') as handle:
            text = pre_processing(handle.read())
        words = word_tokenize(text)
        # zip stops at the shorter sequence, so this yields len(words) - 1
        # pairs (none for files with fewer than two tokens).
        tokens.extend(first + " " + second
                      for first, second in zip(words, words[1:]))
    return tokens

def GetTriGramTokens(folderName):
    """Return every trigram found in the files of one newsgroup folder.

    Each file under ``20_newsgroups/<folderName>/`` is read, passed through
    ``pre_processing`` and split with ``word_tokenize``. Every run of three
    consecutive tokens is joined with single spaces. Trigrams never span two
    files.

    Args:
        folderName: name of the sub-directory inside ``20_newsgroups``.

    Returns:
        A flat list of ``"w1 w2 w3"`` strings.
    """
    folder = os.path.join("20_newsgroups", folderName)
    tokens = []
    for name in os.listdir(folder):
        # ``with`` guarantees the handle is closed even if reading or
        # tokenizing raises, which the manual open/close pair did not.
        # NOTE(review): the platform default encoding is used, as before;
        # confirm the corpus decodes cleanly with it.
        with open(os.path.join(folder, name), 'r') as handle:
            text = pre_processing(handle.read())
        words = word_tokenize(text)
        # zip stops at the shortest sequence, so this yields len(words) - 2
        # triples (none for files with fewer than three tokens).
        tokens.extend(
            first + " " + second + " " + third
            for first, second, third in zip(words, words[1:], words[2:])
        )
    return tokens
        
def pre_processing(text):
    """Normalize raw text: trim surrounding whitespace and lower-case it.

    Punctuation and every other character are left untouched.
    """
    return text.strip().lower()

def GetFreq(tokens):
    """Count how many times each token occurs.

    Returns a plain dict mapping token -> occurrence count, with keys in
    first-seen order.
    """
    counts = {}
    for item in tokens:
        counts[item] = counts.get(item, 0) + 1
    return counts

# def SmoothingProbs(Bow, Tokens, K=1):
#     for key in Bow:
#         Bow[key] = (Bow[key]+K)/(len(Tokens) + len(Bow)*K)
#     return Bow

def UniGramModel(folderName):
    """Build the raw unigram counts for one newsgroup folder.

    Tokens are collected with ``GetUniGramTokens`` and counted with
    ``GetFreq``. Nothing is smoothed, sorted or cached to disk here.

    Returns:
        ``(counts, tokens)``: the token -> count dict and the flat token list
        it was built from.
    """
    unigram_tokens = GetUniGramTokens(folderName)
    return GetFreq(unigram_tokens), unigram_tokens

def BiGramModel(folderName):
    """Build the raw bigram counts for one newsgroup folder.

    Bigrams are collected with ``GetBiGramTokens`` and counted with
    ``GetFreq``. Nothing is smoothed, sorted or cached to disk here.

    Returns:
        ``(counts, bigrams)``: the bigram -> count dict and the flat bigram
        list it was built from.
    """
    bigram_tokens = GetBiGramTokens(folderName)
    return GetFreq(bigram_tokens), bigram_tokens

def TriGramModel(folderName):
    """Build the raw trigram counts for one newsgroup folder.

    Trigrams are collected with ``GetTriGramTokens`` and counted with
    ``GetFreq``. Nothing is smoothed, sorted or cached to disk here.

    Returns:
        ``(counts, trigrams)``: the trigram -> count dict and the flat trigram
        list it was built from.
    """
    trigram_tokens = GetTriGramTokens(folderName)
    return GetFreq(trigram_tokens), trigram_tokens

def FitUni(Tokens, Bows, Classes=2, K=1):
    """Estimate add-K smoothed unigram likelihoods for every class.

    For word ``w`` and class ``j`` the estimate is
    ``(count_j(w) + K) / (len(Tokens[j]) + K * V)``, where ``V`` is the size
    of the vocabulary shared by all classes.

    Args:
        Tokens: one token sequence per class; only its length is used, as the
            total token count of that class.
        Bows: one ``{word: count}`` mapping per class.
        Classes: number of classes, i.e. columns of the result.
        K: additive smoothing constant.

    Returns:
        ``np.ndarray`` of shape ``(V, Classes)``. Row ``i`` belongs to the
        ``i``-th word of the sorted union of all keys in ``Bows``.
    """
    # Build the vocabulary from every bag of words instead of only the first
    # two, so a model with more than two classes keeps all of its words.
    # Sorting makes the row order reproducible across runs; plain set
    # iteration order for strings is not.
    Vocab = sorted(set().union(*Bows))
    LikliProba = np.zeros((len(Vocab), Classes))
    for j in range(Classes):
        counts = Bows[j]
        # The denominator depends only on the class, so compute it once.
        denominator = len(Tokens[j]) + K * len(Vocab)
        for i, word in enumerate(Vocab):
            # An unseen word contributes a count of 0, leaving K / denominator.
            LikliProba[i, j] = (counts.get(word, 0) + K) / denominator
    return LikliProba

def FitBi(Tokens, BiBow, UniBow):
    """Estimate add-one smoothed bigram likelihoods for every class.

    For bigram ``"w1 w2"`` and class ``j`` the estimate is
    ``(count_j(w1 w2) + 1) / (count_j(w1) + V)``. An unseen bigram counts as
    0, a ``w1`` missing from ``UniBow[j]`` counts as 1, and ``V`` is the
    number of distinct bigrams over all classes.

    Args:
        Tokens: unused; kept so existing call sites keep working.
        BiBow: one ``{"w1 w2": count}`` mapping per class.
        UniBow: one ``{word: count}`` mapping per class, aligned with
            ``BiBow``.

    Returns:
        ``np.ndarray`` of shape ``(V, len(BiBow))``. Row ``i`` belongs to the
        ``i``-th bigram of the sorted union of all keys in ``BiBow``.
    """
    # Use every class's bigrams (not only the first two) and sort them so the
    # row order is reproducible; plain set iteration order for strings is not.
    Vocab = sorted(set().union(*BiBow))
    n_classes = len(BiBow)
    # NOTE(review): V is the number of distinct bigrams. Textbook add-one
    # smoothing uses the unigram vocabulary size in the denominator instead;
    # the original arithmetic is kept here, confirm which is intended.
    vocab_size = len(Vocab)
    LikliProba = np.zeros((vocab_size, n_classes))
    for i, bigram in enumerate(Vocab):
        # The leading word does not depend on the class, so split once.
        firstWord = bigram.split(" ")[0]
        for j in range(n_classes):
            bigram_count = BiBow[j].get(bigram, 0)
            # Default of 1 reproduces the original fallback denominator
            # (1 + V) for a leading word missing from this class.
            first_count = UniBow[j].get(firstWord, 1)
            LikliProba[i, j] = (bigram_count + 1) / (first_count + vocab_size)
    return LikliProba
    
def PredictUni(tokens, Liklihood):
    """Placeholder for unigram prediction; not implemented yet.

    The original header had no trailing colon and no body, which made the
    whole cell a SyntaxError. It is now a valid stub so the surrounding
    definitions can be imported and run.

    Args:
        tokens: tokens of the text to score.
        Liklihood: likelihood matrix, presumably the array returned by
            ``FitUni`` (TODO confirm).

    Raises:
        NotImplementedError: always. The signature carries no word-to-row
            mapping for ``Liklihood``, so the intended lookup cannot be
            reconstructed from this file.
    """
    raise NotImplementedError("PredictUni is not implemented yet")

## Training Models

In [43]:
# Unigram model: count the tokens of each newsgroup, fit the smoothed
# per-class likelihood matrix (column 0 = baseball, column 1 = motorcycles)
# and persist it with np.save, which appends ".npy" to the file name.
bow_baseball_1, tkn_baseball_1 = UniGramModel("rec.sport.baseball")
bow_motorcycle_1, tkn_motorcycle_1 = UniGramModel("rec.motorcycles")
LikliProba_1 = FitUni([tkn_baseball_1, tkn_motorcycle_1], [bow_baseball_1, bow_motorcycle_1])
np.save("UniGramModel", LikliProba_1)

# Bigram model: same flow, with the unigram counts from above passed in for
# the first-word counts used in the denominator.
bow_baseball_2, tkn_baseball_2 = BiGramModel("rec.sport.baseball")
bow_motorcycle_2, tkn_motorcycle_2 = BiGramModel("rec.motorcycles")
LikliProba_2 = FitBi([tkn_baseball_2, tkn_motorcycle_2], [bow_baseball_2, bow_motorcycle_2], [bow_baseball_1, bow_motorcycle_1])
np.save("BiGramModel", LikliProba_2)

# Trigram model (disabled).
# NOTE(review): the call below uses `Fit`, which is not defined anywhere in
# the visible code; a trigram fit function is needed before enabling this.
# bow_baseball_3, tkn_baseball_3 = TriGramModel("rec.sport.baseball")
# bow_motorcycle_3, tkn_motorcycle_3 = TriGramModel("rec.motorcycles")
# LikliProba_3 = Fit([tkn_baseball_3, tkn_motorcycle_3], [bow_baseball_3, bow_motorcycle_3])
# np.save("TriGramModel", LikliProba_3)

In [38]:
def getUniGramTokens(sent):
    """Tokenize one sentence into a NumPy array of normalized word tokens.

    The sentence goes through ``pre_processing`` and is then split with
    ``word_tokenize``.
    """
    return np.asarray(word_tokenize(pre_processing(sent)))

## Sentence Generation

## Predicting probability

In [39]:
# Read one sentence from standard input, tokenize it, and echo the tokens.
sentence = str(input())
tkn = getUniGramTokens(sentence)
print(tkn)

Hello world this is kaustav
['hello' 'world' 'this' 'is' 'kaustav']


## Calculating perplexity