# Quadgram model

In [1]:
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk
from collections import Counter
from collections import defaultdict

## Preprocessing the tokens

In [2]:
def getTokens(file):
    """Read a text file and return its words as a flat list of lowercase tokens.

    Lines that consist only of whitespace contribute nothing. Every other
    line is lowercased, has a fixed set of punctuation characters replaced
    by spaces, and is then split on whitespace.

    Parameters:
        file: path of the text file to read (opened with the default encoding).

    Returns:
        A list of token strings in the order they appear in the file.
    """
    # The cleanup runs in three ordered steps. The order matters: an
    # apostrophe is only dropped when it is followed by a space that already
    # exists after the first step, not by a space produced in the last step.
    before_apostrophe = str.maketrans(dict.fromkeys('.,:;!?()-', ' '))
    after_apostrophe = str.maketrans(dict.fromkeys('"_[]', ' '))

    collected = []
    with open(file) as handle:
        for raw_line in handle:
            if raw_line.isspace():
                # A blank line would split into an empty list anyway.
                continue
            cleaned = raw_line.lower().translate(before_apostrophe)
            cleaned = cleaned.replace("' ", ' ')
            cleaned = cleaned.translate(after_apostrophe)
            collected.extend(cleaned.split())
    return collected

## Building a probability table that stores, for each trigram, the probability of every token that follows it

In [3]:
def build_quadgram_table(trigrams,quadgrams):
    """Turn quadgram counts into conditional probabilities.

    Each distinct quadgram is mapped to
    count(quadgram) / count(its first three tokens), where the second count
    is taken from `trigrams`.

    Parameters:
        trigrams: iterable of 3-token tuples.
        quadgrams: iterable of 4-token tuples.

    Returns:
        A Counter keyed by quadgram whose values are the ratios above.
    """
    prefix_counts = Counter(trigrams)  # how often each trigram occurs

    probabilities = Counter()
    for quad, occurrences in Counter(quadgrams).items():
        # The leading three tokens of the quadgram select the denominator.
        probabilities[quad] = occurrences / prefix_counts[quad[:3]]
    return probabilities

In [4]:
def build_prob_table(quadgram_table):
    """Group quadgram probabilities by their leading trigram.

    Parameters:
        quadgram_table: mapping of 4-token tuple -> probability.

    Returns:
        A defaultdict(dict) mapping each trigram (the first three tokens of a
        quadgram) to a list of (token, probability) pairs, where token is the
        quadgram's fourth element. Each list is ordered from highest to
        lowest probability; ties keep their first-seen order.
    """
    # First pass: collect the followers of every trigram, shaped like
    # {tri1: {token1: 0.2, token5: 0.5, tokenn: 0.3},
    #  tri2: {...},
    #  ...}
    followers = defaultdict(dict)
    for quad, probability in quadgram_table.items():
        prefix, last_token = quad[:3], quad[3]
        followers[prefix][last_token] = probability

    # Second pass: replace each inner mapping by a ranked list of pairs.
    ranked = defaultdict(dict)
    for prefix, candidates in followers.items():
        pairs = list(candidates.items())
        pairs.sort(key=lambda pair: pair[1], reverse=True)
        ranked[prefix] = pairs

    return ranked

## Function to predict the next word 

In [8]:
def predict(word,x=1):
    """Return the x-th most probable next token from a ranked candidate list.

    Parameters:
        word: sequence of (token, probability) pairs, ordered from most to
            least probable.
        x: 1-based rank of the candidate to return. Values larger than the
            number of candidates select the last one; values below 1 select
            the first one.

    Returns:
        The token at the chosen rank, or None when there are no candidates
        (an empty list or an empty dict).
    """
    if not word:
        # Nothing is known to follow the trigram, so there is no prediction.
        return None

    # Clamp the rank into [1, len(word)] so that zero or negative values do
    # not index from the end of the list.
    rank = min(max(x, 1), len(word))
    return word[rank-1][0]

## Main driver function

In [9]:
if __name__=='__main__':
    # Driver: build the quadgram model from the corpus file, then predict the
    # word that follows the last three words of a sample sentence.

    tokens=getTokens('/home/meghana/nltk_data/corpora/gutenberg/corpusfile.txt')
    tokenSet=set(tokens)

    trigrams=list(ngrams(tokens,3))
    trigramSet=set(trigrams)
    quadgrams=list(ngrams(tokens,4))
    quadgramSet=set(quadgrams)

    # Number of distinct trigrams and quadgrams in the corpus.
    print(len(trigramSet))
    print(len(quadgramSet))

    quadgram_table=build_quadgram_table(trigrams,quadgrams)
    prob_table=build_prob_table(quadgram_table)

    # To read the sentence interactively instead: s=input("Enter a string:")
    s="she is the youngest of"# a sentence in corpus. We need to find the next word for this sentence
    x=int(input("enter n:"))

    # Apply the same lowercasing and punctuation stripping used for the corpus
    # so the sentence tokens can match the keys of the table.
    s=s.lower()
    s=s.replace('.',' ').replace(',',' ').replace(':',' ').replace(';',' ').replace('!',' ').replace('?',' ').replace('(',' ').replace(')',' ').replace("-"," ").replace("' ",' ').replace('"',' ').replace("_"," ").replace('[',' ').replace(']',' ')
    s_tokens=s.split()

    if len(s_tokens)<3:
        # A trigram cannot be formed, so there is nothing to look up.
        print("Need at least three words to predict the next word")
    else:
        req_trigram=tuple(s_tokens[-3:])#taking the last trigram to predict the next word
        # .get() avoids inserting an empty entry into the defaultdict when
        # the trigram never occurred in the corpus.
        word=prob_table.get(req_trigram)#word is a list of tokens that are possible to occur after the required trigram
        if not word:
            print("No prediction available for:", ' '.join(req_trigram))
        else:
            result=predict(word,x)
            print("The next word is:", result)
   
    

1416799
1858040
enter n:2
The next word is: his
