In [1]:
import nltk
from nltk.corpus import udhr

In [2]:
# English data set
# The training and development slices are cut from the raw text by character
# position; the test set is a list of word tokens from the same corpus file.
# NOTE(review): the test words and the training characters both start at the
# beginning of the same document, so they presumably overlap - confirm intended.
english = udhr.raw('English-Latin1')
english_train = english[0:1000]
english_dev = english[1000:1100]
english_test = udhr.words('English-Latin1')[0:1000]

In [3]:
# French data set
# The training and development slices are cut from the raw text by character
# position; the test set is a list of word tokens from the same corpus file.
# NOTE(review): the test words and the training characters both start at the
# beginning of the same document, so they presumably overlap - confirm intended.
french = udhr.raw('French_Francais-Latin1')
french_train = french[0:1000]
french_dev = french[1000:1100]
french_test = udhr.words('French_Francais-Latin1')[0:1000]

In [4]:
# Italian data set
# The training and development slices are cut from the raw text by character
# position; the test set is a list of word tokens from the same corpus file.
# NOTE(review): the test words and the training characters both start at the
# beginning of the same document, so they presumably overlap - confirm intended.
italian = udhr.raw('Italian_Italiano-Latin1')
italian_train = italian[0:1000]
italian_dev = italian[1000:1100]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]

In [5]:
# Spanish data set
# The training and development slices are cut from the raw text by character
# position; the test set is a list of word tokens from the same corpus file.
# NOTE(review): the test words and the training characters both start at the
# beginning of the same document, so they presumably overlap - confirm intended.
spanish = udhr.raw('Spanish_Espanol-Latin1')
spanish_train = spanish[0:1000]
spanish_dev = spanish[1000:1100]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

In [6]:
# Convert raw text into a flat list of character n-grams
def buildCharacterNGram(rawText, nGramN):
    """Return the character n-grams found inside each token of ``rawText``.

    The text is lower-cased and split with ``nltk.word_tokenize``; n-grams are
    then taken from every token separately with ``nltk.ngrams``, so no n-gram
    spans two tokens. No padding is requested, so a token shorter than
    ``nGramN`` contributes no n-grams.

    Args:
        rawText: text to tokenize.
        nGramN: length of the character n-grams to extract.

    Returns:
        A list of n-gram tuples, in token order.
    """
    tokens = nltk.word_tokenize(rawText.lower())
    return [gram for token in tokens for gram in nltk.ngrams(token, nGramN)]

In [7]:
# Calculate the frequency distribution of a list of n-grams
def calculateFrequencyDistributionOfEachNgram(nGramList, nGramN):
    """Count the n-grams in ``nGramList``.

    Args:
        nGramList: iterable of n-gram tuples, each of length ``nGramN``.
        nGramN: n-gram order; only 1, 2 and 3 are supported.

    Returns:
        * ``nGramN == 1``: an ``nltk.FreqDist`` keyed by the n-gram tuples.
        * ``nGramN == 2``: an ``nltk.ConditionalFreqDist`` whose condition is
          the first character and whose samples are the second character.
        * ``nGramN == 3``: an ``nltk.ConditionalFreqDist`` whose condition is
          the ``(first, second)`` pair and whose samples are the third
          character.

    Raises:
        ValueError: if ``nGramN`` is not 1, 2 or 3. (Previously a message was
            printed and the function then failed with ``UnboundLocalError``
            because no distribution had been assigned.)
    """
    if nGramN == 1:
        return nltk.FreqDist(nGramList)
    if nGramN == 2:
        return nltk.ConditionalFreqDist((t0, t1) for t0, t1 in nGramList)
    if nGramN == 3:
        return nltk.ConditionalFreqDist(((t0, t1), t2) for t0, t1, t2 in nGramList)
    raise ValueError("The nGram parameter is wrong. Unigram, bigram and trigram are supported now")

In [8]:
# Calculate the unigram probability distribution
def calculateUnigramProbabilityDistribution(rawText, nGramN):
    """Build character counts and relative frequencies for ``rawText``.

    Args:
        rawText: training text.
        nGramN: n-gram order forwarded to the n-gram and counting helpers;
            the caller in this notebook passes 1.

    Returns:
        A tuple ``(unigramFList, totalChar, probabilityDistribution)``:
        the count per character, the sum of all counts, and the count of each
        character divided by that sum. Both dictionaries are keyed by the
        first element of each n-gram tuple.
    """
    grams = buildCharacterNGram(rawText, nGramN)
    counts = calculateFrequencyDistributionOfEachNgram(grams, nGramN)

    unigramFList = {gram[0]: count for gram, count in counts.items()}
    totalChar = sum(counts.values())
    probabilityDistribution = {
        gram[0]: count / totalChar for gram, count in counts.items()
    }
    return (unigramFList, totalChar, probabilityDistribution)

In [9]:
# Calculate the bigram probability distribution
def calculateBigramProbabilityDistribution(rawText, nGramN, unigramFrequencyDistribution):
    """Build bigram counts and bigram scores for ``rawText``.

    Args:
        rawText: training text.
        nGramN: n-gram order forwarded to the helpers; the caller in this
            notebook passes 2.
        unigramFrequencyDistribution: mapping from character to its count.

    Returns:
        A tuple ``(bigramFList, probabilityDistribution)``, both keyed by
        ``(first, second)`` character pairs. ``bigramFList`` holds the pair
        counts; ``probabilityDistribution`` holds each pair count divided by
        the unigram count of the *second* character.

    NOTE(review): the denominator is the count of the second character, so
    each value is count(a, b) / count(b) rather than the more common
    count(a, b) / count(a) - confirm this is the intended model.
    """
    pairs = buildCharacterNGram(rawText, nGramN)
    conditionalCounts = calculateFrequencyDistributionOfEachNgram(pairs, nGramN)

    bigramFList = {}
    probabilityDistribution = {}
    for first, followers in conditionalCounts.items():
        for second, count in followers.items():
            bigramFList[first, second] = count
            probabilityDistribution[first, second] = count / unigramFrequencyDistribution[second]

    return (bigramFList, probabilityDistribution)
    

In [10]:
# Calculate the trigram probability distribution
def calculateTrigramProbabilityDistribution(rawText, nGramN,
                                            bigramFrequencyDistribution, unigramFrequencyDistribution):
    """Build conditional trigram probabilities for ``rawText``.

    Args:
        rawText: training text.
        nGramN: n-gram order forwarded to the helpers; the caller in this
            notebook passes 3.
        bigramFrequencyDistribution: mapping from ``(first, second)`` pairs
            to their counts.
        unigramFrequencyDistribution: accepted but not used by this function.

    Returns:
        A dictionary keyed by ``(first, second, third)`` whose value is the
        trigram count divided by the count of its ``(first, second)`` prefix.
    """
    triples = buildCharacterNGram(rawText, nGramN)
    conditionalCounts = calculateFrequencyDistributionOfEachNgram(triples, nGramN)

    return {
        (prefix[0], prefix[1], third): count / bigramFrequencyDistribution[prefix[0], prefix[1]]
        for prefix, followers in conditionalCounts.items()
        for third, count in followers.items()
    }

In [11]:
# Score a word with a trained language model, backing off to lower orders
def calculateProbabilityOfTesData(word, nGramN, trainedModel, lang):
    """Return the product of per-n-gram scores of ``word`` under ``lang``.

    Each n-gram is looked up in ``trainedModel[lang, <order>, "probability"]``.
    When it is missing, the lookup backs off:

    * trigram -> the bigram made of its first two characters,
    * bigram  -> the unigram of its first character,
    * unigram -> ``1 / trainedModel[lang, "unigram", "total"]``.

    Args:
        word: text to score. For order 1 its characters are used directly;
            for orders 2 and 3 it is passed through ``buildCharacterNGram``.
        nGramN: model order, 1, 2 or 3.
        trainedModel: dictionary keyed by ``(lang, order name, field)``.
        lang: language key used to select the tables in ``trainedModel``.

    Returns:
        The product of the scores, or 1 when there is nothing to score. For
        any other ``nGramN`` a message is printed and 1 is returned.
    """
    # Tables are fetched lazily so only the entries actually needed are read.
    def table(order):
        return trainedModel[lang, order, "probability"]

    def unigramScore(character):
        if character in table("unigram"):
            return table("unigram")[character]
        return 1 / trainedModel[lang, "unigram", "total"]

    def bigramScore(pair):
        if pair in table("bigram"):
            return table("bigram")[pair]
        return unigramScore(pair[0])

    def trigramScore(triple):
        if triple in table("trigram"):
            return table("trigram")[triple]
        return bigramScore((triple[0], triple[1]))

    if nGramN == 1:
        units, score = word, unigramScore
    elif nGramN == 2:
        units, score = buildCharacterNGram(word, nGramN), bigramScore
    elif nGramN == 3:
        units, score = buildCharacterNGram(word, nGramN), trigramScore
    else:
        print("The nGram parameter is wrong. Unigram, bigram and Trigram are supported now")
        return 1

    predictedProbability = 1
    for unit in units:
        predictedProbability *= score(unit)
    return predictedProbability

In [12]:
# Train and generate the language model for all 4 languages
def generateAndStoreTrainedLanguageModel():
    """Train unigram, bigram and trigram tables for the four languages.

    Reads the module-level training texts ``english_train``, ``french_train``,
    ``italian_train`` and ``spanish_train`` and stores the results under the
    language keys "E", "F", "I" and "S".

    Returns:
        A dictionary keyed by ``(lang, order, field)`` holding, per language:
        unigram "frequency", "total" and "probability"; bigram "frequency"
        and "probability"; trigram "probability".
    """
    trainingTexts = (
        ("E", english_train),
        ("F", french_train),
        ("I", italian_train),
        ("S", spanish_train),
    )

    trainedModel = dict()
    for lang, text in trainingTexts:
        unigramCounts, total, unigramProbs = calculateUnigramProbabilityDistribution(text, 1)
        trainedModel[lang, "unigram", "frequency"] = unigramCounts
        trainedModel[lang, "unigram", "total"] = total
        trainedModel[lang, "unigram", "probability"] = unigramProbs

        bigramCounts, bigramProbs = calculateBigramProbabilityDistribution(text, 2, unigramCounts)
        trainedModel[lang, "bigram", "frequency"] = bigramCounts
        trainedModel[lang, "bigram", "probability"] = bigramProbs

        trainedModel[lang, "trigram", "probability"] = calculateTrigramProbabilityDistribution(
            text, 3, bigramCounts, unigramCounts)
    return trainedModel

In [13]:
# Calculate how often the lang1 model beats the lang2 model on a word list
def acuracyCalculation(trainedModel, wordList, lang1, lang2, nGram, testSet):
    """Print the share of words that score at least as high under ``lang1``.

    Every word is lower-cased and scored with
    ``calculateProbabilityOfTesData`` under both languages. A word counts as
    correct when the ``lang1`` score is greater than or equal to the ``lang2``
    score, so ties are credited to ``lang1``.

    Args:
        trainedModel: model dictionary passed through to the scorer.
        wordList: iterable of words assumed to be written in ``lang1``.
        lang1: language key of the expected language.
        lang2: language key of the competing language.
        nGram: model order passed through to the scorer.
        testSet: label used only in the printed report.

    Raises:
        ValueError: if ``wordList`` yields no words, since the accuracy is
            then undefined (this used to surface as ``ZeroDivisionError``).
    """
    TP = 0
    FN = 0
    for word in wordList:
        lowered = word.lower()  # lower-case once, reuse for both languages
        scoreLang1 = calculateProbabilityOfTesData(lowered, nGram, trainedModel, lang1)
        scoreLang2 = calculateProbabilityOfTesData(lowered, nGram, trainedModel, lang2)
        if scoreLang1 >= scoreLang2:
            TP += 1
        else:
            FN += 1
    if TP + FN == 0:
        raise ValueError("wordList is empty; accuracy cannot be computed")
    Accuracy = TP / (TP + FN)
    print("Accuracy of {} vs {} using {}-gram model with {} testSet is {}%".format(lang1, lang2, nGram, testSet, Accuracy*100))


### Problem 01 : (Language Model Creation)

In [14]:
trainedModel = generateAndStoreTrainedLanguageModel()

# English test words: English ("E") model against French ("F") model
acuracyCalculation(trainedModel, english_test, lang1="E", lang2="F", nGram=1, testSet="English")
acuracyCalculation(trainedModel, english_test, lang1="E", lang2="F", nGram=2, testSet="English")
acuracyCalculation(trainedModel, english_test, lang1="E", lang2="F", nGram=3, testSet="English")

# French test words: French ("F") model against English ("E") model
# (the test-set label previously read "Frernch" for the 1-gram run)
acuracyCalculation(trainedModel, french_test, lang1="F", lang2="E", nGram=1, testSet="French")
acuracyCalculation(trainedModel, french_test, lang1="F", lang2="E", nGram=2, testSet="French")
acuracyCalculation(trainedModel, french_test, lang1="F", lang2="E", nGram=3, testSet="French")



Accuracy of E vs F using 1-gram model with English testSet is 78.9%
Accuracy of E vs F using 2-gram model with English testSet is 81.89999999999999%
Accuracy of E vs F using 3-gram model with English testSet is 86.7%
Accuracy of F vs E using 1-gram model with Frernch testSet is 71.8%
Accuracy of F vs E using 2-gram model with French testSet is 78.5%
Accuracy of F vs E using 3-gram model with French testSet is 85.2%


### Problem 02: (Language Model Comparison)

In [15]:
# Spanish test words: Spanish ("S") model against Italian ("I") model
acuracyCalculation(trainedModel, spanish_test, lang1="S", lang2="I", nGram=1, testSet="Spanish")
acuracyCalculation(trainedModel, spanish_test, lang1="S", lang2="I", nGram=2, testSet="Spanish")
acuracyCalculation(trainedModel, spanish_test, lang1="S", lang2="I", nGram=3, testSet="Spanish")

# Italian test words: Italian ("I") model against Spanish ("S") model
acuracyCalculation(trainedModel, italian_test, lang1="I", lang2="S", nGram=1, testSet="Italian")
acuracyCalculation(trainedModel, italian_test, lang1="I", lang2="S", nGram=2, testSet="Italian")
acuracyCalculation(trainedModel, italian_test, lang1="I", lang2="S", nGram=3, testSet="Italian")

Accuracy of S vs I using 1-gram model with Spanish testSet is 71.0%
Accuracy of S vs I using 2-gram model with Spanish testSet is 85.1%
Accuracy of S vs I using 3-gram model with Spanish testSet is 85.6%
Accuracy of I vs S using 1-gram model with Italian testSet is 58.5%
Accuracy of I vs S using 2-gram model with Italian testSet is 75.5%
Accuracy of I vs S using 3-gram model with Italian testSet is 83.8%


#### Since the Italian vs. Spanish pair gives the lowest accuracy (71% with the 1-gram model on the Spanish test data and 58.5% with the 1-gram model on the Italian test data), this pair is harder to distinguish.