In [3]:
%pip install tabulate
from tabulate import tabulate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001

def main():
    # Calculate all the ps and qs
    # Eg hamiltonWordProb['congress'] = 0.005
    # hamilton_word_prob['piech'] = 0.0
    # hamilton_word_prob['the'] = 0.001

    hamilton_word_prob = make_word_prob_map('kashmir.txt')
    madison_word_prob = make_word_prob_map('punjab.txt')

    

    # Get the word count of the unknown document
    # Eg unknown_doc_count['congress'] = 5
    unknown_doc_count, n_words = make_word_count_map('Kashmir bard.txt')

    #print("hamilton['congress']\t", hamilton_word_prob['congress'])
   # print("madison['congress']\t",  madison_word_prob['congress'])
   # print("doc_count['congress']\t", unknown_doc_count['congress'])
   # print("n_words", n_words)

    hamilton_term = calc_log_pr_doc_given_author(hamilton_word_prob, unknown_doc_count)
    madison_term = calc_log_pr_doc_given_author(madison_word_prob, unknown_doc_count)
    print("log P(D|H)\t", hamilton_term)
    print("log P(D|M)\t",madison_term)

    print('diff\t', hamilton_term - madison_term)

def calc_log_pr_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    data = []
    log_prob = math.log(1)
    for word_i, c_i in counts.items():
        p_i = get_word_prob(prob_map, word_i)
        log_prob += c_i * math.log(p_i)
        lst = [word_i, c_i, log_prob]
        data.append(lst)
        # print(word_i,"      ", c_i,"   ", log_prob,"   ")
    print(tabulate(data))
    return log_prob


def calcLogProbDoc(wordProbMap, countMap):
    logProb = math.log(1)
    for wordi in countMap:
        ci = countMap[wordi]
        pi = get_word_prob(wordProbMap, wordi)
        logProb += ci * math.log(pi)
    return logProb

# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON

# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    wordMap = {}
    nWords = 0
    data1 = []
    with open(fileName , encoding='utf-8') as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
                #lst1 = [word,wordMap[word]]
                #data1.append(lst1)
                #print(word, " ",wordMap[word] )
        #print(tabulate(data1))
    return wordMap, nWords

# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1
    
# Standardizes a word. For now, we are just going to make it
# lower case.
def standardize(word):
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

if __name__ == '__main__':
    main()

-----------------  --  ----------
the                25    -55.7928
history             1    -60.2758
of                 17   -110.917
kashmir            10   -151.693
is                  3   -193.139
a                   7   -216.83
complex             1   -222.006
and                21   -278.522
contested           1   -292.338
one                 1   -306.153
involving           1   -319.969
multiple            1   -333.784
actors              1   -347.6
perspectives        1   -361.415
here                1   -375.231
brief               1   -389.046
summary             1   -402.862
main                1   -416.677
events              1   -430.493
issues              1   -444.309
that                1   -458.124
have                4   -513.386
shaped              1   -517.869
regions             1   -531.685
past                1   -545.5
present             1   -559.316
                   11   -608.629
region              2   -616.784
in                  8   -645.317
northwestern

In [5]:
from tabulate import tabulate
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001

def main():
    # Calculate all the ps and qs
    # Eg hamiltonWordProb['congress'] = 0.005
    # hamilton_word_prob['piech'] = 0.0
    # hamilton_word_prob['the'] = 0.001

    #hamilton_word_prob = make_word_prob_map('hamilton.txt')
    #madison_word_prob = make_word_prob_map('madison.txt')
    first_word_prob = make_word_prob_map('gilgit.txt')
    second_word_prob = make_word_prob_map('punjab.txt')
    third_word_prob = make_word_prob_map('kashmir.txt')
    fourth_word_prob = make_word_prob_map('kpk.txt')
    fifth_word_prob = make_word_prob_map('lahore.txt')

    

    # Get the word count of the unknown document
    # Eg unknown_doc_count['congress'] = 5
    unknown_doc_count, n_words = make_word_count_map('kashmir_bard.txt')

    #print("hamilton['congress']\t", hamilton_word_prob['congress'])
    #print("madison['congress']\t",  madison_word_prob['congress'])
    #print("doc_count['congress']\t", unknown_doc_count['congress'])
    print("n_words", n_words)

    first_term = calc_log_pr_doc_given_author(first_word_prob, unknown_doc_count)
    second_term = calc_log_pr_doc_given_author(second_word_prob, unknown_doc_count)
    third_term=  calc_log_pr_doc_given_author(third_word_prob, unknown_doc_count)
    fourth_term = calc_log_pr_doc_given_author(fourth_word_prob, unknown_doc_count)
    fifth_term = calc_log_pr_doc_given_author(fifth_word_prob, unknown_doc_count)
    
    print("log P(D|A)\t", first_term)
    print("log P(D|B)\t", second_term)
    print("log P(D|C)\t", third_term)
    print("log P(D|D)\t", fourth_term)
    print("log P(D|E)\t", fifth_term)
    
    
  
    def compare_var(var1, var2):
        result = var1 - var2
        if result > 0:
            return var1
        else:
            return var2

    unk = compare_var(first_term, second_term)
    unk = compare_var(unk, third_term)
    unk = compare_var(unk, fourth_term)
    unk = compare_var(unk, fifth_term)
    
    print('difference\t', unk)



def calc_log_pr_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    data = []
    log_prob = math.log(1)
    for word_i, c_i in counts.items():
        p_i = get_word_prob(prob_map, word_i)
        log_prob += c_i * math.log(p_i)
        lst = [word_i, c_i, log_prob]
        data.append(lst)
        # print(word_i,"      ", c_i,"   ", log_prob,"   ")
    print(tabulate(data))
    return log_prob


def calcLogProbDoc(wordProbMap, countMap):
    logProb = math.log(1)
    for wordi in countMap:
        ci = countMap[wordi]
        pi = get_word_prob(wordProbMap, wordi)
        logProb += ci * math.log(pi)
    return logProb

# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON

# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    wordMap = {}
    nWords = 0
    data1 = []
    with open(fileName , encoding='utf-8') as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
                #lst1 = [word,wordMap[word]]
                #data1.append(lst1)
                #print(word, " ",wordMap[word] )
        #print(tabulate(data1))
    return wordMap, nWords

# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1
    
# Standardizes a word. For now, we are just going to make it
# lower case.
def standardize(word):
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

if __name__ == '__main__':
    main()


# In[ ]:






n_words 292
----------------  --  ----------
the               19    -41.7473
history            2    -53.2524
of                 6    -70.4256
kashmir            5    -95.7227
is                 1   -100.782
a                 11   -142.655
rich               1   -148.408
and               16   -197.12
complex            2   -224.751
tapestry           1   -238.567
woven              1   -252.382
from               1   -258.135
ancient            2   -269.64
kingdoms           1   -283.456
religious          1   -289.208
influences         1   -294.961
political          1   -308.776
struggles          1   -322.592
heres              1   -336.407
brief              1   -350.223
overview           1   -364.038
in                 6   -385.37
paragraph          1   -399.186
                   8   -429.639
early              1   -443.455
nestled            1   -449.207
amidst             1   -463.023
majestic           1   -476.838
himalayas          1   -482.591
kashmirs           4   -53