Import stuff

In [164]:
import nltk
from nltk.corpus import abc
from nltk.tokenize import RegexpTokenizer, word_tokenize
from numpy import log
import sys

START_SYMBOL = '<START>'
END_SYMBOL = '<END>'

Declare documents

Document 1 is a collection of rural news articles and document 2 is a collection of scientific news articles. All articles are from the Australian Broadcasting Commission.

In [165]:
rm_punct = RegexpTokenizer(r"[\w'-]+")

FILE1 = 'rural.txt'
FILE2 = 'science.txt'

doc1_words = rm_punct.tokenize(abc.raw(FILE1).lower())
doc2_words = rm_punct.tokenize(abc.raw(FILE2).lower())

# doc1_words = word_tokenize(abc.raw(FILE1))
# doc2_words = word_tokenize(abc.raw(FILE2))

Funciton to generate unigrams as dictionary where key=word and value=count

In [166]:

def generate_unigrams(doc):
    unigrams = {}

    for word in doc:
        if word not in unigrams:
            unigrams[word] = 0
        unigrams[word] += 1

    return unigrams

Function to generate bigrams.

Returns a dictionary where each key is a word (word1 in a bigram), and each value is another dictionary. The second dictionary has words as keys (word2 in a bigram) and the count of the bigram (word1 word2) as values. So dict[word1][word2] is the count of bigram (word1 word2)

In [167]:
def generate_bigrams(doc):
    bigrams = {}
    prev = START_SYMBOL
    # bigrams[prev] = {}
    # doc = doc[1:]

    for word in doc:
        if prev not in bigrams:
            bigrams[prev] = {}
        if word not in bigrams[prev]:
            bigrams[prev][word] = 0
        bigrams[prev][word] += 1
        prev = word

    # Handle end of file
    # This might have to be removed if I try to do text generation
    if prev not in bigrams:
        bigrams[prev] = {}
    bigrams[prev][END_SYMBOL] = 1
    
    return bigrams

Function to get word counts from bigrams

In [168]:
def word_count_from_bigrams(b):
    counts = {}
    for word1 in b:
        counts[word1] = 0
        for word2 in b[word1]:
            counts[word1] += b[word1][word2]

    return counts

Generate unigram and bigram sets for each document

In [169]:
u1 = generate_unigrams(doc1_words)
u2 = generate_unigrams(doc2_words)

b1 = generate_bigrams(doc1_words)
b2 = generate_bigrams(doc2_words)


Write unigrams and bigrams to files

In [170]:
original_stdout = sys.stdout

with open('Outputs/word_ct_from_b1.txt', 'w') as f:
    sys.stdout = f

    counts = word_count_from_bigrams(b1)
    for word in counts:
        print(word, counts[word])
    sys.stdout = original_stdout

with open('Outputs/word_ct_from_u1.txt', 'w') as f:
    sys.stdout = f

    for word in u1:
        print(word, u1[word])

    sys.stdout = original_stdout

with open('Outputs/b1_counts.txt', 'w') as f:
    sys.stdout = f

    for word1 in b1:
        for word2 in b1[word1]:
            print(word1, word2, b1[word1][word2])

    sys.stdout = original_stdout

with open('Outputs/word_ct_from_b2.txt', 'w') as f:
    sys.stdout = f

    counts = word_count_from_bigrams(b2)
    for word in counts:
        print(word, counts[word])
    sys.stdout = original_stdout

with open('Outputs/word_ct_from_u2.txt', 'w') as f:
    sys.stdout = f

    for word in u2:
        print(word, u2[word])

    sys.stdout = original_stdout

with open('Outputs/b2_counts.txt', 'w') as f:
    sys.stdout = f

    for word1 in b2:
        for word2 in b2[word1]:
            print(word1, word2, b2[word1][word2])

    sys.stdout = original_stdout

Function to reshape bigrams into a dict where key = bigram (word1 word2) and value=count

In [171]:
def reshape_bigram(b):
    reshaped_b = {}

    for word1 in b:
        for word2 in b[word1]:
            reshaped_b[word1 + " " + word2] = b[word1][word2]

    return reshaped_b

Function to sort unigrams by frequency

In [172]:
def unigram_frequency(u):
    return sorted(u.items(), key=lambda x:x[1], reverse=True)

Function to sort bigrams by frequency

In [173]:
def bigram_frequency(b):
    reshaped_b = reshape_bigram(b)

    return unigram_frequency(reshaped_b)


Write unigram and bigram frequencies to files

In [174]:
u1_freq = unigram_frequency(u1)
u2_freq = unigram_frequency(u2)

b1_freq = bigram_frequency(b1)
b2_freq = bigram_frequency(b2)

original_stdout = sys.stdout

with open('Outputs/sorted_u1_counts.txt', 'w') as f:
    sys.stdout = f
    for u in u1_freq:
        print(u[0], u[1])

with open('Outputs/sorted_u2_counts.txt', 'w') as f:
    sys.stdout = f
    for u in u2_freq:
        print(u[0], u[1])

with open('Outputs/sorted_b1_counts.txt', 'w') as f:
    sys.stdout = f
    for u in b1_freq:
        print(u[0], u[1])

with open('Outputs/sorted_b2_counts.txt', 'w') as f:
    sys.stdout = f
    for u in b2_freq:
        print(u[0], u[1])


sys.stdout = original_stdout

Set some values for computing probabilities

In [175]:
n1 = len(doc1_words)
n2 = len(doc2_words)

uniques1 = []
for word in doc1_words:
    uniques1.append(word)

uniques2 = []
for word in doc2_words:
    uniques2.append(word)

uniques_combined = set(uniques1 + uniques2)
v = len(uniques_combined)

b1_reshaped = reshape_bigram(b1)
b2_reshaped = reshape_bigram(b2)

Method to get count of a bigram

In [176]:
def get_bigram_count(word1, word2, b):
    if word1 in b:
        if word2 in b[word1]:
            return b[word1][word2]
    return 0

Method to get count of a unigram

In [177]:
def get_unigram_count(word, u):
    if word in u:
        return u[word]
    return 0

Computing probability of a bigram

In [178]:
def bigram_probability(word1, word2, b, u, v):
    return ((get_bigram_count(word1, word2, b) + 1) / (get_unigram_count(word1, u) + v))


Computing the probability of a sentence

In [179]:
def sent_probability(sent, b, u, v):
    sent = rm_punct.tokenize(sent.lower())
    prob = 0
    prev = START_SYMBOL

    for word in sent:
        prob = prob + log(bigram_probability(prev, word, b, u, v))
        prev = word
    
    word = END_SYMBOL
    prob = prob * bigram_probability(prev, word, b, u, v)

    return prob        

Method to compare probabilities

In [180]:
def get_probabilities(sent, source):
    p1 = sent_probability(sent, b1, u1, v)
    p2 = sent_probability(sent, b2, u2, v)

    print("Sentence: ", sent)
    print("Source: ", source)
    print("Probability of sentence in document 1: ", p1)
    print("Probability of sentence in document 2: ", p2)
    if(p1 > p2):
        print("Prediction: doc1")
    else:
        print("Prediction: doc2")
    print('\n')

Testing

In [182]:
original_stdout = sys.stdout

with open("Output/sentence_probabilities.txt", "w") as f:

    sys.stdout = f

    get_probabilities("A small number of Western Australian bananas have been sent to the eastern states after a sharp drop in demand.", "doc1")

    get_probabilities("The long paddock is getting crowded around Narrabri, in north-west New South Wales, where 25,000 head of livestock are searching for feed on the stock routes.", "doc1")

    get_probabilities("Dr Bernd Irlenbusch of the London School of Economics adds he was surprised with the extent of punishment.", 'doc2')

    get_probabilities("Australians are in a prime position to see Mercury moving across the Sun this week, an event they won't be able to see again for another 26 years.", 'doc2')

sys.stdout = original_stdout

FileNotFoundError: [Errno 2] No such file or directory: 'Output/sentence_probabilities.txt'