In [112]:
# initializations

import string
import re
from collections import Counter
from itertools import product
from itertools import permutations
import random
from numpy import cumsum
import numpy as np

In [113]:
# read and process the script

# Load the raw script text. The encoding is passed explicitly, so decoding does
# not depend on the platform's default encoding.
with open('script.txt', encoding="utf-8") as f:
    data = f.read()

def process_text(data):
    """Normalise raw text to the 27-symbol alphabet: 'a'-'z' plus the space.

    Steps, in order:
      1. lower-case everything;
      2. turn every run of whitespace (newlines, tabs, ...) into one space, so
         that words on adjacent lines stay separate words;
      3. delete every remaining character that is not a-z or a space;
      4. collapse consecutive spaces and trim leading/trailing spaces.

    Parameters
    ----------
    data : str
        Raw text.

    Returns
    -------
    str
        The cleaned text; empty input gives an empty string.
    """
    data = data.lower()
    # Must happen before the strip below: otherwise "end\nstart" would lose the
    # newline and become the single pseudo-word "endstart".
    data = re.sub(r"\s+", " ", data)
    data = re.sub(r"[^a-z ]+", "", data)
    # Deleting punctuation can leave double spaces ("a - b" -> "a  b").
    data = " ".join(data.split())
    return data

# Normalise the raw script with process_text: the result is lower case,
# contains only a-z and single spaces, and has no consecutive spaces.
# `data` is rebound to the cleaned string, which ngram() below uses by default.
data = process_text(data)

In [114]:
# n-grams for computing probability transition table

# all possible characters: the space followed by 'a'..'z' (27 symbols).
# This order is also the row/column order of every table written to a file below.
allchar = " " + string.ascii_lowercase

# count every n-gram of the text, including the ones that never occur
def ngram(n, data=data):
    """Return a dict mapping each length-n string over `allchar` to its count in `data`.

    Every possible n-gram is present as a key (count 0 when unseen), in the
    order produced by itertools.product over `allchar`. Substrings of `data`
    that contain characters outside `allchar` are appended as extra keys.

    Note: the default for `data` is the value the global `data` had when this
    function was defined.
    """
    # start from a zero count for each of the len(allchar) ** n possible n-grams
    counts = {"".join(symbols): 0 for symbols in product(allchar, repeat=n)}
    # slide a window of width n over the text and tally what it sees
    for start in range(len(data) - n + 1):
        gram = data[start : start + n]
        counts[gram] = counts.get(gram, 0) + 1
    return counts

In [115]:
# question 1

# Script chosen: Rush (script.txt)

In [116]:
# question 2

# unigram counts and their relative frequencies (rounded to 4 decimals)
unigram = ngram(1)
unigram_prob = {ch: round(count / len(data), 4) for ch, count in unigram.items()}

# write the 27 unigram probabilities as one comma-separated line, in allchar order
with open("unigram_prob.txt", "w") as f:
    f.write(",".join(str(unigram_prob[ch]) for ch in allchar))


In [117]:
# question 3

# bigram transition probability table
bigram = ngram(2)
# without laplace smoothing: Pr(second | first) = count(first + second) / count(first),
# rounded to 4 decimals. A character that never occurs in the script has
# count 0; its whole row is reported as 0.0 instead of raising ZeroDivisionError.
bigram_prob = {
    ch: round(bigram[ch] / unigram[ch[0]], 4) if unigram[ch[0]] else 0.0
    for ch in bigram
}

# write the bigram probabilities (27*27 numbers) to a file: one comma-separated
# row per first character, columns indexed by the second character
with open("bigram_prob.txt", "w") as f:
    for ch1 in allchar:
        f.write(",".join([str(bigram_prob[ch1 + ch2]) for ch2 in allchar]) + "\n")

In [118]:
# question 4

# with Laplace (add-one) smoothing: one pseudo-count for each of the
# len(allchar) == 27 possible next characters, rounded to 5 decimals
bigram_prob_laplace = {
    pair: round((count + 1) / (unigram[pair[0]] + len(allchar)), 5)
    for pair, count in bigram.items()
}

# write the smoothed table (27*27 numbers): one comma-separated row per first character
with open("bigram_prob_laplace.txt", "w") as f:
    for ch1 in allchar:
        print(*(bigram_prob_laplace[ch1 + ch2] for ch2 in allchar), sep=",", file=f)

In [119]:
# trigram transition probability table

trigram = ngram(3)
# with Laplace smoothing: Pr(third | first two) with one pseudo-count per
# possible third character; kept unrounded
trigram_prob_laplace = {
    tri: (count + 1) / (bigram[tri[:2]] + len(allchar))
    for tri, count in trigram.items()
}

In [120]:
# generate sentences using the n-gram model

def weighted_choice(collection, weights):
    """Randomly choose an element from collection according to weights.

    Parameters
    ----------
    collection : sequence
        Candidates; must support integer indexing.
    weights : sequence of numbers
        Non-negative weight of each candidate. They need not sum to 1.

    Returns
    -------
    The chosen element of `collection`.

    Raises
    ------
    ValueError
        If `weights` is empty or does not sum to a positive number.
    """
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    # `not total > 0` also catches an empty array (sum 0.0) and NaN.
    if not total > 0:
        raise ValueError("weights must be non-empty and sum to a positive number")
    cumulative = weights.cumsum() / total
    x = random.random()
    for i, upper in enumerate(cumulative):
        if x < upper:
            return collection[i]
    # cumsum() and sum() add in different orders, so cumulative[-1] can end up a
    # hair below 1.0 and x may land above it. Return the last candidate that
    # has a non-zero weight instead of falling off the end with None.
    return collection[np.flatnonzero(weights)[-1]]
        
def gen_bi(ch):
    '''Sample the character that follows `ch` from the Laplace-smoothed bigram table'''
    # weight of every candidate next character, in allchar order
    weights = []
    for nxt in allchar:
        weights.append(bigram_prob_laplace[ch + nxt])
    picked = weighted_choice(allchar, weights)
    return picked[0]

def gen_tri(ch):
    '''Sample the character that follows the two-character context `ch` from the Laplace-smoothed trigram table'''
    # weight of every candidate third character, in allchar order
    weights = []
    for nxt in allchar:
        weights.append(trigram_prob_laplace[ch + nxt])
    picked = weighted_choice(allchar, weights)
    return picked[0]

def gen_sen(ch, num):
    '''Generate a sentence of length num given the first character ch.

    The second character is drawn from the bigram model. Every later
    character is drawn from the trigram model conditioned on the previous two
    characters, unless that pair never occurs in the data, in which case the
    bigram model conditioned on the previous character is used instead.

    For num < 2 nothing is generated: the result is ch for num == 1 and the
    empty string for num <= 0.
    '''
    if num < 2:
        # Without this guard the bigram step below would always add a second
        # character and the result would be longer than requested.
        return ch[:max(num, 0)]
    # use bigram model to generate the second character
    res = ch + gen_bi(ch)
    for _ in range(num - 2):
        context = res[-2:]
        if bigram[context] == 0:
            # unseen pair: the smoothed trigram row would be uniform, so fall
            # back to the bigram model on the last character
            res += gen_bi(context[-1])
        else:
            res += gen_tri(context)
    return res

In [121]:
# question 5

# generate 26 sentences of length 1000, one starting with each letter a-z.
# The generator is seeded first so that Restart & Run All reproduces the same
# sentences, and therefore the same classification of them in question 9.
random.seed(0)
sentences = []
with open("sentences.txt", "w") as f:
    for ch in string.ascii_lowercase:
        sentence = gen_sen(ch, 1000)
        # one sentence per line
        f.write(sentence + "\n")
        sentences.append(sentence)

In [122]:
# question 6

# example of a sentence generated by the model: 1000 characters starting with "a".
# This is a fresh draw, separate from the sentences generated for question 5.
print(gen_sen("a", 1000))

alfpxcqued intioned the the aheneens in reame nis sunting the the a ables haver whital clas a comeresinamqlses avestinalton wattlent bliki one is inal hured jamet loss ore alles rolwtromitelly you commicg him iff spiters coles byvers thed vo ingsegintene then ittrair ted paught laupquaughtur but itchaki a thaidefpen an skend take burd one re bozd thering james forstep to puyqrake wat an thespiki inrvg ted iosees falow up is baa astars kas lospan aging nikin din to biket lont wousnis mes trus whygre cand what hits evg hadut mait is ane conabor carlento of you daysx gralfceles tryoune alles awar graces a james suzy jamen barted ond at this setd jame tion coul gethiser the criche a wit in nou whowithe me subbtwo se ople craceske per hees dopay huzy but racks of mare james tarovert by th kyout up disits the plest grand of conow exto go whimpitherles pur inurnes thes niki peniking a brat ggdrignd niki con ey tobhks of thista the hermd luktimpbeadentezn hint tv queing thimzolfbjamenightbecon

In [123]:
# read and process the fake script

# load the fake script and normalise it with the same process_text as the real one
with open('fake_script.txt', encoding="utf-8") as f:
    fake_data = process_text(f.read())

In [124]:
# hyperparameters for a Naive Bayes classifier

# prior probabilities of the two classes:
# P_script is the prior of the real script, P_fake_script the prior of the fake one
P_script = 0.8
P_fake_script = 0.2

In [125]:
# question 7

# likelihoods under the fake script, i.e. Pr(character | fake_script),
# as relative character frequencies rounded to 4 decimals
fake_unigram = ngram(1, data=fake_data)
fake_unigram_prob = {
    ch: round(count / len(fake_data), 4) for ch, count in fake_unigram.items()
}

# write the 27 fake-script likelihoods as one comma-separated line, in allchar order
with open('fake_likelihood.txt', 'w') as f:
    print(*(fake_unigram_prob[ch] for ch in allchar), sep=',', end='', file=f)


In [126]:
# question 8

# computing posterior probabilities for fake script i.e. Pr(fake_script | character)
# with Bayes' rule:
#   Pr(fake | ch) = Pr(ch | fake) Pr(fake) / (Pr(ch | fake) Pr(fake) + Pr(ch | real) Pr(real))

def fake_posterior(ch):
    """Return Pr(fake_script | ch), rounded to 4 decimals.

    The likelihoods are taken from the raw character counts rather than from
    the tables that were rounded for the output files, so rounding happens only
    once, on the final result. If `ch` occurs in neither script the character
    carries no evidence and the prior P_fake_script is returned (instead of
    dividing by zero).
    """
    joint_fake = P_fake_script * fake_unigram[ch] / len(fake_data)
    joint_real = P_script * unigram[ch] / len(data)
    evidence = joint_fake + joint_real
    if evidence == 0:
        return P_fake_script
    return round(joint_fake / evidence, 4)

# write the 27 posteriors as one comma-separated line, in allchar order
with open('fake_posterior.txt', 'w') as f:
    f.write(','.join([str(fake_posterior(ch)) for ch in allchar]))

In [129]:
# question 9

# classify the sentences generated in question 5 as real ('0') or fake ('1')
# with a Naive Bayes rule over character unigrams, computed in log space.

def _log_likelihoods(counts, total):
    """Map each character to log10(count / total); -inf for a character never seen.

    Working from the raw counts matters: in the 4-decimal tables a rare
    character can round to 0.0, and log10(0) = -inf would then decide the whole
    sentence on its own (and emit a RuntimeWarning).
    """
    return {c: (np.log10(k / total) if k else -np.inf) for c, k in counts.items()}

log_real = _log_likelihoods(unigram, len(data))
log_fake = _log_likelihoods(fake_unigram, len(fake_data))

classifications = []
for sentence in sentences:
    # log prior + sum of per-character log likelihoods for each class
    real = np.log10(P_script) + sum(log_real[ch] for ch in sentence)
    fake = np.log10(P_fake_script) + sum(log_fake[ch] for ch in sentence)
    # a tie is labelled fake
    classifications.append('0' if real > fake else '1')

# write only after every label has been computed, so a failure above does not
# leave a truncated output file behind
with open('classification.txt', 'w') as f:
    f.write(','.join(classifications))
