# Accessing the Files

In [1]:
import nltk
from nltk import ngrams
import os

# Get the number format such as: 001, 002, 010, 100
def getNum(i):
    num = ""
    if len(str(i)) == 1:
        num = "00" + str(i)
    if len(str(i)) == 2:
        num = "0" + str(i)
    if len(str(i)) == 3:
        num = str(i)
    return num

# Return a tokensized text of the Transcripts
def getMovieTranscripts(low = True):
    movieScripts = []
    movieScores = ""
    if low == True:
        movieScores = "Bad"
    else:
        movieScores = "Good"
    #Taking advantage of the naming convension for the files, iterate through the files.
    for i in list(range(1,101)):
        #Check if the file exists.
        if os.path.isfile('data\\Processed'+ movieScores + 'Scripts\\' + getNum(i) + '-transcript.txt'): 
            f = open('data\\Processed'+ movieScores + 'Scripts\\' + getNum(i) + '-transcript.txt', 'r')
            content = f.read()
            # Normalize the whitespaces and lowercase everything
            newContent = " ".join(content.lower().split())
            movieScripts += nltk.word_tokenize(newContent)
    return addStartTokens(movieScripts)

# Adds a Start Token before each sentence. 
def addStartTokens(script):
    sentenceList = []
    sentence = ['[START]']
    newList = ['[START]']
    count = 1
    for token in script:
        newList.append(token)
        sentence.append(token)  
        if token in ['.', '!', '?']:
            newList.insert(count + 1, '[START]')
            sentenceList.append(sentence)
            sentence = ['[START]']
            count += 1
        count += 1
    if newList[-1] == '[START]':
        newList.pop()
    return newList, sentenceList

badMovieScripts, badSentenceList = getMovieTranscripts(low = True)
print("Number of Tokenized Words for Bad Movies:", len(badMovieScripts))
print("Number of sentences for Bad Movies:", len(badSentenceList))
print("Vocab Size:", len(set(badMovieScripts)))

goodMovieScripts, goodSentenceList = getMovieTranscripts(low = False)
print("Number of Tokenized Words for Good Movies:", len(goodMovieScripts))
print("Number of sentences for Good Movies:", len(goodSentenceList))
print("Vocab Size:", len(set(goodMovieScripts)))

Number of Tokenized Words for Bad Movies: 806955
Number of sentences for Bad Movies: 92115
Vocab Size: 29506
Number of Tokenized Words for Good Movies: 1119560
Number of sentences for Good Movies: 125230
Vocab Size: 30427


# Bigram Dictionary

Create a dictionary such that calling dictionary['[START]'] returns a dictionary of all of the words that came after it along with its counts.

In [2]:
from collections import defaultdict

def createBigramCount(scripts):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    bigrams = ngrams(scripts, 2)
    for i in scripts:
        for word1, word2 in bigrams:
            model[word1][word2] += 1
    return model

model = createBigramCount(badMovieScripts)
#model['[START]']

#The dictionary is structured like this: [word1 : [word2: Count(word2)]]
count = 0
for word2 in model['[START]']:
    if count < 20:
        print('[START]', word2, model['[START]'][word2])
    count += 1

[START] oh 1893
[START] you 4665
[START] what 2739
[START] that 1480
[START] it 2441
[START] got 97
[START] i 9058
[START] get 557
[START] is 375
[START] ready 31
[START] three 35
[START] things 18
[START] whoa 134
[START] u.s. 3
[START] they 809
[START] a 673
[START] amy 13
[START] piss 2
[START] listen 163
[START] when 187


# Bigram Model

Given the movies script, the starting word, number of sentences, and alpha value, generate its own sentences.

In [14]:
import numpy as np

def bigramPredict(scripts, word, length):
    bigramModel = createBigramCount(scripts)
    sentence = word.split()
    #print(np.random.choice(list(model[sentence[-1]]), 1, p = [float(i)/sum(model[sentence[-1]].values()) for i in model[sentence[-1]].values()]))
    count = 0
    # Punctuations will be used to determine the ending of a sentence.
    punctuations = ['.', '!', '?']
    while count < length:
        randomToken = max(bigramModel[sentence[-1]], key=bigramModel[sentence[-1]].get)  
        #randomToken = np.random.choice(list(bigramModel[sentence[-1]]), 1, p = [float(i + alpha)/(sum(bigramModel[sentence[-1]].values()) + (alpha * len(bigramModel[sentence[-1]].keys()))) for i in bigramModel[sentence[-1]].values()])[0]
        #randomToken = np.random.choice(list(bigramModel[sentence[-1]]), 1, p = [float(i)/(sum(bigramModel[sentence[-1]].values())) for i in bigramModel[sentence[-1]].values()])[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

# Trigram Dictionary

Create a dictionary such that calling dictionary['[START]', 'oh'] returns a dictionary of all of the words that came after these two pairs along with its counts.

In [4]:
def createTrigramCount(scripts):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    trigrams = ngrams(scripts, 3)
    for i in scripts:
        for word1, word2, word3 in trigrams:
            model[(word1, word2)][word3] += 1
    return model

model = createTrigramCount(badMovieScripts)
#The dictionary is structured like this: [(word1, word2) : [word 3: Count(word3)]]
word1_word2 = ('[START]', 'oh')
print("The highest count after 'you, just' is: ", max(model[word1_word2], key=model[word1_word2].get))
for word3 in model[word1_word2]:
    print(word1_word2, word3, model[word1_word2][word3])

The highest count after 'you, just' is:  ,
('[START]', 'oh') , 1415
('[START]', 'oh') no 5
('[START]', 'oh') my 46
('[START]', 'oh') how 1
('[START]', 'oh') god 11
('[START]', 'oh') . 140
('[START]', 'oh') yeah 19
('[START]', 'oh') right 2
('[START]', 'oh') dropped 1
('[START]', 'oh') on 1
('[START]', 'oh') well 3
('[START]', 'oh') coopie 1
('[START]', 'oh') hey 1
('[START]', 'oh') and 2
('[START]', 'oh') hi 3
('[START]', 'oh') listen 1
('[START]', 'oh') really 1
('[START]', 'oh') look 2
('[START]', 'oh') shit 6
('[START]', 'oh') fuck 1
('[START]', 'oh') ! 172
('[START]', 'oh') harsh 1
('[START]', 'oh') ohh 1
('[START]', 'oh') all 1
('[START]', 'oh') that 1
('[START]', 'oh') simon 1
('[START]', 'oh') ugly 1
('[START]', 'oh') : 1
('[START]', 'oh') hello 1
('[START]', 'oh') thanks 1
('[START]', 'oh') cute 1
('[START]', 'oh') oh 3
('[START]', 'oh') wait 1
('[START]', 'oh') love 1
('[START]', 'oh') we 2
('[START]', 'oh') were 1
('[START]', 'oh') yes 3
('[START]', 'oh') now 2
('[START]', 'o

# Trigram Model

Given the movies script, the starting word, number of sentences, and alpha value, generate its own sentences.

In [15]:
from collections import defaultdict
def trigramPredict(scripts, word, length):
    bigramModel = createBigramCount(scripts)
    trigramModel = createTrigramCount(scripts)
    sentence = []
    sentence.append(word)
    #sentence.append(np.random.choice(list(bigramModel[word]), 1, p = [float(i + alpha)/(sum(bigramModel[word].values()) + (alpha * len(bigramModel[word].keys()))) for i in bigramModel[word].values()])[0])
    #sentence.append(np.random.choice(list(bigramModel[word]), 1, p = [float(i)/(sum(bigramModel[word].values())) for i in bigramModel[word].values()])[0])
    sentence.append(max(bigramModel['[START]'], key=bigramModel['[START]'].get))
    
    count = 0
    punctuations = ['.', '!', '?']
    while count < length:
        if len(trigramModel[(sentence[-2], sentence[-1])]) == 0:
            pick = max(bigramModel[sentence[-1]], key=bigramModel[sentence[-1]].get)
            #pick = np.random.choice(list(bigramModel[sentence[-1]]), 1, p = [float(i + alpha)/(sum(bigramModel[sentence[-1]].values()) + (alpha * len(bigramModel[sentence[-1]].keys()))) for i in bigramModel[sentence[-1]].values()])[0]
            #pick = np.random.choice(list(bigramModel[sentence[-1]]), 1, p = [float(i)/(sum(bigramModel[sentence[-1]].values())) for i in bigramModel[sentence[-1]].values()])[0]
            if pick in punctuations:
                count += 1
            sentence.append(pick)
        else:
            maxToken = max(trigramModel[(sentence[-2], sentence[-1])], key=trigramModel[(sentence[-2], sentence[-1])].get)
            #randomToken = np.random.choice(list(trigramModel[(sentence[-2], sentence[-1])]), 1, p = [float(i + alpha)/(sum(trigramModel[(sentence[-2], sentence[-1])].values()) + (alpha * len(trigramModel[(sentence[-2], sentence[-1])].keys()))) for i in trigramModel[(sentence[-2], sentence[-1])].values()])[0]
            #randomToken = np.random.choice(list(trigramModel[(sentence[-2], sentence[-1])]), 1, p = [float(i)/(sum(trigramModel[(sentence[-2], sentence[-1])].values())) for i in trigramModel[(sentence[-2], sentence[-1])].values()])[0]
            if maxToken in punctuations:
                count += 1
            sentence.append(maxToken)
            
    return sentence

# Generating Sentences

To make the sentences presentable, remove the start token, and correct the spacings in between words and punctuations.

In [16]:
def generate(ls):
    output = ls[1].capitalize()
    for token in ls[2:]:
        if token == '[START]':
            pass
        elif output[-1] in ['.', '!', '?'] or token == 'i':
            output += " " + token.capitalize()
        elif token in [',', ':', '.', '!', '?'] or token[0] == "'" or token == "n't":
            output += token
        else:
            output += " " + token
    return output

print("### Trigram - Bad Movies ###")
print(generate(trigramPredict(badMovieScripts, '[START]', 1)))
print()
print("### Trigram - Good Movies ###")
print(generate(trigramPredict(goodMovieScripts, '[START]', 1)))

### Trigram - Bad Movies ###
I'm not gon na be a good time.

### Trigram - Good Movies ###
I'm not going to be a little more.


# Probability of Sentences

Calculate the probability of a sentence being generated by the n-gram models.

In [127]:
import math

def bigramProb(sentence, scripts):
    s = sentence
    bigramModel = createBigramCount(scripts)
    prob = 0
    index = 0
    product = 0
    for word in s[1:]:
        #print((s[index], word))
        if sum(bigramModel[s[index]].values()) != 0:
            product = bigramModel[s[index]][word]/sum(bigramModel[s[index]].values())
            product = math.log(product, 2)
            prob += product
        product = 0
        index += 1
    return prob/len(s)

def trigramProb(sentence, scripts):
    s = sentence
    print(s)
    bigramModel = createBigramCount(scripts)
    trigramModel = createTrigramCount(scripts)
    #Always get the start token: log(1) = 0
    prob = 0
    #P(W2)
    if bigramModel[s[0]][s[1]] != 0 and sum(bigramModel[s[0]].values()) != 0:
        p = bigramModel[s[0]][s[1]]/sum(bigramModel[s[0]].values())
        prob += math.log(p, 2)
    print("w1", prob)
    #P(W3 - onwards)
    index = 2
    product = 0
    for word in s[2:]:
        freq = trigramModel[(s[index-2], s[index -1])][s[index]]
        total = sum(trigramModel[(s[index-2], s[index -1])].values())
        if freq != 0 and total != 0:
            product = freq/total
            product = math.log(product, 2)
            prob += product
        if product == 0 and bigramModel[s[index - 1]][word] != 0 and sum(bigramModel[s[index - 1]].values()) != 0:
            product = bigramModel[s[index - 1]][word]/sum(bigramModel[s[index - 1]].values())
            product = math.log(product, 2)
            prob += product
        if product == 0:
            product = (1 + 0.1) / (len(bigramModel[s[index - 1]].values()) + (0.1*(len(set(badMovieScripts)))))
            product = math.log(product, 2)
            prob += product
        print(index, "- ", product)
        product = 0
        index += 1
    return pow(2, -(prob/len(s)))

#EXAMPLE
s = ['[START]', 'you', 'have', 'got', 'superstar', 'written', 'aii', 'over', 'you', '.']
print("Bigram: Prob(", s, ") = ", bigramProb(s, badMovieScripts))
print("Trigram: Prob(", s, ") = ", trigramProb(s, badMovieScripts))

Bigram: Prob( ['[START]', 'you', 'have', 'got', 'superstar', 'written', 'aii', 'over', 'you', '.'] ) =  -5.1787040961277295
['[START]', 'you', 'have', 'got', 'superstar', 'written', 'aii', 'over', 'you', '.']
w1 -4.303502779419425
2 -  -4.9492566264140425
3 -  -6.638073837180718
4 -  -3.321928094887362
5 -  -2.8073549220576046
6 -  -3.1699250014423126
7 -  -5.149747119504682
8 -  -1.0
9 -  -0.4150374992788438
Trigram: Prob( ['[START]', 'you', 'have', 'got', 'superstar', 'written', 'aii', 'over', 'you', '.'] ) =  9.034736893409807


In [162]:
import random
np.random.seed(1)
random.shuffle(badSentenceList)
random.shuffle(goodSentenceList)

trainSet_Bad = badSentenceList[: 82904] # 90%
trainSet_Good = goodSentenceList[: 112707] # 90%
flattenBad = [sentence for sublist in trainSet_Bad for sentence in sublist]
flattenGood = [sentence for sublist in trainSet_Good for sentence in sublist]

testSet_Bad = badSentenceList[82904:] # 10%
testSet_Good = goodSentenceList[112707:] # 10%

# Perplexity of Trigram

In [214]:
perplexitySentences = []

def perplexity(testSet):
    bigramModel = createBigramCount(flattenBad)
    trigramModel = createTrigramCount(flattenBad)
    summation = 0
    words = 0
    for s in testSet:
        # substract 1 to ignore the start token
        words += (len(s) - 1)
        pSentence = 0
        for index, word in enumerate(s):
            # P(word | [START])
            if index == 1:
                p = (bigramModel['[START]'][s[index]] + 0.1)/(sum(bigramModel['[START]'].values()) + (0.1 * len(bigramModel['[START]'].values())))
                summation += math.log(p, 2)
                pSentence += math.log(p, 2)
            elif index > 1:
                freq = trigramModel[(s[index-2], s[index -1])][s[index]]
                if freq != 0:
                    total = sum(trigramModel[(s[index-2], s[index -1])].values())
                    p = (freq + 0.1)/(total + (0.1 * len(trigramModel[(s[index-2], s[index -1])].values())))
                    summation += math.log(p, 2)
                    pSentence += math.log(p, 2)
                else:
                    # Backoff
                    p = (bigramModel[index - 1][s[index]] + 0.1)/(sum(bigramModel[index-1].values()) + (0.1 * len(bigramModel[index-1].values())))
                    if p != 0:
                        summation += math.log(p, 2)
                        pSentence += math.log(p, 2)
                    else:
                        p = 0.1/(total + (0.1 * len(bigramModel[s[index -1]].values())))
                        summation += math.log(p, 2)
                        pSentence += math.log(p, 2)
        perplexitySentences.append((pSentence, s))
    average = summation/words
    return pow(2, -average)

print(perplexity(testSet_Bad))

#Bad: 111.97985098906223     69.3270132684494
#Good: 113.56405507656099    77.69581596661679

# perplexityList = sorted(perplexitySentences, key=lambda x: x[0])

# for i in perplexityList[0:10]:
#     print(len(i[1]))

69.3270132684494


# Perplexity of Bigram

In [215]:
def perplexityBigram(testSet):
    bigramModel = createBigramCount(flattenBad)
    summation = 0
    words = 0
    for s in testSet:
        # substract 1 to ignore the start token
        words += (len(s) - 1)
        for index, word in enumerate(s):
            # P(word | [START])
            if index > 1:
                p = (bigramModel[index - 1][s[index]] + 0.1)/(sum(bigramModel[index - 1].values()) + (0.1 * len(bigramModel[index - 1].values())))
                summation += math.log(p, 2)
    average = summation/words
    return pow(2, -average)

print(perplexityBigram(testSet_Bad))

#Bad: 111.97985098906223
#Good: 113.56405507656099

#Bad: 194.99396460330078
#Bad: 274.03572856788634

194.99396460330078


In [15]:
def cost(train, test):
    match = 0
    total = 0
    bigramModel = createBigramCount(train)
    trigramModel = createTrigramCount(train)

    for sentence in test:
        #print(sentence)
        index = 0
        maxToken = ''
        for word in range(0, len(sentence) - 2):
            wordPair = (sentence[index], sentence[index + 1])
            if len(trigramModel[wordPair]) == 0:
                if len(bigramModel[sentence[index + 1]]) == 0:
                    #print(wordPair)
                    maxToken = ''
                else:
                    maxToken = max(bigramModel[sentence[index + 1]], key=bigramModel[sentence[index + 1]].get)
            else:
                maxToken = max(trigramModel[wordPair], key=trigramModel[wordPair].get)
            #print(maxToken, sentence[index + 2])
            if maxToken == sentence[index + 2]:
                match += 1
            total += 1
            index += 1
    return match/total
        
print("Accuracy of Bad Movies: ", cost(flattenBad, testSet_Bad))
print("Accuracy of Good Movies: ", cost(flattenGood, testSet_Good))


Accuracy of Bad Movies:  0.23698533858550858
Accuracy of Good Movies:  0.22967978907992878


In [82]:
def cost_of_Bigram(train, test):
    match = 0
    total = 0
    bigramModel = createBigramCount(train)

    for sentence in test:
        #print(sentence)
        index = 0
        for word in range(0, len(sentence) - 2):
            wordPair = (sentence[index])
            if len(bigramModel[sentence[index]]) == 0:
                #print(wordPair)
                maxToken = ''
            else:
                maxToken = max(bigramModel[wordPair], key=bigramModel[wordPair].get)
            #print(maxToken, sentence[index + 2])
            if maxToken == sentence[index + 1]:
                match += 1
            total += 1
            index += 1
    return match/total
        
print("Accuracy of Bad Movies: ", cost_of_Bigram(flattenBad, testSet_Bad))
print("Accuracy of Good Movies: ", cost_of_Bigram(flattenGood, testSet_Good))

Accuracy of Bad Movies:  0.14571517301122897
Accuracy of Good Movies:  0.13743605921986615


# Word2Vec Word Embedding

In [13]:
from gensim.models import Word2Vec
for sublist in badSentenceList:
    del sublist[0]
print(badSentenceList[0:10]) 

[['oh', ',', 'shit', '.'], ['you', 'just', 'got', 'wolfed', '.'], ['what', '?'], ['that', 'is', 'an', 'official', 'trademark', 'that', 'i', 'am', 'getting', 'registered', '.'], ['it', "'s", 'a', 'lot', 'of', 'stuff', 'you', 'got', 'ta', 'do', ',', 'hoops', 'you', 'got', 'ta', 'jump', 'through', '.'], ['got', 'ta', 'get', 'on', 'the', 'internet', '.'], ['got', 'ta', 'go', 'to', 'some', 'stupidass', 'website', 'where', 'you', 'register', 'a', 'catch', 'phrase', '.'], ['i', 'wanted', '``', 'bam', ',', "''", 'but', 'emeril', 'had', 'taken', 'it', '.'], ['i', "'m", 'rambling', ',', 'man', '.'], ['get', 'up', ',', 'man', '.']]


In [4]:
from gensim.models import Word2Vec
model = Word2Vec(badSentenceList, size=200, window=5, min_count=1, workers=4, iter=500)
model.save("word2vec_Bad_200")

In [12]:
for sublist in goodSentenceList:
    del sublist[0]
print(goodSentenceList[0:10]) 

[['mr.', 'dufresne', ',', 'describe', 'the', 'confrontation', 'you', 'had', 'with', 'your', 'wife', 'the', 'night', 'she', 'was', 'murdered', '.'], ['it', 'was', 'very', 'bitter', '.'], ['she', 'said', 'she', 'was', 'glad', 'i', 'knew', ',', 'that', 'she', 'hated', 'all', 'the', 'sneaking', 'around', '.'], ['and', 'she', 'said', 'that', 'she', 'wanted', 'a', 'divorce', 'in', 'reno', '.'], ['what', 'was', 'your', 'response', '?'], ['i', 'told', 'her', 'i', 'would', 'not', 'grant', 'one', '.'], ['``', 'i', "'ll", 'see', 'you', 'in', 'hell', 'before', 'i', 'see', 'you', 'in', 'reno', '.'], ["''", 'those', 'were', 'your', 'words', ',', 'according', 'to', 'your', 'neighbors', '.'], ['if', 'they', 'say', 'so', '.'], ['i', 'really', 'do', "n't", 'remember', '.']]


In [7]:
model = Word2Vec(goodSentenceList, size=200, window=5, min_count=1, workers=4, iter=500)
model.save("word2vec_Good_200")

In [15]:
import csv

with open("Tokenized Bad Sentences.csv", "w", newline="", encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(badSentenceList)

In [42]:
with open('output.csv', 'r') as f:
    reader = csv.reader(f)
    your_list = list(reader)
    
print(your_list[3789:3790])    

[['hey', ',', 'nate', '.']]
