# Accessing the Files

In [163]:
import nltk
from nltk import ngrams
import os

# Get the number format such as: 001, 002, 010, 100
def getNum(i):
    num = ""
    if len(str(i)) == 1:
        num = "00" + str(i)
    if len(str(i)) == 2:
        num = "0" + str(i)
    if len(str(i)) == 3:
        num = str(i)
    return num

# Return a tokensized text of the Transcripts
def getMovieTranscripts(low = True):
    movieScripts = []
    movieScores = ""
    if low == True:
        movieScores = "Bad"
    else:
        movieScores = "Good"
    #Taking advantage of the naming convension for the files, iterate through the files.
    for i in list(range(1,101)):
        #Check if the file exists.
        if os.path.isfile('data\\Processed'+ movieScores + 'Scripts\\' + getNum(i) + '-transcript.txt'): 
            f = open('data\\Processed'+ movieScores + 'Scripts\\' + getNum(i) + '-transcript.txt', 'r')
            content = f.read()
            # Normalize the whitespaces and lowercase everything
            newContent = " ".join(content.lower().split())
            movieScripts += nltk.word_tokenize(newContent)
    return addStartTokens(movieScripts)

# Adds a Start Token before each sentence. 
def addStartTokens(script):
    newList = ['[START]']
    count = 1
    for token in script:
        newList.append(token)  
        if token in ['.', '!', '?']:
            newList.insert(count + 1, '[START]')
            count += 1
        count += 1
    if newList[-1] == '[START]':
        newList.pop()
    return newList

badMovieScripts = getMovieTranscripts(low = True)
print("Number of Tokenized Words for Bad Movies:", len(badMovieScripts))

goodMovieScripts = getMovieTranscripts(low = False)
print("Number of Tokenized Words for Good Movies:", len(goodMovieScripts))

Number of Tokenized Words for Bad Movies: 806955
Number of Tokenized Words for Good Movies: 1119560


# Bigram Dictionary

Create a dictionary such that calling dictionary['[START]'] returns a dictionary of all of the words that came after it along with its counts.

In [186]:
from collections import defaultdict

def createBigramCount(scripts):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    bigrams = ngrams(scripts, 2)
    for i in scripts:
        for word1, word2 in bigrams:
            model[word1][word2] += 1
    return model

model = createBigramCount(badMovieScripts)
#model['[START]']

#The dictionary is structured like this: [word1 : [word2: Count(word2)]]
count = 0
for word2 in model['[START]']:
    if count < 20:
        print('[START]', word2, model['[START]'][word2])
    count += 1

[START] oh 1893
[START] you 4665
[START] what 2739
[START] that 1480
[START] it 2441
[START] got 97
[START] i 9058
[START] get 557
[START] is 375
[START] ready 31
[START] three 35
[START] things 18
[START] whoa 134
[START] u.s. 3
[START] they 809
[START] a 673
[START] amy 13
[START] piss 2
[START] listen 163
[START] when 187


# Bigram Model

Given the movies script, the starting word, number of sentences, and alpha value, generate its own sentences.

In [168]:
import numpy as np

def bigramPredict(scripts, word, length, alpha):
    bigramModel = createBigramCount(scripts)
    sentence = word.split()
    #print(np.random.choice(list(model[sentence[-1]]), 1, p = [float(i)/sum(model[sentence[-1]].values()) for i in model[sentence[-1]].values()]))
    count = 0
    # Punctuations will be used to determine the ending of a sentence.
    punctuations = ['.', '!', '?']
    while count < length:
        #sentence.append(max(bigramModel[sentence[-1]], key=bigramModel[sentence[-1]].get))  
        randomToken = np.random.choice(list(bigramModel[sentence[-1]]), 1, p = [float(i + alpha)/(sum(bigramModel[sentence[-1]].values()) + (alpha * len(bigramModel[sentence[-1]].keys()))) for i in bigramModel[sentence[-1]].values()])[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

# Trigram Dictionary

Create a dictionary such that calling dictionary['[START]', 'oh'] returns a dictionary of all of the words that came after these two pairs along with its counts.

In [170]:
def createTrigramCount(scripts):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    trigrams = ngrams(scripts, 3)
    for i in scripts:
        for word1, word2, word3 in trigrams:
            model[(word1, word2)][word3] += 1
    return model

model = createTrigramCount(badMovieScripts)
#The dictionary is structured like this: [(word1, word2) : [word 3: Count(word3)]]
word1_word2 = ('[START]', 'oh')
print("The highest count after 'you, just' is: ", max(model[word1_word2], key=model[word1_word2].get))
for word3 in model[word1_word2]:
    print(word1_word2, word3, model[word1_word2][word3])

The highest count after 'you, just' is:  ,
('[START]', 'oh') , 1415
('[START]', 'oh') no 5
('[START]', 'oh') my 46
('[START]', 'oh') how 1
('[START]', 'oh') god 11
('[START]', 'oh') . 140
('[START]', 'oh') yeah 19
('[START]', 'oh') right 2
('[START]', 'oh') dropped 1
('[START]', 'oh') on 1
('[START]', 'oh') well 3
('[START]', 'oh') coopie 1
('[START]', 'oh') hey 1
('[START]', 'oh') and 2
('[START]', 'oh') hi 3
('[START]', 'oh') listen 1
('[START]', 'oh') really 1
('[START]', 'oh') look 2
('[START]', 'oh') shit 6
('[START]', 'oh') fuck 1
('[START]', 'oh') ! 172
('[START]', 'oh') harsh 1
('[START]', 'oh') ohh 1
('[START]', 'oh') all 1
('[START]', 'oh') that 1
('[START]', 'oh') simon 1
('[START]', 'oh') ugly 1
('[START]', 'oh') : 1
('[START]', 'oh') hello 1
('[START]', 'oh') thanks 1
('[START]', 'oh') cute 1
('[START]', 'oh') oh 3
('[START]', 'oh') wait 1
('[START]', 'oh') love 1
('[START]', 'oh') we 2
('[START]', 'oh') were 1
('[START]', 'oh') yes 3
('[START]', 'oh') now 2
('[START]', 'o

# Trigram Model

Given the movies script, the starting word, number of sentences, and alpha value, generate its own sentences.

In [145]:
from collections import defaultdict
def trigramPredict(scripts, word, length, alpha):
    bigramModel = createBigramCount(scripts)
    trigramModel = createTrigramCount(scripts)
    sentence = []
    sentence.append(word)
    sentence.append(np.random.choice(list(bigramModel[word]), 1, p = [float(i + alpha)/(sum(bigramModel[word].values()) + (alpha * len(bigramModel[word].keys()))) for i in bigramModel[word].values()])[0])
    count = 0
    punctuations = ['.', '!', '?']
    while count < length:
        if len(trigramModel[(sentence[-2], sentence[-1])]) == 0:
            pick = np.random.choice(list(bigramModel[sentence[-1]]), 1, p = [float(i + alpha)/(sum(bigramModel[sentence[-1]].values()) + (alpha * len(bigramModel[sentence[-1]].keys()))) for i in bigramModel[sentence[-1]].values()])[0]
            if pick in punctuations:
                count += 1
            sentence.append(pick)
        else:
            #sentence.append(max(trigramModel[(sentence[-2], sentence[-1])], key=trigramModel[(sentence[-2], sentence[-1])].get))
            randomToken = np.random.choice(list(trigramModel[(sentence[-2], sentence[-1])]), 1, p = [float(i + alpha)/(sum(trigramModel[(sentence[-2], sentence[-1])].values()) + (alpha * len(trigramModel[(sentence[-2], sentence[-1])].keys()))) for i in trigramModel[(sentence[-2], sentence[-1])].values()])[0]
            if randomToken in punctuations:
                count += 1
            sentence.append(randomToken)
            
    return sentence

# Generating Sentences

To make the sentences presentable, remove the start token, and correct the spacings in between words and punctuations.

In [221]:
def generate(ls):
    output = ls[1].capitalize()
    for token in ls[2:]:
        if token == '[START]':
            pass
        elif output[-1] in ['.', '!', '?'] or token == 'i':
            output += " " + token.capitalize()
        elif token in [',', ':', '.', '!', '?'] or token[0] == "'" or token == "n't":
            output += token
        else:
            output += " " + token
    return output

print("### Bigram - Bad Movies ###")
print(generate(bigramPredict(badMovieScripts, '[START]', 5, 0.1)))
print()
print("### Bigram - Good Movies ###")
print(generate(bigramPredict(goodMovieScripts, '[START]', 5, 0.1)))
print()
print("### Trigram - Bad Movies ###")
print(generate(trigramPredict(badMovieScripts, '[START]', 5, 0.1)))
print()
print("### Trigram - Good Movies ###")
print(generate(trigramPredict(goodMovieScripts, '[START]', 5, 0.1)))

### Bigram - Bad Movies ###
''lovecake'' she's being so inconspicuous and skilled enough for this to. Come on, I missed. I'm not remember. You. They are showing off.

### Bigram - Good Movies ###
Fuck what was determined to. Go. Let's right here. That glowers in nepal? Alex.

### Trigram - Bad Movies ###
Are you kidding me. Then the water. If your interests are noble, but I want save passage for me? Come on! He once killed his own daughter betraying me to say everything twice.

### Trigram - Good Movies ###
What could they be a woman soon soon you gon na make anyone'pay'. Hotel freezer day m.s. I wanted to get into character. Men have emptied entire clips at them the devil do you say yes! You mind if I ever thank you, sir.


# Probability of Sentences

Calculate the probability of a sentence being generated by the n-gram models.

In [224]:
def bigramProb(sentence, scripts, alpha):
    s = sentence.split()
    bigramModel = createBigramCount(scripts)
    prob = float(bigramModel[s[0]][s[1]] + alpha)/(sum(bigramModel[s[0]].values()) + (alpha * len(bigramModel[s[0]].keys())))
    index = 1
#     print(prob)
    for word in s[2:]:
        product = float(bigramModel[s[index]][word] + alpha)/(sum(bigramModel[s[index]].values()) + (alpha * len(bigramModel[s[index]].keys())))
#         print(product)
        prob *= product
        index += 1
    return prob


def trigramProb(sentence, scripts, alpha):
    s = sentence.split()
    bigramModel = createBigramCount(scripts)
    prob = float(bigramModel[s[0][1]][s[2]] + alpha)/(sum(bigramModel[s[0][1]].values()) + (alpha * len(bigramModel[s[0][1]].keys())))
    index = 1
#     print(prob)
    for word in s[3:]:
        product = float(bigramModel[s[index][index + 1]][word] + alpha)/(sum(bigramModel[s[index][index + 1]].values()) + (alpha * len(bigramModel[s[index][index + 1]].keys())))
#         print(product)
        prob *= product
        index += 1
    return prob

# EXAMPLE
s = "[START] you don't worry."
print("Bigram: Prob(", s, ") = ", bigramProb(s, badMovieScripts, 0.1))
print("Trigram: Prob(", s, ") = ", trigramProb(s, badMovieScripts, 0.1))

Bigram: Prob( [START] you don't worry. ) =  2.341232113375408e-07
Trigram: Prob( [START] you don't worry. ) =  0.007246376811594203
