# Accessing the Files

In [16]:
import nltk
from nltk import ngrams
import os

# Get the number format such as: 001, 002, 010, 100
def getNum(i):
    """Format a file index as a zero-padded string of at least three characters.

    Args:
        i: The index to format (anything accepted by ``str``), e.g. 1, 10, 100.

    Returns:
        str: ``str(i)`` left-padded with zeros to width 3 ("001", "010", "100").
        Values that are already three or more characters long are returned
        unchanged instead of collapsing to an empty string.
    """
    return str(i).zfill(3)

# Return the tokenized text of all the transcripts in the selected score category
def getMovieTranscripts(low = True):
    """Read and tokenize every available transcript of one score category.

    Args:
        low: When truthy, read from ``data/ProcessedBadScripts``; otherwise
            read from ``data/ProcessedGoodScripts``.

    Returns:
        list: The tokens of all transcripts found, concatenated in file order.
        Indices whose transcript file does not exist are skipped.
    """
    movieScores = "Bad" if low else "Good"
    # os.path.join keeps the lookup working on every platform, not only where
    # the backslash is the path separator.
    scriptsDir = os.path.join('data', 'Processed' + movieScores + 'Scripts')

    movieScripts = []
    # Taking advantage of the naming convention for the files (001 .. 100).
    for i in range(1, 101):
        path = os.path.join(scriptsDir, getNum(i) + '-transcript.txt')
        # Skip indices that have no transcript on disk.
        if not os.path.isfile(path):
            continue
        # The context manager closes the handle even if reading fails.
        with open(path, 'r') as f:
            content = f.read()
        # Normalize the whitespace and lowercase everything.
        newContent = " ".join(content.lower().split())
        movieScripts.extend(nltk.word_tokenize(newContent))
    return movieScripts

# Load both corpora first, then report how many tokens each one holds.
badMovieScripts = getMovieTranscripts(low=True)
goodMovieScripts = getMovieTranscripts(low=False)

print("Number of Tokenized Words for Bad Movies:", len(badMovieScripts))
print("Number of Tokenized Words for Good Movies:", len(goodMovieScripts))

Number of Tokenized Words for Bad Movies: 714839
Number of Tokenized Words for Good Movies: 994330


# Bigram Model

In [17]:
from collections import defaultdict

def createBigramCount(scripts):
    """Count how often each token is immediately followed by each other token.

    Args:
        scripts: An iterable of tokens, in reading order.

    Returns:
        defaultdict: A nested mapping ``model[word1][word2]`` holding the number
        of times ``word2`` directly follows ``word1``. Both levels are
        defaultdicts, so looking up an unseen key yields an empty mapping / 0.
    """
    model = defaultdict(lambda: defaultdict(int))
    # A single pass is enough: remember the previous token and count the pair
    # it forms with the current one.
    word1 = None
    for position, word2 in enumerate(scripts):
        if position >= 1:
            model[word1][word2] += 1
        word1 = word2
    return model

model = createBigramCount(badMovieScripts)

# The dictionary is structured like this: {word1: {word2: Count(word2)}}
# word1 has to be bound in this cell: inside createBigramCount it is only a
# local name, so on a fresh kernel the print below would raise NameError.
word1 = 'oh'
# This summary line appears in the recorded output of this cell.
print("The highest count after 'oh' is: ", max(model[word1], key=model[word1].get))
for word2 in model[word1]:
    print(word1, word2, model[word1][word2])

The highest count after 'oh' is:  ,
oh , 1754
oh no 12
oh god 24
oh my 86
oh how 2
oh . 167
oh yeah 24
oh right 2
oh dropped 1
oh on 1
oh well 4
oh coopie 1
oh hey 10
oh and 2
oh okay 2
oh hi 9
oh listen 1
oh really 1
oh look 2
oh shit 7
oh fuck 1
oh ! 209
oh harsh 1
oh burn 1
oh i 10
oh ohh 1
oh you 3
oh this 1
oh think 1
oh got 1
oh nice 1
oh oh 10
oh damn 1
oh does 1
oh all 1
oh here 2
oh that 2
oh people 1
oh simon 1
oh ugly 1
oh : 1
oh hello 1
oh # 1
oh mmm 1
oh thanks 1
oh cute 1
oh ooh 1
oh wait 1
oh love 1
oh we 2
oh were 1
oh yes 6
oh honey 3
oh dear 2
oh beautiful 1
oh now 2
oh barb 1
oh ? 6
oh boy 3
oh err 1
oh right.. 1
oh snap 1
oh yea 1
oh wait.. 1
oh great 2
oh peter 1
oh ah 1
oh what 1
oh get 1
oh but 1
oh it 2
oh leonard 1
oh ms. 1
oh very 1
oh ready 1
oh if 1
oh fred 2
oh dumdum 1
oh hell 1
oh thats 1
oh jesus 1
oh by 1
oh sure 1
oh ' 1
oh forget 1
oh a 1
oh sorry 1
oh too 1
oh weird 1
oh just 1
oh l 1
oh thank 1
oh licking 1


In [114]:
import numpy as np

def bigramPredict(scripts, word, length):
    """Generate text by repeatedly sampling the next token from bigram counts.

    Args:
        scripts: The token sequence the bigram counts are built from.
        word: Seed text; it is split on whitespace and its last token starts
            the chain. An empty seed raises IndexError when ``length`` > 0.
        length: Number of sentence-ending tokens ('.', '!' or '?') to generate
            before stopping.

    Returns:
        list: The seed tokens followed by the sampled tokens. Generation stops
        early if the last token has no recorded follower.
    """
    bigramModel = createBigramCount(scripts)
    sentence = word.split()
    count = 0
    punctuations = ['.', '!', '?']
    while count < length:
        # .get avoids inserting an empty entry for an unseen token.
        followers = bigramModel.get(sentence[-1])
        if not followers:
            # Dead end: nothing ever followed this token, so there is no
            # distribution to sample from.
            break
        candidates = list(followers)
        # Compute the normaliser once instead of once per candidate.
        total = sum(followers.values())
        probabilities = [occurrences / total for occurrences in followers.values()]
        randomToken = np.random.choice(candidates, 1, p = probabilities)[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

# Trigram Model

In [48]:
def createTrigramCount(scripts):
    """Count how often each pair of consecutive tokens is followed by a token.

    Args:
        scripts: An iterable of tokens, in reading order.

    Returns:
        defaultdict: A nested mapping ``model[(word1, word2)][word3]`` holding
        the number of times ``word3`` directly follows the pair
        ``word1 word2``. Both levels are defaultdicts, so looking up an unseen
        key yields an empty mapping / 0.
    """
    model = defaultdict(lambda: defaultdict(int))
    # A single pass is enough: keep the two previous tokens and count the
    # triple they form with the current one.
    word1 = word2 = None
    for position, word3 in enumerate(scripts):
        if position >= 2:
            model[(word1, word2)][word3] += 1
        word1, word2 = word2, word3
    return model

# Build the trigram counts for the bad-movie corpus.
model = createTrigramCount(badMovieScripts)
# Layout of the dictionary: {(word1, word2): {word3: Count(word3)}}
word1_word2 = ('you', 'just')
print(
    "The highest count after 'you, just' is: ",
    max(model[word1_word2], key=lambda candidate: model[word1_word2][candidate]),
)
# List every continuation seen after the pair, with its count.
for word3 in model[word1_word2].keys():
    print(word1_word2, word3, model[word1_word2][word3])

The highest count after 'you, just' is:  got
('you', 'just') got 12
('you', 'just') walk 1
('you', 'just') do 8
('you', 'just') ca 4
('you', 'just') name 1
('you', 'just') get 4
('you', 'just') gon 1
('you', 'just') wish 1
('you', 'just') say 3
('you', 'just') drive 1
('you', 'just') wait 1
('you', 'just') have 3
('you', 'just') kept 2
('you', 'just') said 4
('you', 'just') keep 1
('you', 'just') found 1
('you', 'just') never 3
('you', 'just') heard 1
('you', 'just') shut 1
('you', 'just') go 1
('you', 'just') wrecked 1
('you', 'just') fart 1
('you', 'just') hear 1
('you', 'just') kissed 1
('you', 'just') pop 1
('you', 'just') had 2
('you', 'just') leave 5
('you', 'just') let 1
('you', 'just') stuff 1
('you', 'just') passing 1
('you', 'just') stay 1
('you', 'just') come 1
('you', 'just') give 2
('you', 'just') listen 2
('you', 'just') taik 1
('you', 'just') iove 1
('you', 'just') wanted 2
('you', 'just') the 1
('you', 'just') take 5
('you', 'just') confess 1
('you', 'just') met 1
('you

In [100]:
from collections import defaultdict
def _sampleNextToken(counts):
    """Draw one key of ``counts`` at random, weighted by its count."""
    candidates = list(counts)
    # Compute the normaliser once instead of once per candidate.
    total = sum(counts.values())
    probabilities = [occurrences / total for occurrences in counts.values()]
    return np.random.choice(candidates, 1, p = probabilities)[0]


def trigramPredict(scripts, word, length):
    """Generate text by sampling from trigram counts, backing off to bigrams.

    Args:
        scripts: The token sequence both count models are built from.
        word: The single seed token that starts the text.
        length: Number of sentence-ending tokens ('.', '!' or '?') to generate
            before stopping.

    Returns:
        list: The seed token followed by the sampled tokens. The second token
        is always drawn from the bigram counts. Generation stops early when
        neither model has a recorded follower for the current context.
    """
    bigramModel = createBigramCount(scripts)
    trigramModel = createTrigramCount(scripts)
    sentence = [word]

    # A trigram context needs two tokens, so the first step uses bigrams.
    firstFollowers = bigramModel.get(word)
    if not firstFollowers:
        # The seed never occurs with a follower: nothing to sample from.
        return sentence
    sentence.append(_sampleNextToken(firstFollowers))

    count = 0
    punctuations = ['.', '!', '?']
    while count < length:
        # Prefer the trigram context; fall back to the bigram context when the
        # pair was never seen. .get avoids inserting empty entries.
        followers = (trigramModel.get((sentence[-2], sentence[-1]))
                     or bigramModel.get(sentence[-1]))
        if not followers:
            # Dead end in both models.
            break
        randomToken = _sampleNextToken(followers)
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)

    return sentence

In [116]:
def generate(ls):
    """Join a list of tokens into a readable, capitalized string.

    The first token, any token that follows a sentence-ending character, and
    the token 'i' are capitalized. Punctuation tokens, tokens starting with an
    apostrophe and "n't" are attached to the preceding text without a space;
    everything else is separated by a single space.
    """
    sentenceEnders = ('.', '!', '?')
    attachedPunctuation = (',', ':') + sentenceEnders

    text = ""
    for token in ls:
        if not text:
            # Nothing emitted yet: this token opens the text.
            text = token.capitalize()
            continue
        if text[-1] in sentenceEnders or token == 'i':
            text = text + " " + token.capitalize()
            continue
        if token in attachedPunctuation or token[0] == "'" or token == "n't":
            text = text + token
            continue
        text = text + " " + token
    return text

# Two sentences per corpus from the bigram model, bad movies first.
print(
    "Bigram",
    generate(bigramPredict(badMovieScripts, 'the', 2)),
    generate(bigramPredict(goodMovieScripts, 'the', 2)),
    sep="\n",
)

# Same comparison with the trigram model.
print(
    "Trigram",
    generate(trigramPredict(badMovieScripts, 'the', 2)),
    generate(trigramPredict(goodMovieScripts, 'the', 2)),
    sep="\n",
)

Bigram
The bliss, tuesday. Ragged, you and wrapups.
The waiting that's nothing that? He is this random, because we're trying to you have peace, or a second darkness, let yourself the gun in jeopardy!
Trigram
The fish is fresh, bouncing off the bed! You drew that?
The stairs, picks up a cd case from a vendor in the distance, a girl from the ground in bright balls of flames. Alfredo pay attention.
