# Accessing the Files

In [163]:
import nltk
from nltk import ngrams
import os

# Get the number format such as: 001, 002, 010, 100
def getNum(i):
    """Return ``i`` as a string left-padded with zeros to at least 3 digits.

    This is the numeric prefix used by the transcript file names, e.g.
    1 -> "001", 10 -> "010", 100 -> "100".  Numbers with more than three
    digits are returned unpadded (previously they produced an empty string).
    """
    return str(i).zfill(3)

# Return the tokenized text of all transcripts in one group, with '[START]' markers added
def getMovieTranscripts(low = True):
    """Load, normalize and tokenize every transcript of one movie group.

    Files are looked up as ``data/Processed<Bad|Good>Scripts/NNN-transcript.txt``
    for NNN in 001..100; numbers without a matching file are skipped.

    Args:
        low: Boolean flag. True reads the "Bad" scripts directory,
            False reads the "Good" one.

    Returns:
        A flat list of lowercase tokens from all transcripts found, passed
        through ``addStartTokens`` so sentence starts carry a '[START]' marker.
    """
    movieScores = "Bad" if low else "Good"
    # os.path.join keeps the lookup working on non-Windows systems too.
    scriptDir = os.path.join('data', 'Processed' + movieScores + 'Scripts')

    movieScripts = []
    # Taking advantage of the naming convention for the files, iterate through the files.
    for i in range(1, 101):
        path = os.path.join(scriptDir, getNum(i) + '-transcript.txt')
        # Gaps in the numbering are expected, so missing files are skipped.
        if not os.path.isfile(path):
            continue
        # The context manager closes the handle even if reading fails.
        with open(path, 'r') as f:
            content = f.read()
        # Normalize the whitespaces and lowercase everything
        newContent = " ".join(content.lower().split())
        movieScripts += nltk.word_tokenize(newContent)
    return addStartTokens(movieScripts)

def addStartTokens(script):
    """Return a copy of ``script`` with '[START]' marking each sentence start.

    A '[START]' token is placed at the very beginning and after every
    '.', '!' or '?' token.  If the resulting list ends with '[START]',
    that last element is removed.
    """
    sentenceEnders = ('.', '!', '?')
    marked = ['[START]']
    for tok in script:
        marked.append(tok)
        if tok in sentenceEnders:
            # The next token (if any) opens a new sentence.
            marked.append('[START]')
    # Do not leave a marker dangling at the end of the list.
    if marked[-1] == '[START]':
        del marked[-1]
    return marked

# Tokenize both corpora first, then report how many tokens each one holds.
badMovieScripts = getMovieTranscripts(low=True)
goodMovieScripts = getMovieTranscripts(low=False)

badTokenCount = len(badMovieScripts)
goodTokenCount = len(goodMovieScripts)
print("Number of Tokenized Words for Bad Movies:", badTokenCount)
print("Number of Tokenized Words for Good Movies:", goodTokenCount)

Number of Tokenized Words for Bad Movies: 806955
Number of Tokenized Words for Good Movies: 1119560


# Bigram Model

In [167]:
from collections import defaultdict

def createBigramCount(scripts):
    """Count how often each token is immediately followed by each other token.

    Args:
        scripts: Iterable of tokens in corpus order.

    Returns:
        A nested defaultdict where ``model[word1][word2]`` is the number of
        times ``word2`` directly follows ``word1``.  Pairs that never occur
        read as 0.
    """
    # Function-scope import so the cell does not depend on a new file-level import.
    from itertools import tee

    model = defaultdict(lambda: defaultdict(lambda: 0))
    # Two iterators over the same stream, the second one token ahead, yield
    # every adjacent pair in a single pass.
    firsts, seconds = tee(scripts)
    next(seconds, None)
    for word1, word2 in zip(firsts, seconds):
        model[word1][word2] += 1
    return model

# Bigram counts for the low-scoring corpus.
model = createBigramCount(badMovieScripts)

# Layout of the model: {word1: {word2: count of word2 right after word1}}.
# Print every token that follows '[START]' together with its count.
for word2, pairCount in model['[START]'].items():
    print('[START]', word2, pairCount)

[START] oh 1893
[START] you 4665
[START] what 2739
[START] that 1480
[START] it 2441
[START] got 97
[START] i 9058
[START] get 557
[START] is 375
[START] ready 31
[START] three 35
[START] things 18
[START] whoa 134
[START] u.s. 3
[START] they 809
[START] a 673
[START] amy 13
[START] piss 2
[START] listen 163
[START] when 187
[START] how 753
[START] bloody 3
[START] just 443
[START] do 1007
[START] or 100
[START] let 817
[START] hang 33
[START] there 609
[START] all 649
[START] everyone 69
[START] bottoms 1
[START] sick 2
[START] mark 171
[START] cave 1
[START] beaver 1
[START] your 300
[START] the 1591
[START] did 270
[START] yeah 1431
[START] wake 19
[START] flavaflav 1
[START] she 968
[START] will 64
[START] look 487
[START] unbelievable 7
[START] come 1098
[START] goodbye 53
[START] are 429
[START] grow 2
[START] hey 925
[START] happy 31
[START] take 213
[START] dyke 2
[START] thanks 224
[START] yo 57
[START] he 1399
[START] we 1965
[START] call 52
[START] tails 3
[START] punch 1
[S

[START] flying 6
[START] makes 5
[START] figure 3
[START] may 41
[START] took 1
[START] recently 1
[START] plus 8
[START] heard 4
[START] step 23
[START] chris 8
[START] such 10
[START] sarcasm 1
[START] am 37
[START] traffic 3
[START] beep 6
[START] new 9
[START] women 10
[START] party 10
[START] relax 30
[START] turning 1
[START] magic 4
[START] serendipity 1
[START] stalked 1
[START] um 54
[START] desperately 2
[START] san 2
[START] same 14
[START] nathalie 4
[START] sure 101
[START] again 19
[START] authorities 1
[START] caught 2
[START] $ 16
[START] awesome 12
[START] biggest 1
[START] victoria 3
[START] fantastic 9
[START] whoo 52
[START] fashion 2
[START] lucky 8
[START] anyways 5
[START] dubbed 1
[START] currently 1
[START] however 13
[START] software 4
[START] degree 1
[START] hopefully 2
[START] follow 31
[START] family 3
[START] yup 9
[START] sexy 2
[START] meet 22
[START] nope 16
[START] rick 1
[START] want 22
[START] early 2
[START] chicks 4
[START] work 7
[START] understo

[START] 899 1
[START] 900 1
[START] 901 1
[START] 902 1
[START] 903 1
[START] 904 1
[START] 905 1
[START] 906 1
[START] 907 1
[START] 908 1
[START] porky 1
[START] 909 1
[START] 910 1
[START] 911 3
[START] 912 1
[START] 913 1
[START] 914 1
[START] 915 1
[START] 916 1
[START] 917 1
[START] 918 1
[START] 919 1
[START] 920 1
[START] 921 1
[START] 922 1
[START] 923 1
[START] 924 1
[START] 925 1
[START] 927 1
[START] 928 1
[START] 929 1
[START] 930 1
[START] 931 1
[START] 932 1
[START] 933 1
[START] 934 1
[START] 936 1
[START] 937 1
[START] 938 1
[START] 939 1
[START] 940 1
[START] 941 1
[START] 942 1
[START] 943 1
[START] 944 1
[START] 945 1
[START] 946 1
[START] 947 1
[START] 948 1
[START] 949 1
[START] 950 1
[START] 951 1
[START] 952 1
[START] 953 1
[START] 954 1
[START] 955 1
[START] 956 1
[START] 957 1
[START] 960 1
[START] 961 1
[START] 962 1
[START] 963 1
[START] 964 1
[START] 965 1
[START] 966 1
[START] 967 1
[START] 968 1
[START] 970 1
[START] 971 1
[START] 972 1
[START] 973 1
[STA

[START] p.o.b 3
[START] limo 3
[START] passenger 1
[START] already 4
[START] brian 20
[START] behind 8
[START] larry 44
[START] bob 5
[START] certainly 10
[START] gather 2
[START] somewhere 5
[START] lf 26
[START] eat 16
[START] bull 3
[START] cow 2
[START] apparently 8
[START] melbourne 1
[START] ; 2
[START] sydney 1
[START] sunny 1
[START] adelaide 1
[START] around 7
[START] soon 14
[START] yourself 2
[START] unsolvable 1
[START] starkman 1
[START] seriously 10
[START] interested 1
[START] kick 6
[START] loosely 1
[START] hence 2
[START] com 2
[START] mrs. 25
[START] leonard 3
[START] australia 1
[START] nature 3
[START] satisfy 1
[START] lead 4
[START] its 12
[START] forward 5
[START] loving 1
[START] shoulders 1
[START] legs 1
[START] hips 2
[START] firm 1
[START] surrounding 1
[START] lily 1
[START] forgot 1
[START] dgame 1
[START] lady 11
[START] second 7
[START] mind 8
[START] stick 12
[START] friend 6
[START] clean 5
[START] nearsighted 1
[START] ricki 1
[START] kiss 9
[START] 

[START] nilbog 2
[START] alone 5
[START] evidently 1
[START] joshua 23
[START] sing 10
[START] row 1
[START] holly 4
[START] country 2
[START] typical 4
[START] hospitality 1
[START] tighten 1
[START] dinner 6
[START] josh 4
[START] haha 7
[START] eggs 1
[START] eugh 7
[START] bacon 2
[START] special 6
[START] free 5
[START] d'you 2
[START] appetising 1
[START] provocative 1
[START] repulsive 1
[START] symbol 1
[START] flesh 1
[START] hamburgers 1
[START] steaks 1
[START] smelly 1
[START] nests 1
[START] clusters 1
[START] vicious 3
[START] mm 9
[START] mmmm 2
[START] monstrous 1
[START] drew 8
[START] ddad 1
[START] together 3
[START] upstairs 3
[START] touch 4
[START] argh 5
[START] ingrate 1
[START] movie=troll2 1
[START] carnival 1
[START] julius 1
[START] flame 1
[START] rayne 4
[START] domastir 5
[START] kagan 10
[START] katarin 1
[START] brimstone 5
[START] blood 4
[START] dearest 1
[START] thus 1
[START] scratch 2
[START] dhamphir 2
[START] mostly 3
[START] generally 1
[START] 

[START] skip 3
[START] buthey 1
[START] hoid 9
[START] absoiuteiy 1
[START] cook 3
[START] ciowns 1
[START] fer 1
[START] hoiia 1
[START] giddyup 2
[START] feiias 1
[START] candy 5
[START] oopsies 1
[START] yasie 1
[START] precious 1
[START] weicome 1
[START] foie 1
[START] consider 1
[START] shoo 3
[START] jumbo 1
[START] dexterdo 1
[START] el 18
[START] bratz 5
[START] 5 3
[START] aiabama 1
[START] boring 2
[START] horribie 2
[START] ohno 1
[START] nuhuh 1
[START] deiete 1
[START] est 2
[START] biue 2
[START] badumcha 1
[START] oakmeai 2
[START] stiff 1
[START] diva 6
[START] judges 1
[START] bag 2
[START] exhibit 2
[START] aiiamerican 1
[START] stiii 1
[START] actuaiiy 2
[START] express 2
[START] outrageous 1
[START] congratuiations 1
[START] subtitied 1
[START] addie 4
[START] textopolis 1
[START] devil 2
[START] crier 1
[START] morning 7
[START] monkey 3
[START] meh 7
[START] practicing 1
[START] g'day 1
[START] konnichiwa 1
[START] gene 54
[START] working 2
[START] rocket 1
[STAR

[START] titanic 1
[START] pacino 1
[START] top 5
[START] cici 1
[START] pit 2
[START] cheesecake 1
[START] lam 1
[START] gloobledy 1
[START] blobbledobble 1
[START] hibbledy 1
[START] qu 1
[START] mayor 1
[START] dulciwhata 1
[START] badabada 1
[START] bongi 1
[START] para 1
[START] coodlee 1
[START] muleteers 1
[START] milady 1
[START] pokee 1
[START] dulcinea 1
[START] wha 4
[START] dunk 1
[START] burn 3
[START] espn 1
[START] triplets 1
[START] twins 3
[START] wet 1
[START] minka 1
[START] image 2
[START] citizen 1
[START] reset 1
[START] helping 1
[START] krebs 3
[START] william 3
[START] doctor 17
[START] dry 1
[START] bubbly 1
[START] curly 3
[START] naturally 4
[START] chanel 2
[START] 5. 2
[START] customs 1
[START] barb 11
[START] apartment 1
[START] retinal 6
[START] 10 8
[START] bounty 1
[START] steamboat 1
[START] willis 6
[START] moi 1
[START] bullshit 3
[START] add 5
[START] camille 4
[START] alexander 1
[START] needless 2
[START] rumor 2
[START] identification 1
[START] s

[START] spit 6
[START] ll 4
[START] eliminate 1
[START] rally 1
[START] krieger 1
[START] passageway 1
[START] traitor 1
[START] gun 4
[START] lnsurance 1
[START] frabbrizio 2
[START] pistachio 17
[START] uniess 3
[START] fantastico 1
[START] sophia 1
[START] ''lovecake 2
[START] arrivederci 1
[START] ransacked 1
[START] poiice 2
[START] heaveniy 1
[START] heip 1
[START] hoiy 1
[START] grandpapa 2
[START] grandfather 3
[START] fooi 1
[START] latex 1
[START] abraham 1
[START] rough 1
[START] vote 1
[START] impossibie 1
[START] ''if 1
[START] behoid 1
[START] deviin 3
[START] buffoon 1
[START] ciown 1
[START] energico 2
[START] become 5
[START] ''become 1
[START] india 2
[START] tickietickietickietickie 1
[START] disguiseys 1
[START] aiways 3
[START] dignity 1
[START] observe 1
[START] eibows 1
[START] eibow 1
[START] learn 3
[START] aiiow 2
[START] ''levei 1
[START] history 1
[START] freak 6
[START] brave 1
[START] barney 8
[START] measurements 2
[START] technicaiiy 1
[START] ''master 1

[START] 215a 1
[START] 215b 1
[START] 215c 1
[START] 215d 1
[START] 222 2
[START] vine 1
[START] shakes 1
[START] gordon 10
[START] 228 1
[START] gothamites 1
[START] flashing 1
[START] redbird 2
[START] statue 4
[START] schematics 2
[START] 246 2
[START] rooftop 2
[START] 246a 1
[START] rooftops 2
[START] 252a 2
[START] 257a 1
[START] 270 2
[START] 270a 1
[START] 270b 1
[START] 273 2
[START] floored 1
[START] judo 1
[START] arkham 10
[START] blossom 5
[START] mideastern 1
[START] slaves 1
[START] current 1
[START] pale 1
[START] golum 2
[START] venom 1
[START] icy 1
[START] various 1
[START] unauthorized 1
[START] kickstarts 1
[START] leather 1
[START] costumed 1
[START] dyed 1
[START] pierced 1
[START] banker 2
[START] engines 1
[START] 305 1
[START] receiving 1
[START] sits 1
[START] arckham 1
[START] puzzled 1
[START] mannequins 1
[START] leans 2
[START] 331 2
[START] dangerously 1
[START] auxiliary 1
[START] empty 2
[START] 333/ 1
[START] pipes 1
[START] 342a 1
[START] 342b 1
[STA

[START] 70a 1
[START] jimmyolsen 1
[START] warfield 27
[START] voila 1
[START] lacy 98
[START] teacher 9
[START] nervous 3
[START] zap 1
[START] defiantly 1
[START] metropolis 9
[START] 89 1
[START] guide 3
[START] 91 1
[START] branded 1
[START] oohs 1
[START] throws 1
[START] pretends 1
[START] research 2
[START] reads 2
[START] walks 1
[START] ridicule 1
[START] newfound 1
[START] classmate 1
[START] 104 1
[START] closes 1
[START] locks 1
[START] listens 1
[START] opens 1
[START] plaster 1
[START] headline 1
[START] stands 1
[START] fffaaather 1
[START] barring 1
[START] young 3
[START] metro 6
[START] c'mon 3
[START] forcing 1
[START] including 1
[START] 'look 2
[START] wideeyed 1
[START] whoosh 1
[START] crunch 1
[START] whispers 1
[START] thoroughly 1
[START] club 2
[START] alleyway 1
[START] car 4
[START] 152a 1
[START] rubs 1
[START] material 1
[START] mumbles 1
[START] reporter 2
[START] blurring 1
[START] stops 1
[START] sticks 1
[START] gale 1
[START] thunderous 1
[START] sno

[START] doctors 1
[START] speed 1
[START] hatch 2
[START] lmpossible 1
[START] anchor 1
[START] jan 1
[START] allez 2
[START] mummy 1
[START] zero 2
[START] splash 1
[START] lapd 1
[START] fireworks 1
[START] univershell 1
[START] ga. 1
[START] academy 1
[START] flintstone 1
[START] cranes 1
[START] mechanic 1
[START] frederick 1
[START] melvin 1
[START] gifts 3
[START] ya 3
[START] wilma 23
[START] chip 9
[START] klaatu 1
[START] fred 13
[START] ga 2
[START] naa 1
[START] dumdums 1
[START] violent 1
[START] betty 6
[START] upper 2
[START] atlast 1
[START] style 1
[START] numero 1
[START] f 1
[START] yeaa 1
[START] rockcandy 1
[START] ringtoss 1
[START] middle 2
[START] future 1
[START] barn 1
[START] classy 1
[START] hire 1
[START] dino 2
[START] morale 1
[START] cavalry 1
[START] mount 1
[START] charmed 1
[START] stones 1
[START] diamonds 1
[START] rocko 1
[START] buzz 1
[START] forty 2
[START] bank 1
[START] credit 1
[START] roxie 2
[START] freedom 1
[START] fourteen 1
[START] bowli

[START] religion 1
[START] ecumenical 1
[START] locusts 2
[START] spraying 1
[START] almighty 1
[START] possessed 1
[START] breaking 1
[START] remaining 1
[START] phiiip 1
[START] penn 2
[START] georgetown 1
[START] prospect 2
[START] gangway 1
[START] ian 3
[START] well.. 1
[START] centipedes 1
[START] ja 1
[START] fuckin 1
[START] ahhhhh 2
[START] beverly 1
[START] faboo 1
[START] viv 4
[START] armand 1
[START] bimini 1
[START] preppy 1
[START] sam 1
[START] grasshopper 1
[START] rachel 3
[START] shiatsu 1
[START] spoiled 2
[START] dive 2
[START] gardener 1
[START] mucho 1
[START] perdóname 1
[START] pack 1
[START] surfer 1
[START] cayuca 3
[START] hola 2
[START] las 1
[START] dogfights 2
[START] mexico 1
[START] rache 1
[START] delgado 6
[START] weighing 1
[START] clearly 1
[START] bro 1
[START] vámonos 1
[START] helps 1
[START] diablo 1
[START] ashe 1
[START] carthay 1
[START] bienvenidos 1
[START] mutt 1
[START] angela 1
[START] marvellously 1
[START] snotty 1
[START] pass 1
[STAR

In [168]:
import numpy as np

def bigramPredict(scripts, word, length, alpha):
    """Generate tokens by sampling from a smoothed bigram model of ``scripts``.

    Each next token is drawn from the tokens observed after the current last
    token, with probability ``(count + alpha) / (total + alpha * k)`` where
    ``total`` is the sum of those counts and ``k`` the number of distinct
    observed successors.

    Args:
        scripts: Token list the bigram counts are built from.
        word: Whitespace-separated seed text; generation continues from its
            last token.
        length: Number of sentence-ending tokens ('.', '!', '?') to generate
            before stopping.
        alpha: Additive smoothing constant applied to every observed count.

    Returns:
        The seed tokens followed by the sampled tokens.  The list is shorter
        than requested when the seed is empty or a token with no observed
        successor is reached (these cases used to raise).
    """
    bigramModel = createBigramCount(scripts)
    sentence = word.split()
    punctuations = ('.', '!', '?')
    count = 0
    while sentence and count < length:
        successors = bigramModel[sentence[-1]]
        if not successors:
            # Dead end: nothing was ever seen after this token.
            break
        # Compute the normalizer once instead of once per successor.
        denominator = sum(successors.values()) + alpha * len(successors)
        probabilities = [float(c + alpha) / denominator for c in successors.values()]
        randomToken = np.random.choice(list(successors), 1, p=probabilities)[0]
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)
    return sentence

# Trigram Model

In [170]:
def createTrigramCount(scripts):
    """Count how often each token follows each pair of consecutive tokens.

    Args:
        scripts: Iterable of tokens in corpus order.

    Returns:
        A nested defaultdict where ``model[(word1, word2)][word3]`` is the
        number of times ``word3`` directly follows the pair
        ``word1 word2``.  Combinations that never occur read as 0.
    """
    # Function-scope import so the cell does not depend on a new file-level import.
    from itertools import tee

    model = defaultdict(lambda: defaultdict(lambda: 0))
    # Three iterators over the same stream, offset by 0, 1 and 2 tokens,
    # yield every consecutive triple in a single pass.
    firsts, seconds, thirds = tee(scripts, 3)
    next(seconds, None)
    next(thirds, None)
    next(thirds, None)
    for word1, word2, word3 in zip(firsts, seconds, thirds):
        model[(word1, word2)][word3] += 1
    return model

model = createTrigramCount(badMovieScripts)
# The dictionary is structured like this: {(word1, word2): {word3: Count(word3)}}
word1_word2 = ('[START]', 'oh')
followers = model[word1_word2]
# Build the label from the context actually queried so the message cannot
# drift out of sync with word1_word2.
print("The highest count after '{}, {}' is: ".format(*word1_word2), max(followers, key=followers.get))
for word3 in followers:
    print(word1_word2, word3, followers[word3])

The highest count after 'you, just' is:  ,
('[START]', 'oh') , 1415
('[START]', 'oh') no 5
('[START]', 'oh') my 46
('[START]', 'oh') how 1
('[START]', 'oh') god 11
('[START]', 'oh') . 140
('[START]', 'oh') yeah 19
('[START]', 'oh') right 2
('[START]', 'oh') dropped 1
('[START]', 'oh') on 1
('[START]', 'oh') well 3
('[START]', 'oh') coopie 1
('[START]', 'oh') hey 1
('[START]', 'oh') and 2
('[START]', 'oh') hi 3
('[START]', 'oh') listen 1
('[START]', 'oh') really 1
('[START]', 'oh') look 2
('[START]', 'oh') shit 6
('[START]', 'oh') fuck 1
('[START]', 'oh') ! 172
('[START]', 'oh') harsh 1
('[START]', 'oh') ohh 1
('[START]', 'oh') all 1
('[START]', 'oh') that 1
('[START]', 'oh') simon 1
('[START]', 'oh') ugly 1
('[START]', 'oh') : 1
('[START]', 'oh') hello 1
('[START]', 'oh') thanks 1
('[START]', 'oh') cute 1
('[START]', 'oh') oh 3
('[START]', 'oh') wait 1
('[START]', 'oh') love 1
('[START]', 'oh') we 2
('[START]', 'oh') were 1
('[START]', 'oh') yes 3
('[START]', 'oh') now 2
('[START]', 'o

In [145]:
from collections import defaultdict
def trigramPredict(scripts, word, length, alpha):
    """Generate tokens from a smoothed trigram model with bigram back-off.

    The second token is sampled from the bigram successors of ``word``.
    After that, each token is sampled from the successors of the last two
    tokens; when that pair has no recorded successor, the bigram successors
    of the last token are used instead.  Sampling probabilities are
    ``(count + alpha) / (total + alpha * k)`` over the ``k`` observed
    successors.

    Args:
        scripts: Token list both count models are built from.
        word: Single seed token (used as-is, not split).
        length: Number of sentence-ending tokens ('.', '!', '?') to generate
            inside the loop before stopping; the token sampled right after
            the seed is not counted.
        alpha: Additive smoothing constant applied to every observed count.

    Returns:
        The seed followed by the sampled tokens.  The list is shorter than
        requested when a token with no observed successor is reached (this
        used to raise).
    """
    bigramModel = createBigramCount(scripts)
    trigramModel = createTrigramCount(scripts)
    punctuations = ('.', '!', '?')

    def sampleNext(successors):
        # Draw one successor with additive smoothing; None signals a dead end.
        if not successors:
            return None
        # Compute the normalizer once instead of once per successor.
        denominator = sum(successors.values()) + alpha * len(successors)
        probabilities = [float(c + alpha) / denominator for c in successors.values()]
        return np.random.choice(list(successors), 1, p=probabilities)[0]

    sentence = [word]
    # A trigram context needs two tokens, so the second one comes from the bigram model.
    secondToken = sampleNext(bigramModel[word])
    if secondToken is None:
        return sentence
    sentence.append(secondToken)

    count = 0
    while count < length:
        successors = trigramModel[(sentence[-2], sentence[-1])]
        if len(successors) == 0:
            # Back off to the bigram model when the pair was never continued.
            successors = bigramModel[sentence[-1]]
        randomToken = sampleNext(successors)
        if randomToken is None:
            break
        if randomToken in punctuations:
            count += 1
        sentence.append(randomToken)

    return sentence

In [183]:
def generate(ls):
    """Join a generated token list into a readable, capitalized string.

    ``ls[0]`` is skipped (the calls in this file pass the '[START]' seed
    there) and later '[START]' tokens are dropped.  Tokens after '.', '!'
    or '?' and the token 'i' are capitalized; ',', ':', '.', '!', '?',
    tokens beginning with an apostrophe and "n't" are attached without a
    leading space.

    Args:
        ls: List of string tokens.

    Returns:
        The assembled text, or an empty string when ``ls`` has fewer than
        two tokens (this used to raise IndexError).
    """
    if len(ls) < 2:
        return ""

    sentenceEnders = ('.', '!', '?')
    attached = (',', ':', '.', '!', '?')

    output = ls[1].capitalize()
    for token in ls[2:]:
        if token == '[START]':
            continue
        if len(output) == 1:
            # NOTE(review): a one-character output gets the next token glued
            # on with no space, so 'i' + 'am' becomes "IAm" — confirm intent.
            output += token.capitalize()
        elif output.endswith(sentenceEnders) or token == 'i':
            # endswith() is also safe when output is still empty.
            output += " " + token.capitalize()
        elif token in attached or token.startswith("'") or token == "n't":
            output += token
        else:
            output += " " + token
    return output

# Sample text from each corpus with both models, using the same seed,
# sentence count and smoothing constant for every call.
seedToken, sentenceCount, smoothing = '[START]', 2, 0.1

print("### Bigram ###")
for corpus in (badMovieScripts, goodMovieScripts):
    print(generate(bigramPredict(corpus, seedToken, sentenceCount, smoothing)))
print()
print("### Trigram ###")
for corpus in (badMovieScripts, goodMovieScripts):
    print(generate(trigramPredict(corpus, seedToken, sentenceCount, smoothing)))

### Bigram ###
Just sittin' me a conscience? Skeet shooting her balance, what I speak it's the same.
Wouldn't worry about is the entire shrimping boats. But I do you got first catch the sewers.

### Trigram ###
Relax, becca, get it. Alone in the world.
And we still have hope for them. My baby!
