# Accessing the Files

In [210]:
import os
import random

import nltk
from nltk import ngrams
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of word characters (\w+ matches letters, digits and underscore).
# Punctuation and whitespace are dropped, so contractions are split at the apostrophe.
tokenizer = RegexpTokenizer(r'\w+')

# Get the number format such as: 001, 002, 010, 100
def getNum(i):
    """Return ``i`` as a string left-padded with zeros to at least three characters.

    Parameters:
        i: the file index (anything ``str()`` can render; an int in this notebook).

    Returns:
        The zero-padded string, e.g. ``1 -> "001"``, ``10 -> "010"``, ``100 -> "100"``.
        Values that already need more than three characters are returned unpadded
        (``1000 -> "1000"``) instead of the empty string.
    """
    return str(i).zfill(3)

# Return the tokenized text of the transcripts and the raw text of the transcripts
def getMovieTranscripts(low = True):
    """Load every transcript of one rating group and return its tokens and raw text.

    Files are looked up as ``data/<low|high>-rated/NNN-transcript.txt`` for
    NNN = 001..100, relative to the current working directory. Indices with no
    file on disk are skipped silently.

    Parameters:
        low: truthy selects the "low-rated" folder, falsy the "high-rated" one.

    Returns:
        (movieScripts, text) where ``movieScripts`` is one flat list holding the
        tokens of all transcripts in file order, and ``text`` is the lowercased,
        whitespace-normalized content of all transcripts concatenated.
    """
    movieScores = "low" if low else "high"
    # os.path.join keeps the lookup working on non-Windows systems as well.
    folder = os.path.join('data', movieScores + '-rated')

    movieScripts = []
    textParts = []
    # Taking advantage of the naming convention for the files, iterate through them.
    for i in range(1, 101):
        path = os.path.join(folder, getNum(i) + '-transcript.txt')
        if not os.path.isfile(path):
            continue
        # The context manager closes the handle even if reading fails.
        with open(path, 'r') as f:
            content = f.read()
        # Normalize the whitespace and lowercase everything.
        newContent = " ".join(content.lower().split())
        textParts.append(newContent)
        movieScripts += tokenizer.tokenize(newContent)

    # NOTE(review): transcripts are joined with no separator (unchanged behavior), so
    # the last word of one file runs into the first word of the next in `text`.
    return movieScripts, "".join(textParts)

# Low-rated ("bad") corpus: flat token list plus the concatenated raw text.
badMovieScripts, badText = getMovieTranscripts(True)
badTokenCount = len(badMovieScripts)
print("Number of Tokenized Words for Bad Movies:", badTokenCount)

# High-rated ("good") corpus, loaded the same way.
goodMovieScripts, goodText = getMovieTranscripts(False)
goodTokenCount = len(goodMovieScripts)
print("Number of Tokenized Words for Good Movies:", goodTokenCount)

Number of Tokenized Words for Bad Movies: 415959
Number of Tokenized Words for Good Movies: 395602


# Bigram Model

In [270]:
from collections import defaultdict

def createBigramCount(scripts):
    """Count how often each token is directly followed by each other token.

    Parameters:
        scripts: sequence of tokens, in document order.

    Returns:
        A nested defaultdict ``{word1: {word2: count}}`` where ``count`` is the
        number of times ``word2`` immediately follows ``word1``. Looking up a
        missing key with ``[]`` yields an empty inner dict / a count of 0 (and
        inserts that key); use ``.get`` to probe without inserting.
    """
    model = defaultdict(lambda: defaultdict(int))
    # A single pass is enough: ngrams() is a one-shot generator, so wrapping this
    # in another loop over `scripts` only added idle iterations.
    for word1, word2 in ngrams(scripts, 2):
        model[word1][word2] += 1
    return model

# Bigram counts for the low-rated corpus.
# Layout: {word1: {word2: number of times word2 directly follows word1}}
model = createBigramCount(badMovieScripts)

word1 = 'oh'
followers = model[word1]
print("The highest count after 'oh' is: ", max(followers, key=followers.get))
# List every observed continuation of word1 with its count.
for word2, count in followers.items():
    print(word1, word2, count)

The highest count after 'oh' is:  my
oh shit 50
oh my 240
oh god 76
oh boy 13
oh man 35
oh hey 27
oh look 15
oh you 53
oh babe 1
oh a 6
oh hell 12
oh no 126
oh carrie 1
oh baby 7
oh what 18
oh where 3
oh i 116
oh good 11
oh come 29
oh damn 6
oh yeah 148
oh do 6
oh yes 28
oh she 5
oh that 50
oh jesus 4
oh wonderful 1
oh so 13
oh this 13
oh okay 14
oh it 35
oh son 1
oh and 17
oh whoa 4
oh archie 1
oh they 3
oh finally 1
oh by 6
oh kahuna 1
oh honey 7
oh how 7
oh jeanette 1
oh these 1
oh right 11
oh shucks 1
oh dropped 1
oh resistance 1
oh oh 86
oh excuse 6
oh the 7
oh he 9
oh on 1
oh well 12
oh coopie 1
oh about 1
oh hi 15
oh listen 2
oh really 12
oh lovers 1
oh thanks 5
oh fuck 3
oh poopoo 1
oh poo 1
oh ah 3
oh sweet 2
oh snap 4
oh morgan 1
oh l 5
oh wow 6
oh never 1
oh not 4
oh crap 3
oh gamsie 1
oh 1167 1
oh please 9
oh 1174 1
oh krishna 1
oh dax 1
oh 1478 1
oh darlin 2
oh but 10
oh chuckles 1
oh girl 1
oh if 4
oh very 3
oh great 10
oh bring 1
oh lex 1
oh sorry 7
oh c 1
oh thank 9
oh 

In [325]:
def bigramPredict(scripts, word, length):
    """Greedily extend a seed with the most frequent bigram continuation.

    Builds the bigram counts for ``scripts`` and then, up to ``length`` times,
    appends the token that most often follows the current last token.

    Parameters:
        scripts: sequence of tokens used to build the bigram counts.
        word: seed string; it is split on whitespace and the last piece is the
            starting context.
        length: maximum number of tokens to append.

    Returns:
        The list of seed tokens followed by the predicted tokens. The list is
        shorter than ``len(seed) + length`` when the current last token has no
        recorded follower, and is empty when the seed has no tokens.
    """
    bigramModel = createBigramCount(scripts)
    sentence = word.split()
    if not sentence:
        # Nothing to condition on.
        return sentence
    for _ in range(length):
        # .get() avoids inserting unseen words into the defaultdict.
        followers = bigramModel.get(sentence[-1])
        if not followers:
            # Dead end: stop instead of letting max() raise on an empty dict.
            break
        sentence.append(max(followers, key=followers.get))
    return sentence

print(bigramPredict(badMovieScripts, 'oh', 11))

['oh', 'my', 'god', 'i', 'm', 'not', 'a', 'little', 'bit', 'of', 'the', 'hell']


# Trigram Model

In [237]:
def createTrigramCount(scripts):
    """Count how often each token follows each pair of consecutive tokens.

    Parameters:
        scripts: sequence of tokens, in document order.

    Returns:
        A nested defaultdict ``{(word1, word2): {word3: count}}`` where ``count``
        is the number of times ``word3`` immediately follows the pair. Looking up
        a missing key with ``[]`` yields an empty inner dict / a count of 0 (and
        inserts that key); use ``.get`` to probe without inserting.
    """
    model = defaultdict(lambda: defaultdict(int))
    # A single pass is enough: ngrams() is a one-shot generator, so wrapping this
    # in another loop over `scripts` only added idle iterations.
    for word1, word2, word3 in ngrams(scripts, 3):
        model[(word1, word2)][word3] += 1
    return model

# Trigram counts for the low-rated corpus.
# Layout: {(word1, word2): {word3: number of times word3 directly follows the pair}}
model = createTrigramCount(badMovieScripts)

word1_word2 = ('you', 'just')
followers = model[word1_word2]
print("The highest count after 'you, just' is: ", max(followers, key=followers.get))
# List every observed continuation of the pair with its count.
for word3, count in followers.items():
    print(word1_word2, word3, count)

The highest count after 'you, just' is:  got
('you', 'just') got 8
('you', 'just') walk 1
('you', 'just') don 4
('you', 'just') can 2
('you', 'just') name 1
('you', 'just') get 4
('you', 'just') gonna 1
('you', 'just') wish 1
('you', 'just') say 3
('you', 'just') drive 1
('you', 'just') wait 1
('you', 'just') have 2
('you', 'just') do 3
('you', 'just') kept 1
('you', 'just') said 3
('you', 'just') keep 1
('you', 'just') found 1
('you', 'just') never 2
('you', 'just') heard 1
('you', 'just') shut 1
('you', 'just') go 1
('you', 'just') wrecked 1
('you', 'just') fart 1
('you', 'just') hear 1
('you', 'just') kissed 1
('you', 'just') pop 1
('you', 'just') had 2
('you', 'just') you 1
('you', 'just') leave 5
('you', 'just') let 2
('you', 'just') stuff 1
('you', 'just') passing 1
('you', 'just') stay 1
('you', 'just') come 1
('you', 'just') give 2
('you', 'just') listen 2
('you', 'just') taik 1
('you', 'just') stop 1
('you', 'just') iove 1
('you', 'just') wanted 1
('you', 'just') the 1
('you',

In [329]:
from collections import defaultdict
def trigramPredict(scripts, word, length):
    """Greedily extend a seed using trigram counts, backing off to bigram counts.

    The first appended token comes from the bigram counts (continuation of the
    seed's last token). After that, up to ``length`` further tokens are appended:
    each is the most frequent continuation of the last two tokens, or, when that
    pair was never followed by anything, of the last token alone.

    Parameters:
        scripts: sequence of tokens used to build both count tables.
        word: seed string; it is split on whitespace.
        length: maximum number of tokens appended after the initial bigram step.

    Returns:
        The list of seed tokens followed by the predicted tokens. Generation
        stops early when neither table has a continuation; an empty seed yields
        an empty list.
    """
    # Both tables are built once up front; the bigram table is reused for every
    # back-off step instead of being rebuilt each time.
    trigramModel = createTrigramCount(scripts)
    bigramModel = createBigramCount(scripts)

    sentence = word.split()
    if not sentence:
        return sentence

    # First step: only one token of context is guaranteed, so use the bigram table.
    followers = bigramModel.get(sentence[-1])
    if not followers:
        return sentence
    sentence.append(max(followers, key=followers.get))

    for _ in range(length):
        # .get() avoids inserting unseen contexts into the defaultdicts; an
        # empty or missing trigram entry falls through to the bigram table.
        followers = (trigramModel.get((sentence[-2], sentence[-1]))
                     or bigramModel.get(sentence[-1]))
        if not followers:
            # Dead end in both tables: stop rather than raise.
            break
        sentence.append(max(followers, key=followers.get))
    return sentence


# Generate text from a seed token drawn at random from each corpus.
# `random` comes from the notebook's import cell.
# NOTE(review): no random seed is set, so the seed token (and the output) differs per run.
result = trigramPredict(badMovieScripts, random.choice(badMovieScripts), 20)
# Each token is prefixed with a space, so the string starts with one (same as before).
output1 = "".join(" " + token for token in result)
print(output1)

result2 = trigramPredict(goodMovieScripts, random.choice(goodMovieScripts), 20)
output2 = "".join(" " + token for token in result2)
print(output2)

 fucking fucking knees on your head now rub them both in circles put your hand on your head now rub them both
 on the other side of the world is changed i remember the last time i ve got to be a little more
