In [None]:
# CAP 6619-002
# Movie Classifier
# Dr. Zhu
# Michael Cuomo

In [8]:
import os
import numpy
import tensorflow as tf
import matplotlib.pyplot as plt

import collections
import pathlib
import re
import csv
import random

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization


In [61]:
directory = os.fsencode(".\\data\\keywords")
movie_names = []
for file in os.listdir(directory):
    movie_names.append(os.fsdecode(file))

print(movie_names[:20])

['10_Things_I_Hate_About_You.txt', '12.txt', '127_Hours.txt', '12_and_Holding.txt', '12_Monkeys.txt', '12_Years_a_Slave.txt', '1492_Conquest_of_Paradise.txt', '15_Minutes.txt', '17_Again.txt', '187.txt', '2001_A_Space_Odyssey.txt', '2012.txt', '28_Days_Later.txt', '30_Minutes_or_Less.txt', '44_Inch_Chest.txt', '48_Hrs.txt', '50-50.txt', '500_Days_of_Summer.txt', '8MM.txt', '9.txt']


In [2]:
def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])


In [3]:
def create_encoder():
    VOCAB_SIZE = 50000
    encoder = tf.keras.layers.TextVectorization(max_tokens = VOCAB_SIZE)
    dataset = tf.data.TextLineDataset(filenames = list(map(lambda x: f".\\data\\scripts\\{x}", movie_names)))
    for line in dataset.take(5):
        print(line.numpy())
    encoder.adapt(dataset.batch(1024))
    return encoder

In [4]:
def get_existing_encoder(vocab_size = 50000):
    if vocab_size != 50000 and vocab_size != 10000 and vocab_size != 5000:
        raise ValueError("Vocab of that size does not exist")
    encoder = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
    f = open(f".\\data\\vocabs\\vocab_{vocab_size}.txt", "r")
    vocab = f.read()
    f.close()

    encoder.set_vocabulary(vocab.split(","))
    return encoder

In [5]:
directory = os.fsencode(".\\data\\keywords")
movie_names = []
for file in os.listdir(directory):
    movie_names.append(os.fsdecode(file))

print(movie_names[:20])

['10_Things_I_Hate_About_You.txt', '12.txt', '127_Hours.txt', '12_and_Holding.txt', '12_Monkeys.txt', '12_Years_a_Slave.txt', '1492_Conquest_of_Paradise.txt', '15_Minutes.txt', '17_Again.txt', '187.txt', '2001_A_Space_Odyssey.txt', '2012.txt', '28_Days_Later.txt', '30_Minutes_or_Less.txt', '44_Inch_Chest.txt', '48_Hrs.txt', '50-50.txt', '500_Days_of_Summer.txt', '8MM.txt', '9.txt']


In [13]:
for movie in movie_names:
    with open(f".\\data\\scripts_old\\{movie}", 'r') as r, open(f'.\\data\\scripts\\{movie}', 'w', encoding = 'utf8') as o: 
        for line in r: 
            if line.strip(): 
                o.write(line.strip() +'\n') 

In [6]:
def createSuccinctOutputVectorModel(encoder = get_existing_encoder(50000)):
    model = tf.keras.models.Sequential()
    model.add(encoder)
    model.add(tf.keras.layers.Embedding(input_dim = len(encoder.get_vocabulary()), output_dim = 1024, mask_zero = True))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)))
    model.add(tf.keras.layers.Dense(24000, activation = 'relu'))
    model.add(tf.keras.layers.Dense(17466))
    return model


In [9]:
def succintGenerator(seed = 42, train_split = 0.75, is_train = True):
    for movie in movie_names:
        s = open(f".\\data\\scripts\\{movie}", "r")
        script_lines = random.Random(seed).shuffle(s.read().split("\n"))
        s.close()
        if is_train:
            script_lines = script_lines[:int(len(script_lines) * train_split) + 1]
        else:
            script_lines = script_lines[-(int(len(script_lines) * train_split) + 1):]
        k = open(f".\\data\\vectorized_keywords_succinct\\{movie}", "r")
        keywords = k.read().split(",")
        k.close()
        for line in script_lines:
            yield line, keywords

In [10]:
def trainSuccintGenerator():
    yield succintGenerator(seed = 42, train_split = 0.75, is_train = True)

In [11]:
def testSuccintGenerator():
    yield succintGenerator(seed = 42, train_split = 0.75, is_train = False)

In [12]:
def getSuccinctTrainDataset():
    dataset = tf.data.Dataset.from_generator(trainSuccintGenerator, output_signature = (
        tf.TensorSpec(shape = (), dtype = tf.string),
        tf.TensorSpec(shape = (17466,), dtype = tf.int8)))
    return dataset

In [13]:
def getSuccinctTestDataset():
    dataset = tf.data.Dataset.from_generator(testSuccintGenerator, output_signature = (
        tf.TensorSpec(shape = (), dtype = tf.string),
        tf.TensorSpec(shape = (17466,), dtype = tf.int8)))
    return dataset

In [18]:
def parse_map(data, labels):
    return data, tf.stack(labels, axis=0)

In [14]:
model = createSuccinctOutputVectorModel(get_existing_encoder(50000))

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
train = getSuccinctTrainDataset()
test = getSuccinctTestDataset()

In [19]:
history = model.fit(train.map(pa), epochs=1000,
                    validation_data=test,
                    validation_steps=3000)


TypeError: parse_map() missing 1 required positional argument: 'labels'

In [34]:
f = open(f".\\data\\vocabs\\vocab_50000.txt", "w")
f.write(str(encoder.get_vocabulary()))
f.close()

In [28]:
encoder_10000 = encoder

In [22]:
encoder_5000 = encoder

In [27]:
print(len(encoder.get_vocabulary()))
print(len(encoder_5000.get_vocabulary()))

10000
5000


In [15]:
y_labels = []

keywords_directory = os.fsencode(".\\data\\keywords")
for file in os.listdir(keywords_directory):
    f = open(f".\\data\\keywords\\{os.fsdecode(file)}", "r")
    keywords = f.read()
    f.close()
    keyword_list = keywords.split(",")
    while len(keyword_list) < 50:
        keyword_list.append("not-a-keyword")
    y_labels.append(keyword_list)

y_labels = tf.keras.utils.to_categorical(y_labels, dtype = "string")
print(y_labels)

    

ValueError: invalid literal for int() with base 10: 'protective-father'