# Network Trainer

## Importing Tensorflow and Keras

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from collections import Counter

## Training data layout

pulled from a CSV with columns for 

* title of movie

* the full text of the script

* individual trope membership

An entry of one means that the movie contains that trope on TV tropes, zero otherwise.

In [2]:
movie_df = pd.read_csv('tropes_by_movie.csv')
movie_df.head()

Unnamed: 0,Title,Text,"""arabian nights"" days","""awesome mccool"" name","""bang!"" flag gun","""be quiet!"" nudge","""billy elliot"" plot","""blind idiot"" translation","""could have avoided this!"" plot","""dear john"" letter",...,youtube poop,zack snyder,zany scheme,zeerust,zeerust canon,zerg rush,zeroth law rebellion,zettai ryouiki,zig-zagging trope,übermensch
0,A Few Good Men\n,b' A FEW ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aladdin\n,b'ALADDIN: THE COMPLETE SCRIPT\nCOMPILED BY B...,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Alien\n,b' \n\n\n\n \n\n\n\n ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Almost Famous\n,b' UNTITLED\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Amadeus\n,"b' \n ""A...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Each script is broken down into a list of words

All punctuation is removed

**TODO:** remove newline n's

In [3]:
movie_scripts_raw = list(movie_df['Text'])
text_processor = lambda text: keras.preprocessing.text.text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
scripts_list = [text_processor(script) for script in movie_scripts_raw]

print(scripts_list[1][500:520])

['n', 'on', 'his', 'shoulder', 'gazeem', 'comes', 'riding', 'up', 'to', 'the', 'pair', 'n', 'njafar', 'you', 'are', 'late', 'ngazeem', 'a', 'thousand', 'apologies']


## These words are aggregated, counted, and assigned unique ids

Words than less than 5 uses are thrown out.

In [4]:
all_words = list(set(sum(scripts_list, [])))
all_words_counter = Counter(sum(scripts_list, []))
sifted_words = [word for word in all_words if all_words_counter[word] > 5]
word_to_id = {v: i for i, v in enumerate(sifted_words)}
id_to_word = {i: v for i, v in enumerate(sifted_words)}


len(word_to_id), len(id_to_word)

(16834, 16834)

## Each movie is assigned a membership (one-hot) list

In [5]:
padding = len(word_to_id)

new_scripts_list = []
for script in scripts_list:
    s = [0]*padding
    for word in script:
        if word in word_to_id:
            s[word_to_id[word]] = 1
    #s = [ss/(sum(s)) for ss in s]
    new_scripts_list.append(s)
    
scripts_list = np.array(new_scripts_list)
scripts_list[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Tropes are filtered by frequency

Tropes with frequency between 5 and 20 are are generally characteristic of the movie it comes from without being too specific or broad.

This range will change depending on the *Kettani Valence* of the dataset.

In [6]:
movie_tropes = movie_df.columns.values[2:]
dropped_tropes = [trope for trope in movie_tropes if (movie_df[trope].sum() not in range(5, 20) )]
filtered_movie_df = movie_df.drop(dropped_tropes, axis=1)
trope_count = len([1 for trope in movie_tropes if (movie_df[trope].sum() in range(5, 20) )])
trope_arrays = np.array(filtered_movie_df.iloc[:,2:])
filtered_movie_df.head()

Unnamed: 0,Title,Text,"""the reason you suck"" speech","""well done, son!"" guy","""what now?"" ending","""where are they now?"" epilogue",a date with rosie palms,a god am i,abusive parents,"action film, quiet drama scene",...,wounded gazelle gambit,writers cannot do math,wrong genre savvy,you are better than you think you are,you can't fight fate,you have outlived your usefulness,you have to believe me!,you killed my father,you monster!,your cheating heart
0,A Few Good Men\n,b' A FEW ...,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aladdin\n,b'ALADDIN: THE COMPLETE SCRIPT\nCOMPILED BY B...,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
2,Alien\n,b' \n\n\n\n \n\n\n\n ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Almost Famous\n,b' UNTITLED\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Amadeus\n,"b' \n ""A...",0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Building the Model

## Building an autoencoder-like neural network

In [7]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(padding,)),
    keras.layers.Dense(1000, activation=tf.nn.relu),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dense(1000, activation=tf.nn.relu),
    keras.layers.Dense(trope_count, activation="sigmoid")
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.


## Training the Model

In [8]:
model.fit(scripts_list, trope_arrays, epochs=100)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<tensorflow.python.keras.callbacks.History at 0x1ebb599d240>

# Testing the model

## Opening a test script

In [9]:
with open('raiders.txt', 'r') as f:
    test_text = f.read()

test_text_clean = text_processor(test_text)
print(test_text_clean[:10])

['raiders', 'of', 'the', 'lost', 'ark', 'screenplay', 'by', 'lawrence', 'kasdan', 'story']


## Making the membership list for the movie

In [10]:
test_array = []
for script in [test_text_clean]:
    s = [0]*padding
    for word in script:
        if word in word_to_id:
            s[word_to_id[word]] = 1
    test_array.append(s)
test_array = np.array(test_array)

## Using the model on the new script

In [11]:
predictions = model.predict(test_array)

## Save the top 10 predicted tropes to a results CSV

Outputs confidence level alongside trope

In [12]:
trope_list = filtered_movie_df.columns.values[2:]

L = sorted([(trope_list[i], trope) for i, trope in enumerate((predictions[0]*100).astype('uint32'))], key=lambda x: x[1], reverse=True)[:10]
L = [' - '.join([str(ll) for ll in l]) for l in L]
df_out = pd.DataFrame({'Results': L})
df_out.to_csv(r'Results.csv', index=None)
df_out

Unnamed: 0,Results
0,ask a stupid question... - 85
1,jerkass has a point - 74
2,gory discretion shot - 73
3,all there in the manual - 68
4,fanservice - 65
5,chekhov's skill - 63
6,never trust a trailer - 59
7,nothing is scarier - 59
8,fate worse than death - 58
9,heroic sacrifice - 56


## Save the model

In [13]:
model.save('TropeClassifier_.model')