In [1]:
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
## Help Python find our packages
import sys
sys.path.append('..')

import json
import numpy as np
import matplotlib.pyplot as plt
import domain_scoring.domain_scoring as domain_scoring

# Randomness
import random as rn
import tensorflow as tf
import os

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences

## Load the data
Load the rated meta-path dataset (a JSON export) that the rest of the notebook works with.

In [5]:
# Relative path to the rated-dataset JSON export analysed in this notebook.
path = '../rated_datasets/Rotten Tomato_Potato_1519142479.127663.json'

In [6]:
# Read the dataset inside a context manager so the file handle is closed
# as soon as parsing is done instead of lingering for the kernel's lifetime.
with open(path, "r", encoding="utf8") as dataset_file:
    data = json.load(dataset_file)

In [7]:
# Gather the selected type names: edge types first, then node types.
# Each selection entry is a sequence whose first element is the type name.
type_selection = data["edge_type_selection"] + data["node_type_selection"]
types = [selection[0] for selection in type_selection]
types

['ACTED_IN', 'PRODUCED', 'DIRECTED', 'WROTE', 'Person', 'Movie']

## Data extraction
Define functions for extraction and extract the data we need

In [8]:
def extract_mps(data, group_size=6):
    """Group the entries of ``data["meta_paths"]`` into batches.

    The entries are walked in order. Every entry whose position is a multiple
    of ``group_size`` acts as a batch boundary: the batch collected so far is
    stored (if it is non-empty) and the boundary entry itself is discarded.
    Every other entry is added to the current batch unless it contains a
    ``'time_to_rate'`` key.

    Args:
        data: Mapping with a ``"meta_paths"`` list of dict entries.
        group_size: Distance between two batch boundaries. The default of 6
            reproduces the previously hard-coded behaviour.

    Returns:
        A list of non-empty batches, each a list of the kept entries.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError('group_size must be at least 1')

    entries = data["meta_paths"]
    batches = []
    batch = []
    for position, entry in enumerate(entries):
        if position % group_size == 0:
            # Boundary entry: close the running batch, but never store empty ones.
            if batch:
                batches.append(batch)
                batch = []
        elif 'time_to_rate' not in entry:
            batch.append(entry)
    # The last batch is not followed by a boundary, so flush it explicitly.
    if batch:
        batches.append(batch)
    # NOTE(review): this figure is derived from the raw entry count, not from
    # the number of entries actually kept (sum(len(b) for b in batches));
    # confirm that it is the intended statistic.
    print('#meta-paths:', len(entries) - len(batches) - 1)
    return batches

In [9]:
def construct_graph(batches):
    """Build a ``MetaPathRatingGraph`` from batches of rated meta-paths.

    Within each batch every ordered pair of distinct entries is inspected.
    Whenever the first entry's rating is lower than or equal to the second's,
    a user rating is added with the higher-or-equal rated meta-path passed
    first and the rating difference passed as ``distance``. Entries with equal
    ratings are therefore added in both orders with a distance of 0.

    Args:
        batches: Iterable of batches; each entry is a dict with the keys
            ``'rating'`` (convertible to float) and ``'metapath'``.

    Returns:
        The populated ``MetaPathRatingGraph``.
    """
    from util.datastructures import MetaPathRatingGraph, MetaPath

    rating_graph = MetaPathRatingGraph()
    for batch in batches:
        for lower in batch:
            for higher in batch:
                # Identity (not equality) check: never pair an entry with itself.
                if lower is higher:
                    continue
                lower_rating = float(lower['rating'])
                higher_rating = float(higher['rating'])
                if not lower_rating <= higher_rating:
                    continue
                rating_graph.add_user_rating(
                    MetaPath.from_list(higher['metapath']),
                    MetaPath.from_list(lower['metapath']),
                    distance=higher_rating - lower_rating)
    return rating_graph

In [10]:
def clean_up(data):
    """Turn the raw dataset into rating batches plus the graph built from them.

    The batch extraction step is where the ``time_to_rate`` entries are
    removed from the array of ratings.

    Returns:
        A tuple ``(batches, graph)``.
    """
    rating_batches = extract_mps(data)
    rating_graph = construct_graph(rating_batches)
    return rating_batches, rating_graph

In [11]:
# Build the rating batches and the rating graph from the loaded dataset.
batches, graph = clean_up(data)

#meta-paths: 51


## Preprocessing
Define functions for preprocessing and preprocess the data for training

In [12]:
def to_sequence(in_list):
    """Reshape a collection of values into a column of shape ``(n, 1)``.

    Args:
        in_list: NumPy array or any array-like (e.g. a plain list). It is
            converted with ``np.asarray`` first, so real lists work too;
            arrays are passed through without copying.

    Returns:
        A NumPy array with one value per row.
    """
    return np.asarray(in_list).reshape(-1, 1)

In [24]:
def one_hot(in_sequence, distinct_values):
    """One-hot encode a sequence of integer indices.

    Args:
        in_sequence: Sized iterable of indices (ints or index arrays), one per row.
        distinct_values: Width of the encoding, i.e. number of columns.

    Returns:
        A float array of shape ``(len(in_sequence), distinct_values)`` that is
        zero everywhere except for a 1 at each row's index position(s).
    """
    encoded = np.zeros((len(in_sequence), distinct_values))
    # Iterating a 2-D array yields row views, so writing to `row` fills `encoded`.
    for row, point in zip(encoded, in_sequence):
        row[point] = 1
    return encoded

In [25]:
def preprocess_raw(raw, types, max_len=9):
    """Encode pairs of meta-paths as one-hot sequences.

    For every pair in ``raw`` both meta-paths are label-encoded (shifted by one
    so that 0 stays free for padding), post-padded with zeros to ``max_len``,
    joined into a single sequence and one-hot encoded.

    Args:
        raw: Iterable of pairs; both members must offer ``as_list()``.
        types: All type labels that can occur in a meta-path.
        max_len: Length each meta-path is padded to.

    Returns:
        A list with one array per pair, produced by ``one_hot`` with
        ``len(types) + 1`` columns.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(types)
    # One extra column because index 0 is reserved for padding.
    n_columns = len(types) + 1

    def encode(meta_path):
        # Integer labels start at 1; 0 marks padding.
        return np.array(encoder.transform(meta_path.as_list())) + 1

    samples = []
    for first, second in raw:
        # Pad both meta-paths to the same length.
        padded_first, padded_second = pad_sequences(
            [encode(first), encode(second)], max_len, padding='post', value=0)
        merged = np.append(padded_first, padded_second)
        # One-hot encode because we don't have distances/embeddings.
        samples.append(one_hot(to_sequence(merged), n_columns))
    return samples

In [26]:
# Use methods from our own domain scoring module as this will finally implement our model.
# Use methods from our own domain scoring module, since the final model will be implemented there.
domain_score = domain_scoring.DomainScoring()
# Extract data (x) and labels (y) from the rating graph.
# NOTE(review): `_extract_data_labels` is a private helper (leading underscore). `x` is later
# unpacked as pairs of meta-paths in `preprocess_raw` - confirm the exact format in the module.
x, y = domain_score._extract_data_labels(graph)

In [None]:
# TODO: Add further features:
# 1. Neighbor node types in graph schema
# 2. Length of the meta-path
# 3. Number of instances?

In [27]:
# Encode every extracted pair as a padded, one-hot encoded sequence (see `preprocess_raw`).
x_preprocess = preprocess_raw(x, types)

## Setup
Setup the training process

In [28]:
# Fraction of the samples that is held out as the test split.
test_size = 0.3
# Seed shared by the train/test split and by `reset_seed`.
random_state = 42

In [29]:
# Make sure we have reproducible results.
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here cannot change hashing in the already-running kernel (it is only
# inherited by child processes). Export it before launching Jupyter if
# deterministic hashing is required.
os.environ['PYTHONHASHSEED'] = '0'


def reset_seed(seed=None):
    """Seed NumPy, Python's ``random`` module and TensorFlow.

    Args:
        seed: Seed to apply. Defaults to the notebook-level ``random_state``.
    """
    if seed is None:
        seed = random_state
    np.random.seed(seed)
    rn.seed(seed)
    # TensorFlow 1.x exposes `tf.set_random_seed`; TensorFlow 2.x only
    # provides `tf.random.set_seed`, so fall back to it when needed.
    if hasattr(tf, 'set_random_seed'):
        tf.set_random_seed(seed)
    else:
        tf.random.set_seed(seed)

In [30]:
# Split into train and test data. Shuffling is seeded with `random_state`,
# so the same split is produced on every run.
x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(x_preprocess, y,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    shuffle=True)

In [31]:
# Sanity check of the training tensor: (samples, sequence length, one-hot width).
np.array(x_p_train).shape

(131, 18, 7)

## Train
Train the model, then evaluate it on the held-out test split.

In [32]:
# Reproducible results
reset_seed()

# Convert the Python lists to arrays once and reuse them below.
x_train_arr = np.array(x_p_train)
y_train_arr = np.array(y_p_train)

# Use sequence classification (RNN/LSTM)
model_rnn = Sequential()
# Derive (sequence length, one-hot width) from the data instead of hard-coding
# (18, 7), so the model keeps working when `max_len` or the type list changes.
model_rnn.add(SimpleRNN(128, input_shape=x_train_arr.shape[1:]))
model_rnn.add(Dropout(0.5))
# Single sigmoid unit: binary decision for each encoded pair.
model_rnn.add(Dense(1, activation='sigmoid'))

model_rnn.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model_rnn.fit(x_train_arr, y_train_arr, batch_size=35, epochs=25, callbacks=[])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f1bceaf29e8>

In [33]:
# Evaluate on the held-out test split; the result lists the loss followed by
# the metrics configured in `compile` (here: accuracy).
model_rnn.evaluate(np.array(x_p_test), np.array(y_p_test), batch_size=35)



[0.5171718952948587, 0.8245614223312914]