In [1]:
from keras.models import Sequential
from keras.layers import *

Using TensorFlow backend.


In [2]:
## Help Python find our packages
import sys
sys.path.append('..')

import json
import numpy as np
import matplotlib.pyplot as plt
import domain_scoring.domain_scoring as domain_scoring

# Randomness
import random as rn
import tensorflow as tf
import os

In [3]:
path = '../rated_datasets/Rotten_Tomato_Merlin_1519148528.2417703.json'

In [4]:
potato = json.load(open(path, "r", encoding="utf8"))

In [5]:
type_selection = potato["edge_type_selection"] + potato["node_type_selection"]
types = []
for pair in type_selection:
    types.append(pair[0])
types

['PRODUCED', 'DIRECTED', 'WROTE', 'ACTED_IN', 'Person', 'Movie']

In [6]:
def extract_mps(data):
    count = 0
    first = True
    batches = []
    batch = []
    for probably_path in data["meta_paths"]:
        if count % 6 == 0:
            # Don't add empty batches
            if len(batch) > 0:
                batches.append(batch)
                batch = []
        else:
            if 'time_to_rate' not in probably_path.keys():
                batch.append(probably_path)
        count += 1
    # append last batch
    if len(batch) > 0:
        batches.append(batch)
    print('#meta-paths:', count - len(batches) - 1)
    return batches

In [7]:
def construct_graph(batches):
    ## Construct rating graph
    from util.datastructures import MetaPathRatingGraph, MetaPath
    graph = MetaPathRatingGraph()

    for batch in batches:
        #ordered = sorted(batch, key=lambda x: float(x['rating']))
        for metapath in batch:
            for another_metapath in batch:
                if metapath is another_metapath:
                    continue
                if float(metapath['rating']) <= float(another_metapath['rating']):
                    graph.add_user_rating(MetaPath.from_list(another_metapath['metapath']), MetaPath.from_list(metapath['metapath']), 
                                          distance=float(another_metapath['rating']) - float(metapath['rating']))
    return graph

In [8]:
## Clean up data // remove time_to_rate from array of ratings.
def clean_up(data):
    batches = extract_mps(data)
    return batches, construct_graph(batches)

In [9]:
p_batches, p_graph = clean_up(potato)

#meta-paths: 51


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [11]:
domain_score = domain_scoring.DomainScoringRegressor()

In [12]:
# Preprocess meta-paths
domain_score.vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), token_pattern='\\b\\w+\\b')

In [13]:
domain_score.fit(p_graph)

In [14]:
x, y = domain_score._extract_data_labels(p_graph)
x_raw = x
x = domain_score._preprocess(x)

In [15]:
from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences

In [16]:
labeler = preprocessing.LabelEncoder()
labeler.fit(types)

LabelEncoder()

In [17]:
def to_sequence(in_list):
    return in_list.reshape((-1, 1))

In [18]:
def one_hot(in_sequence, distinct_values):
    sequence = np.zeros((len(in_sequence), distinct_values))
    i = 0
    for point in in_sequence:      
        sequence[i][point] = 1
        i += 1
    return sequence

In [19]:
def preprocess_raw(raw, distinct_values, max_len=9):
    data = []
    for a,b in raw:
        # encode labels to integers (0 is reserved for padding)
        a = np.array(labeler.transform(a.as_list())) + 1
        b = np.array(labeler.transform(b.as_list())) + 1
        # pad to same length
        a, b = pad_sequences([a, b], max_len, padding='post', value=0)
        # merge a and b
        sequence = np.append(a, b)
        # to sequence
        sequence = to_sequence(sequence)
        # one-hot encode because we don't have distances/embeddings
        sequence = one_hot(sequence, distinct_values + 1)
        data.append(sequence)
    return data

In [20]:
x_preprocess = preprocess_raw(x_raw, len(types))

In [21]:
test_size = 0.3
random_state = 42

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    shuffle=True)

In [23]:
len(x_train)

135

In [24]:
np.array(x_train).shape

(135, 44)

In [25]:
y

[0.15,
 -0.15,
 0.04999999999999999,
 -0.04999999999999999,
 0.04999999999999999,
 -0.04999999999999999,
 0.1,
 -0.1,
 0.1,
 -0.1,
 0.05000000000000002,
 -0.05000000000000002,
 0.1,
 -0.1,
 0.05000000000000002,
 -0.05000000000000002,
 0.04999999999999999,
 -0.04999999999999999,
 0.1,
 -0.1,
 0.16,
 -0.16,
 0.17,
 -0.17,
 0.03,
 -0.03,
 0.06,
 -0.06,
 0.07,
 -0.07,
 0.010000000000000009,
 -0.010000000000000009,
 0.07,
 -0.07,
 0.13,
 -0.13,
 0.14,
 -0.14,
 0.13,
 -0.13,
 0.34,
 -0.34,
 0.21000000000000002,
 -0.21000000000000002,
 0.15000000000000002,
 -0.15000000000000002,
 0.08000000000000002,
 -0.08000000000000002,
 0.19,
 -0.19,
 0.06,
 -0.06,
 0.26,
 -0.26,
 0.13,
 -0.13,
 0.07,
 -0.07,
 0.43,
 -0.43,
 0.21000000000000002,
 -0.21000000000000002,
 0.020000000000000018,
 -0.020000000000000018,
 0.23000000000000004,
 -0.23000000000000004,
 0.21999999999999997,
 -0.21999999999999997,
 0.020000000000000018,
 -0.020000000000000018,
 0.41,
 -0.41,
 0.19,
 -0.19,
 0.21000000000000002,
 -0.2

In [26]:
os.environ['PYTHONHASHSEED'] = '0'
def reset_seed():
    np.random.seed(random_state)
    rn.seed(random_state)
    tf.set_random_seed(random_state)

In [27]:
# Reproducible results
reset_seed()
# Build model
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=44))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

# Train model
model.fit(np.array(x_train), np.array(y_train), epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0b99df0eb8>

In [28]:
mean_squared_error(model.predict(np.array(x_test), batch_size=16), np.array(y_test))

0.03216452232974655

In [29]:
x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(x_preprocess, y,
                                                    test_size=test_size,
                                                    random_state=random_state,
                                                    shuffle=True)

In [30]:
np.array(x_p_train).shape

(135, 18, 7)

In [31]:
# Reproducible results
reset_seed()
# Use sequence classification (RNN/LSTM)
model_rnn = Sequential()
model_rnn.add(SimpleRNN(256, input_shape=(18, 7)))
model_rnn.add(Dropout(0.5))
model_rnn.add(Dense(1, activation='sigmoid'))

model_rnn.compile(loss='mean_squared_error',
              optimizer='sgd',
              metrics=['accuracy'])

model_rnn.fit(np.array(x_p_train), np.array(y_p_train), batch_size=35, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0b87b89ba8>

In [45]:
y_predicted = model_rnn.predict(np.array(x_p_test), batch_size=16)
y_p_test = np.array(y_p_test)
mean_squared_error(y_predicted, Y_p_test)

0.035975770946391046

In [46]:
score = np.mean(np.logical_and((y_p_test) > 0, (y_predicted) > 0))
print('Test accuracy is {}'.format(score))

Test accuracy is 0.423728813559322
