In [1]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

## 4.2　训练电影嵌入

我们先计数传出链接，将其作为一个快速的方法来看看我们认为的是否合理：

In [3]:
with open('../data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

In [4]:
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

我们模型的任务是检测电影的维基百科页面是否能找到特定的链接，因此我们需要向模型输入一些标识好的匹配和不匹配的例子。我们只保留出现至少三次的链接，建立一个列表存放有效的（link，movie）对，用于后面快速查找。我们顺便把同样的东西作为集合，也便于后续查找：

In [5]:
top_links = [link for link, c in link_counts.items() if c >= 3]
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

从原理上讲，我们把link_id和movie_id作为数字，并将它们输入到各自的嵌入层中。嵌入层将为每个可能的输入分配一个embedding_size大小的向量。然后我们将这两个向量的点积设为模型的输出。该模型将学习权重，使得该点积接近于标签。然后，这些权重将电影和链接映射到一个空间中，使得类似的电影最终处于相似的位置：

In [12]:
def movie_embedding_model(embedding_size=50):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    link_embedding = Embedding(name='link_embedding',
                               input_dim=len(top_links),
                               output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding',
                                input_dim=len(movie_to_idx),
                                output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True, axes=2)(
        [link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model


model = movie_embedding_model()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        3345650     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        500000      movie[0][0]                      
____________________________________________________________________________________________

我们使用生成器向模型输入数据。该生成器产生一些由正例和反例组成的批次数据。  
我们从（link，movie）对数组中采集正例，然后再填入反例。反例随机抽取，并确保不在pairs_set之中。然后，我们以神经网络期望的格式返回输入/输出元组数据。

In [19]:
random.seed(5)


def batchifier(pairs, positive_samples=50, negative_ratio=10):
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id, movie_id) in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]


next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([48731., 31254.,  1313., 13365., 20558., 32318.,  3801., 32643.,
         22418.]),
  'movie': array([1854., 5530., 7236., 6238.,  849., 7685., 5874., 7628., 1529.])},
 array([-1.,  1.,  1., -1., -1., -1., -1., -1.,  1.]))

### 训练模型：

In [20]:
positive_samples_per_batch = 512

model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/15
1854/1854 - 51s - loss: 0.2466
Epoch 2/15
1854/1854 - 49s - loss: 0.2270
Epoch 3/15
1854/1854 - 50s - loss: 0.2198
Epoch 4/15
1854/1854 - 51s - loss: 0.2170
Epoch 5/15
1854/1854 - 51s - loss: 0.2153
Epoch 6/15
1854/1854 - 52s - loss: 0.2143
Epoch 7/15
1854/1854 - 51s - loss: 0.2137
Epoch 8/15
1854/1854 - 53s - loss: 0.2132
Epoch 9/15
1854/1854 - 51s - loss: 0.2129
Epoch 10/15
1854/1854 - 52s - loss: 0.2126
Epoch 11/15
1854/1854 - 56s - loss: 0.2123
Epoch 12/15
1854/1854 - 54s - loss: 0.2124
Epoch 13/15
1854/1854 - 53s - loss: 0.2121
Epoch 14/15
1854/1854 - 55s - loss: 0.2120
Epoch 15/15
1854/1854 - 54s - loss: 0.2119


<tensorflow.python.keras.callbacks.History at 0x18825d070>

In [21]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, movies[c][0], dists[c])

similar_movies('Rogue One')

29 Rogue One 1.0000001
19 Interstellar (film) 0.97693247
245 Gravity (film) 0.97262806
659 Rise of the Planet of the Apes 0.96969795
25 Star Wars sequel trilogy 0.9688958
86 Tomorrowland (film) 0.9622955
62 Fantastic Beasts and Where to Find Them (film) 0.9603083
3349 Star Wars: The Force Awakens 0.9598935
221 The Dark Knight Trilogy 0.9597953
784 Spider-Man 2 0.9572549


In [22]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

127 George Lucas 1.0
3176 Star Wars (film) 0.95481646
2707 Star Wars 0.95048773
4830 widescreen 0.9459705
976 Hugo Award for Best Dramatic Presentation 0.9346279
4051 novelization 0.9107668
2778 Lucasfilm 0.908279
2931 LaserDisc 0.9057042
1732 Academy Award for Best Visual Effects 0.90343004
2810 film treatment 0.8994031


## 4.3　构建电影推荐系统

In [23]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]]
                for movie in best + worst])
X.shape

(16, 50)

In [24]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

在数据集中的所有电影上运行新的分类器，并打印最好和最差的五个：

In [25]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])

best:
481 The Devil Wears Prada (film) 1.3518291914840432
66 Skyfall 1.334615196377807
458 Hugo (film) 1.1848986072227856
307 Les Misérables (2012 film) 1.1502452556763718
3 Spectre (2015 film) 1.1388150583799725
worst:
1878 The Little Rascals (film) -1.6223719054695447
5097 Ready to Rumble -1.5909924369067088
1782 Scooby-Doo! WrestleMania Mystery -1.5784702479850679
7889 The Comebacks -1.5666192198507929
3073 Joe Dirt -1.5574117594165326


## 4.4　预测简单的电影属性

你希望预测简单的电影属性，比如烂番茄评级。  
在嵌入模型学习到的向量上使用线性回归模型预测电影的属性。

让我们尝试进行烂番茄评级。幸运的是，它们已经存在于我们的数据中，以字符串N%格式存储在movie[-2]中。

In [27]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

这使我们得到了大约一半电影的数据。让我们训练前面80%的数据：

In [28]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

LinearRegression()

现在，让我们看看模型在剩下20%的数据上的表现：

In [29]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.06'

In [32]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'数据自身方差 %2.2f' % np.mean(error ** 2)

'数据自身方差 0.09'

In [33]:
def gross(movie):
    v = movie[1].get('gross')
    if not v or not ' ' in v:
        return None
    v, unit = v.split(' ', 1)
    unit = unit.lower()
    if not unit in ('million', 'billion'):
        return None
    if not v.startswith('$'):
        return None
    try:
        v = float(v[1:])
    except ValueError:
        return None
    if unit == 'billion':
        v *= 1000
    return v

movie_gross = [gross(m) for m in movies]
movie_gross = np.asarray([gr for gr in movie_gross if gr is not None])
highest = np.argsort(movie_gross)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

6 The Martian (film) 10900.0
7 List of Marvel Cinematic Universe films 4300.0
49 Back to the Future 3900.0
71 The Conjuring 2932.0
162 Thor (film) 2464.0
36 Furious 7 2340.0
30 Finding Dory 2187.0
1906 Jane Eyre (2011 film) 2068.0
19 Interstellar (film) 1670.0
2251 An American Werewolf in London 1655.0


In [34]:
gross_y = np.asarray([gr for gr in movie_gross if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]]
                      for movie, gr in zip(movies, movie_gross) if gr])

In [35]:
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

LinearRegression()

In [36]:
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 8349.41'

In [37]:
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'数据自身方差 %2.2f' % np.mean(error ** 2)

'数据自身方差 14115.59'