In [None]:
# Code from here 
# https://github.com/DOsinga/deep_learning_cookbook/blob/master/04.1%20Collect%20movie%20data%20from%20Wikipedia.ipynb

# Some of the steps needed to download the movies

In [1]:
import xml.sax
import subprocess
import os

In [2]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._movies = []
        self._curent_tag = None

    def characters(self, content):
        if self._curent_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text'):
            self._curent_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._curent_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            movie = process_article(**self._values)
            if movie:
                self._movies.append(movie)

In [3]:
path = 'data/wp_movies_10k.ndjson'

In [4]:
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)
for line in subprocess.Popen(['bzcat'], stdin=open(path), stdout=subprocess.PIPE).stdout:
    try:
        parser.feed(line)
    except StopIteration:
        break

In [5]:
#with open('data/wp_movies_10k.ndjson', 'wt') as fout:
#    for movie in handler._movies:
#         fout.write(json.dumps(movie) + '\n')

# Process movies

In [1]:
import json
from collections import Counter
import random
import numpy as np

In [2]:
with open('data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

In [3]:
print(len(movies))

10000


In [4]:
print(movies[0])

['Deadpool (film)', {'image': 'Deadpool poster.jpg', 'name': 'Deadpool', 'cinematography': 'Ken Seng', 'Software Used': 'Adobe Premier Pro', 'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.", 'distributor': '20th Century Fox', 'caption': 'Theatrical release poster', 'gross': '$783.1 million', 'country': 'United States', 'director': 'Tim Miller', 'runtime': '108 minutes', 'editing': 'Julian Clarke', 'language': 'English', 'music': 'Tom Holkenborg', 'budget': '$58 million'}, ['Tim Miller (director)', 'Simon Kinberg', 'Ryan Reynolds', 'Lauren Shuler Donner', 'Rhett Reese', 'Paul Wernick', 'Deadpool', 'Fabian Nicieza', 'Rob Liefeld', 'Morena Baccarin', 'Ed Skrein', 'T.J. Miller', 'Gina Carano', 'Leslie Uggams', 'Brianna Hildebrand', 'Stefan Kapičić', 'Junkie XL', 'Julian Clarke', 'Marvel Entertainment', 'Kinberg Genre'

In [5]:
print('Title:',movies[0][0])

Title: Deadpool (film)


In [6]:
print('Links:',movies[0][2])

Links: ['Tim Miller (director)', 'Simon Kinberg', 'Ryan Reynolds', 'Lauren Shuler Donner', 'Rhett Reese', 'Paul Wernick', 'Deadpool', 'Fabian Nicieza', 'Rob Liefeld', 'Morena Baccarin', 'Ed Skrein', 'T.J. Miller', 'Gina Carano', 'Leslie Uggams', 'Brianna Hildebrand', 'Stefan Kapičić', 'Junkie XL', 'Julian Clarke', 'Marvel Entertainment', 'Kinberg Genre', 'Lauren Shuler Donner', 'TSG Entertainment', '20th Century Fox', 'Le Grand Rex', 'Variety (magazine)', 'Box Office Mojo', 'superhero film', 'Tim Miller (director)', 'Rhett Reese', 'Paul Wernick', 'Marvel Comics', 'Deadpool', 'X-Men (film series)', 'Ryan Reynolds', 'Morena Baccarin', 'Ed Skrein', 'T.J. Miller', 'Gina Carano', 'Leslie Uggams', 'Brianna Hildebrand', 'Stefan Kapičić', 'antihero', 'New Line Cinema', '20th Century Fox', 'X-Men Origins: Wolverine', 'principal photography', 'Vancouver', 'IMAX', 'Digital Light Processing', 'D-Box Technologies', 'List of accolades received by Deadpool (film)', 'Golden Globe Award', 'Golden Globe

In [7]:
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [8]:
print(f'Number of links from other movies to {movies[0][0]}: {link_counts[movies[0][0]]}')

Number of links from other movies to Deadpool (film): 20


In [9]:
print('Number of links for Rotten Tomatoes',link_counts['Rotten Tomatoes'])

Number of links for Rotten Tomatoes 9393


In [10]:
# Keep links that exist at least 3 times
top_links = [link for link, c in link_counts.items() if c >= 3]
top_links[:10]

['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller']

In [11]:
print(f'All links are {len(link_counts.items())}, we keep the {len(top_links)} with at least 3 occurences')

All links are 218029, we keep the 66913 with at least 3 occurences


In [12]:
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
print(f'Index for {top_links[0]} is {link_to_idx[top_links[0]]}')
print(f'Index for {top_links[10]} is {link_to_idx[top_links[10]]}')
#link_to_idx gets a link and returns its index in list top_links

Index for Tim Miller (director) is 0
Index for Gina Carano is 10


In [13]:
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
print(f'Index for {movies[0][0]} is {movie_to_idx[movies[0][0]]}')
#movie_to_idx gets a movie and returns its index in list movies

Index for Deadpool (film) is 0


In [14]:
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)

In [15]:
for movie in movies[:10]:
    print(movie[0])
    test_pairs=[]
    test_pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
    print(test_pairs)
#pairs creates a tuple of link_id and movie_id for links that exist in a movie

Deadpool (film)
[(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (3, 0), (18, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (0, 0), (4, 0), (5, 0), (24, 0), (6, 0), (25, 0), (2, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (26, 0), (27, 0), (19, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (6, 0), (42, 0), (43, 0), (44, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (2, 0), (6, 0), (56, 0), (28, 0), (57, 0), (7, 0), (43, 0), (58, 0), (8, 0), (45, 0), (59, 0), (0, 0), (9, 0), (49, 0), (1, 0), (60, 0), (10, 0), (46, 0), (11, 0), (50, 0), (61, 0), (12, 0), (52, 0), (53, 0), (62, 0), (63, 0), (64, 0), (65, 0), (13, 0), (51, 0), (66, 0), (67, 0), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (76, 0), (1

In [16]:
#print()
#p = pairs[0]
#print(p[0])
first_movie_pairs = []
first_movie_pairs.extend( (p for p in pairs if p[1]==0) )
print(first_movie_pairs)

[(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (3, 0), (18, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (0, 0), (4, 0), (5, 0), (24, 0), (6, 0), (25, 0), (2, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (26, 0), (27, 0), (19, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (6, 0), (42, 0), (43, 0), (44, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (2, 0), (6, 0), (56, 0), (28, 0), (57, 0), (7, 0), (43, 0), (58, 0), (8, 0), (45, 0), (59, 0), (0, 0), (9, 0), (49, 0), (1, 0), (60, 0), (10, 0), (46, 0), (11, 0), (50, 0), (61, 0), (12, 0), (52, 0), (53, 0), (62, 0), (63, 0), (64, 0), (65, 0), (13, 0), (51, 0), (66, 0), (67, 0), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (76, 0), (1, 0), (77, 0), (

In [17]:
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [18]:
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

Using TensorFlow backend.


In [19]:
def movie_embedding_model(embedding_size=50):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    link_embedding = Embedding(name='link_embedding', 
                               input_dim=len(top_links), 
                               output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding', 
                                input_dim=len(movie_to_idx), 
                                output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        3345650     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        500000      movie[0][0]                      
__________________________________________________________________________________________________
dot_produc

In [20]:
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id, movie_id) in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([20558., 31254., 32318., 22418., 13365., 32643.,  3801., 48731.,
          1313.]),
  'movie': array([ 849., 5530., 7685., 1529., 6238., 7628., 5874., 1854., 7236.])},
 array([-1.,  1., -1.,  1., -1., -1., -1., -1.,  1.]))

In [21]:
positive_samples_per_batch = 512

model.fit_generator(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/15
 - 222s - loss: 0.4157
Epoch 2/15
 - 265s - loss: 0.2534
Epoch 3/15
 - 326s - loss: 0.2541
Epoch 4/15
 - 265s - loss: 0.2465
Epoch 5/15
 - 243s - loss: 0.2459
Epoch 6/15
 - 245s - loss: 0.2476
Epoch 7/15
 - 251s - loss: 0.2387
Epoch 8/15
 - 314s - loss: 0.2437
Epoch 9/15
 - 301s - loss: 0.2446
Epoch 10/15
 - 336s - loss: 0.2567
Epoch 11/15
 - 332s - loss: 0.2567
Epoch 12/15
 - 352s - loss: 0.2590
Epoch 13/15
 - 438s - loss: 0.2620
Epoch 14/15
 - 304s - loss: 0.2385
Epoch 15/15
 - 285s - loss: 0.2358


<keras.callbacks.History at 0x188bfa46e80>

In [30]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [31]:
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

In [None]:
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [22]:
# Get the embeddings layer for movies
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, movies[c][0], dists[c])



In [23]:
similar_movies('Rogue One')

29 Rogue One 0.99999994
19 Interstellar (film) 0.9699454
3349 Star Wars: The Force Awakens 0.96456456
245 Gravity (film) 0.9623266
25 Star Wars sequel trilogy 0.95689327
101 Prometheus (2012 film) 0.9508581
86 Tomorrowland (film) 0.94835293
181 Pacific Rim (film) 0.9483281
62 Fantastic Beasts and Where to Find Them (film) 0.9469297
372 The Amazing Spider-Man (2012 film) 0.94672596


In [24]:
print(len(normalized_movies))
my_movie = 'Rogue One'
my_movie = 'Deadpool (film)'
for mov in movies[:10]:
    print('Embedding for movie ', mov[0], normalized_movies[movie_to_idx[mov[0]]])
    dist = np.dot(normalized_movies[movie_to_idx[mov[0]]], normalized_movies[movie_to_idx[my_movie]])
    print('Distance from ', my_movie,dist)

10000
Embedding for movie  Deadpool (film) [-0.14955294  0.00106091  0.05171971  0.3408223  -0.06126123 -0.06886251
 -0.07799357 -0.21200722  0.04472226  0.3114141   0.13159375 -0.0857121
 -0.18723357 -0.21329294  0.05470647  0.01476673 -0.22184294  0.00170165
  0.08567134 -0.26278624  0.07065912 -0.036895    0.01056606 -0.11456358
  0.01930739 -0.04799744 -0.29009888 -0.0249148  -0.15026273 -0.04259526
  0.33606774 -0.0713882   0.01177547  0.2742229  -0.09725597  0.18392338
 -0.04437275 -0.08470355 -0.03168604  0.0674007  -0.10773304  0.03262369
  0.07684439 -0.11433008  0.02915251  0.0342999  -0.02739471  0.21867752
  0.01961173  0.02894497]
Distance from  Deadpool (film) 1.0000001
Embedding for movie  The Revenant (2015 film) [-0.29834604 -0.01788932  0.02469176  0.18686444  0.09642974 -0.04551457
 -0.10605694 -0.29628813 -0.03666583  0.20760658  0.13658133 -0.01875682
 -0.0941946  -0.29857802  0.01708717  0.0027443  -0.11210477  0.07666252
  0.06685811 -0.15849805  0.00916505  0.15

In [25]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

127 George Lucas 0.9999999
2707 Star Wars 0.9571903
3176 Star Wars (film) 0.9199441
976 Hugo Award for Best Dramatic Presentation 0.9106061
4830 widescreen 0.904821
2931 LaserDisc 0.90081215
2984 Saturn Award for Best Science Fiction Film 0.88510555
4051 novelization 0.88298804
2778 Lucasfilm 0.87901235
2860 Steven Spielberg 0.8772577


In [26]:
link_weights[:10]

array([[ 6.59459710e-01, -3.05113494e-01,  1.78294554e-01,
        -3.51522475e-01, -1.16291463e-01,  1.53800175e-02,
        -2.14736030e-01,  5.10043323e-01,  4.92311835e-01,
        -1.09864265e-01,  1.38826370e-01,  1.04873434e-01,
         1.03481777e-01,  2.56983846e-01, -1.96908563e-01,
         3.13127577e-01,  3.13498527e-02, -1.31013289e-01,
         3.37424457e-01,  8.15265626e-02, -3.85932252e-02,
        -2.77592570e-01,  4.72134531e-01,  3.00799578e-01,
         2.92815000e-01,  1.48443922e-01,  1.56209677e-01,
         3.43578130e-01,  3.46012600e-02, -5.39597757e-02,
        -3.29789847e-01,  2.07741082e-01,  2.21270546e-01,
        -1.98924929e-01, -3.23401868e-01, -4.87453699e-01,
        -4.77734566e-01, -1.00976713e-01,  1.20029949e-01,
        -4.09130991e-01, -2.69001395e-01,  2.06812263e-01,
         3.77009362e-01,  2.15715408e-01, -4.65460002e-01,
         4.46933568e-01, -3.52678567e-01, -1.48775205e-01,
         2.53880709e-01, -3.83449942e-01],
       [ 5.57

In [66]:
normalized_links[:10]

array([[ 1.30051732e-01, -2.01022729e-01, -1.53982997e-01,
         7.71845058e-02,  2.85015889e-02,  5.51747996e-03,
        -8.74200240e-02, -2.06263140e-01, -2.33879060e-01,
         5.69674745e-02,  2.67301559e-01,  2.07131788e-01,
         2.96612024e-01,  2.50173155e-02,  1.73979610e-01,
         1.91244528e-01, -5.16384207e-02, -6.93533570e-02,
        -6.48904080e-03, -8.41802061e-02, -9.69737470e-02,
         9.13702250e-02, -2.25792840e-01, -2.22530885e-04,
         2.12989394e-02, -1.40223727e-01, -2.17906266e-01,
         1.27074823e-01, -1.98474497e-01,  1.26541480e-01,
        -7.36315874e-03,  1.39770031e-01,  1.12036087e-01,
        -1.24705620e-01,  6.75868094e-02, -6.97758347e-02,
        -1.61953270e-01, -2.25724742e-01,  1.41666591e-01,
         1.23416178e-01,  1.12372167e-01,  5.18213175e-02,
         7.50072449e-02, -1.24009244e-01, -1.49616957e-01,
        -6.66701347e-02, -1.96737289e-01,  1.51129276e-01,
        -1.09239751e-02, -1.31591618e-01],
       [ 2.21

In [27]:
# Recommender
# Let's say that a user liked the movies in list best and did'nt like the movies in list worst
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape
# X has rows equal to number of movies we know the rating for and the embeddings of each movie as columns (features)

(16, 50)

In [28]:
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
#Find the best and worst movies for this user
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])

best:
481 The Devil Wears Prada (film) 1.296585610048926
66 Skyfall 1.2760216715397332
307 Les Misérables (2012 film) 1.1899199735915271
630 The Tree of Life (film) 1.1447996798863422
149 12 Years a Slave (film) 1.118079440703199
worst:
1782 Scooby-Doo! WrestleMania Mystery -1.663828146515016
7889 The Comebacks -1.6349067101466144
5097 Ready to Rumble -1.6136054170244138
6565 Son in Law -1.5962125556059634
1878 The Little Rascals (film) -1.5858507341231631


In [33]:
best[-5:]

array([149, 630, 307,  66, 481], dtype=int64)

In [36]:
best[9995:10000]

array([149, 630, 307,  66, 481], dtype=int64)

In [37]:
reversed(best[-5:])

<reversed at 0x188bfe013c8>

In [39]:
list(reversed(best[-5:]))

[481, 66, 307, 630, 149]

In [43]:
movies[0]

['Deadpool (film)',
 {'image': 'Deadpool poster.jpg',
  'name': 'Deadpool',
  'cinematography': 'Ken Seng',
  'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'distributor': '20th Century Fox',
  'caption': 'Theatrical release poster',
  'gross': '$783.1 million',
  'country': 'United States',
  'director': 'Tim Miller',
  'runtime': '108 minutes',
  'editing': 'Julian Clarke',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'budget': '$58 million'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [44]:
movies[0][-2]

'84%'

In [45]:
movies[0][-2][:-1]

'84'

In [None]:
# Predict rotten tomeatoes score

In [46]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [48]:
from sklearn.linear_model import LinearRegression
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [49]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.06'

In [50]:
# Check the MSE of just the mean
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.09'

In [51]:
# Get the gross of each movie
def gross(movie):
    v = movie[1].get('gross')
    if not v or not ' ' in v:
        return None
    v, unit = v.split(' ', 1)
    unit = unit.lower()
    if not unit in ('million', 'billion'):
        return None
    if not v.startswith('$'):
        return None
    try:
        v = float(v[1:])
    except ValueError:
        return None
    if unit == 'billion':
        v *= 1000
    return v

movie_gross = [gross(m) for m in movies]
movie_gross = np.asarray([gr for gr in movie_gross if gr is not None])
highest = np.argsort(movie_gross)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

6 The Martian (film) 10900.0
7 List of Marvel Cinematic Universe films 4300.0
49 Back to the Future 3900.0
71 The Conjuring 2932.0
162 Thor (film) 2464.0
36 Furious 7 2340.0
30 Finding Dory 2187.0
1906 Jane Eyre (2011 film) 2068.0
19 Interstellar (film) 1670.0
2251 An American Werewolf in London 1655.0


In [52]:
gross_y = np.asarray([gr for gr in movie_gross if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie, gr in zip(movies, movie_gross) if gr])

In [53]:
# Predict the gross
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [54]:
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 8575.69'

In [55]:
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 14115.59'