In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the three source tables: movies, user ratings and the scraped IMDb data.
movies = pd.read_csv('data/ml-latest-small/movies.csv')
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
imdb = pd.read_csv('data/ml-latest-small/scraped_imdb_data.csv')

# Collapse the individual ratings into one mean 'rating' value per movieId.
ratings_agg = (
    ratings[['movieId', 'rating']]
    .groupby('movieId', as_index=False)
    .mean()
)

# From the IMDb table keep only the two features used later plus the join key.
imdb_data = imdb[['Director', 'Runtime', 'movieId']]

In [3]:
#join tables
def _indexed_by_movie_id(frame):
    """Return ``frame`` indexed by ``movieId``.

    A frame whose index is already named ``movieId`` is returned unchanged.
    This keeps the cell re-runnable: an in-place ``set_index('movieId')``
    raises ``KeyError`` on a second execution because the column has already
    been moved into the index.
    """
    if frame.index.name == 'movieId':
        return frame
    return frame.set_index('movieId')


movies = _indexed_by_movie_id(movies)
ratings_agg = _indexed_by_movie_id(ratings_agg)
imdb_data = _indexed_by_movie_id(imdb_data)

# Inner joins on the movieId index: only movies present in all three tables
# (movies, aggregated ratings, IMDb data) survive.
join_1 = movies.join(ratings_agg, how='inner')
join_2 = join_1.join(imdb_data, how='inner')
mov_data_sm = join_2

In [4]:
#preprocess genres: turn the '|'-separated genre string into one indicator column per genre
# Series of genre lists, kept under its original name for inspection.
genres = mov_data_sm['genres'].astype(str).str.split('|')

# str.get_dummies builds the indicator columns directly from the delimited
# strings. It replaces the slow row-wise apply(pd.Series)/stack expansion and
# the `.sum(level=0)` aggregation, which newer pandas releases no longer accept.
genres_bin = mov_data_sm['genres'].astype(str).str.get_dummies(sep='|')

# Keep one row per index label, as the previous level-0 aggregation did, so the
# join below stays one-to-one; `max` keeps every indicator at 0 or 1.
genres_bin = genres_bin.groupby(level=0).max()

# Rebind instead of dropping in place: the joined frame (`join_2`) that
# `mov_data_sm` aliases at this point is no longer mutated as a side effect.
mov_data_sm = mov_data_sm.drop('genres', axis=1).join(genres_bin)

In [5]:
#preprocess runtime
# Remove any leading/trailing ' ', 'm', 'i', 'n' characters from the text form
# of each runtime (str.strip treats its argument as a character set), so that
# a value such as '120 min' becomes '120'.
mov_data_sm['Runtime'] = mov_data_sm['Runtime'].astype(str).str.strip(' min')
def time_to_int(t, default=90):
    """Convert a runtime value to ``int``, falling back to ``default``.

    Parameters
    ----------
    t : object
        Value handed to ``int()`` (in this notebook, the stripped runtime text).
    default : int, optional
        Value returned when ``t`` cannot be converted. Defaults to 90, the
        fallback this function has always used.

    Returns
    -------
    int
        ``int(t)`` when the conversion succeeds, otherwise ``default``.
    """
    try:
        return int(t)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare `except`.
        return default
# Parse every stripped runtime with time_to_int and store the column with an integer dtype.
mov_data_sm['Runtime'] = mov_data_sm['Runtime'].map(time_to_int).astype(int)

In [6]:
#preprocess directors
# Replace the 'Director' column with one indicator column per distinct director.
mov_data_sm = pd.get_dummies(data=mov_data_sm, columns=['Director'])

In [7]:
#drop title
# Rebind with errors='ignore' rather than dropping in place: running the cell
# a second time is then a no-op instead of a KeyError for the missing column.
mov_data_sm = mov_data_sm.drop('title', axis=1, errors='ignore')

In [43]:
#read in posters and return normalized image array of shape (225,150,3), i.e. height x width x channels
from PIL import Image
from resizeimage import resizeimage

def get_img_arr(movieid, poster_dir="/Users/richardwen/Desktop/posters", size=(150, 225)):
    """Load one poster image as a scaled float array.

    Parameters
    ----------
    movieid : object
        Identifier whose ``str()`` completes the file name
        ``poster_<movieid>.jpg``.
    poster_dir : str, optional
        Directory holding the poster files. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.
    size : tuple, optional
        ``(width, height)`` passed to ``resizeimage.resize_cover``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(height, width, 3)``. On success it holds the pixel
        values divided by 256. When the poster cannot be opened or resized, or
        the resized image does not have exactly three channels, every element
        is -1; callers detect this by testing ``arr[0, 0, 0] < 0``.
    """
    import os  # local import: used only to build the file path

    width, height = size
    expected_shape = (height, width, 3)
    path = os.path.join(poster_dir, "poster_" + str(movieid) + ".jpg")

    try:
        # The context manager closes the underlying file; the pixel data is
        # copied into the array before the file is released.
        with Image.open(path) as img:
            arr = np.array(resizeimage.resize_cover(img, (width, height)))
    except Exception:
        # Deliberate best effort: an unreadable or too-small poster yields the
        # sentinel. Unlike a bare `except`, KeyboardInterrupt still propagates.
        return np.zeros(expected_shape) - 1

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if arr.shape != expected_shape:
        return np.zeros(expected_shape) - 1
    return arr * 1.0 / 256

(8704, 225, 150, 3)
362


In [52]:
#prepare data for random forest
from sklearn.model_selection import train_test_split
# Features are every column except the 'rating' target.
x = mov_data_sm.drop(labels='rating', axis=1)
y = mov_data_sm['rating']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=21
)

In [None]:
#random forest model 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# 200 trees. The forest is seeded (same value as the train/test split) so the
# reported error is reproducible from run to run.
rfg = RandomForestRegressor(n_estimators=200, random_state=21)
rfg.fit(x_train, y_train)
pred = rfg.predict(x_test)
# print(...) with a single argument works on both Python 2 and Python 3.
# Arguments follow scikit-learn's (y_true, y_pred) order; the MSE value is
# the same either way.
print(mean_squared_error(y_test, pred))

In [61]:
#convert images into 3d arrays for CNN
# images           -> poster arrays that loaded successfully
# indexes_selected -> the mov_data_sm index labels of those posters, same order
# n                -> number of movies whose poster could not be used
images, indexes_selected, n = [], [], 0
for movie_id in mov_data_sm.index:
    poster = get_img_arr(movie_id)
    # get_img_arr signals failure with an array filled with -1.
    if poster[0, 0, 0] < 0:
        n += 1
        continue
    images.append(poster)
    indexes_selected.append(movie_id)
# Stack the per-poster arrays into a single array with the posters on axis 0.
images = np.array(images)

In [53]:
#prepare data for CNN
x_cnn = images
# Ratings of exactly the movies whose poster was loaded, in the order of `images`.
y_cnn = mov_data_sm.loc[indexes_selected, 'rating']
# Every image must have exactly one target value.
assert len(x_cnn) == len(y_cnn)
x_cnn_train, x_cnn_test, y_cnn_train, y_cnn_test = train_test_split(
    x_cnn, y_cnn, test_size=0.2, random_state=21
)

In [57]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

Using TensorFlow backend.


In [59]:
#settings
# Both values are passed to model.fit when the CNN is trained.
epochs = 50
batch_size = 50

In [58]:
#build cnn model with keras
# Two convolutional blocks (conv -> relu -> conv -> relu -> max-pool -> dropout)
# followed by a 512-unit dense layer and a single linear output unit.
# Input is a (225, 150, 3) channels-last array, the shape asserted when the
# posters are loaded.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(225, 150, 3), data_format='channels_last'),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Conv2D(32, (3, 3)),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(512),
    Activation('relu'),
    Dense(1),
    Activation('linear'),
])

# Use the RMSprop class itself: the lowercase `rmsprop` name is only an alias
# and is not available in every Keras release.
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)

# Regression setup: mean squared error is both the loss and the reported metric.
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mean_squared_error'])

In [60]:
#fit model, run on AWS
# NOTE: the held-out test split doubles as the validation data here.
model.fit(
    x=x_cnn_train,
    y=y_cnn_train,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(x_cnn_test, y_cnn_test)
)

Train on 6963 samples, validate on 1741 samples
Epoch 1/50
1050/6963 [===>..........................] - ETA: 588s - loss: 19.7558 - mean_squared_error: 19.7558

KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out split; the first returned value is reported as the test MSE.
scores = model.evaluate(x_cnn_test, y_cnn_test, verbose=1)
# Format into a single string so the output reads "Test MSE: <value>" on both
# Python 2 and Python 3. With two comma-separated values in parentheses,
# Python 2 would print a tuple.
print('Test MSE: %s' % scores[0])