In [3]:
import tensorflow as tf
import pandas as pd
from keras.preprocessing import text, sequence
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Activation, Dropout
from keras import utils
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, LabelEncoder

In [4]:
dataset = pd.read_csv("the-movies-dataset/movies_metadata.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
dataset = dataset[dataset.genres != '[]']
dataset = dataset[dataset.overview != 'No overview found.']

In [6]:
dataset = dataset[['overview', 'genres']]
dataset = dataset.dropna()

In [7]:
dataset['genres'] = dataset['genres'].map(lambda x: x.lstrip('[{').rstrip('}]'))
dataset['genres'] = dataset['genres'].map(lambda x: x.split("}, {"))

In [8]:
dataset.reset_index().drop('index', axis = 1)

Unnamed: 0,overview,genres
0,"Led by Woody, Andy's toys live happily in his ...","['id': 16, 'name': 'Animation', 'id': 35, 'nam..."
1,When siblings Judy and Peter discover an encha...,"['id': 12, 'name': 'Adventure', 'id': 14, 'nam..."
2,A family wedding reignites the ancient feud be...,"['id': 10749, 'name': 'Romance', 'id': 35, 'na..."
3,"Cheated on, mistreated and stepped on, the wom...","['id': 35, 'name': 'Comedy', 'id': 18, 'name':..."
4,Just when George Banks has recovered from his ...,"['id': 35, 'name': 'Comedy']"
5,"Obsessive master thief, Neil McCauley leads a ...","['id': 28, 'name': 'Action', 'id': 80, 'name':..."
6,An ugly duckling having undergone a remarkable...,"['id': 35, 'name': 'Comedy', 'id': 10749, 'nam..."
7,"A mischievous young boy, Tom Sawyer, witnesses...","['id': 28, 'name': 'Action', 'id': 12, 'name':..."
8,International action superstar Jean Claude Van...,"['id': 28, 'name': 'Action', 'id': 12, 'name':..."
9,James Bond must unmask the mysterious head of ...,"['id': 12, 'name': 'Adventure', 'id': 28, 'nam..."


In [9]:
# Create MultiLabelBinarizer object
one_hot = MultiLabelBinarizer()

# One-hot encode data
pd.DataFrame(one_hot.fit_transform(dataset['genres']))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
reduced = pd.DataFrame.join(dataset.reset_index().drop('index', axis = 1), pd.DataFrame(one_hot.fit_transform(dataset['genres'])))

In [11]:
reduced.drop(['overview', 'genres'], axis = 1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
print("Total Number of entries: ", len(reduced))
print("Train entries (80%): ", int(len(reduced) * .8))
print("Test entries (20%): ", int(len(reduced) * .2))

Total Number of entries:  42207
Train entries (80%):  33765
Test entries (20%):  8441


In [13]:
train_size = int(len(reduced) * .8)
train_entries = reduced['overview'][:train_size]
train_cat = reduced.drop(['overview', 'genres'], axis = 1)[:train_size]

In [14]:
test_entries = reduced['overview'][train_size:]
test_cat = reduced.drop(['overview', 'genres'], axis = 1)[train_size:]

In [15]:
vocab_size = 600
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_entries) 

In [16]:
x_train = tokenize.texts_to_matrix(train_entries)
x_test = tokenize.texts_to_matrix(test_entries)

In [17]:
encoder = LabelBinarizer()
encoder.fit(train_cat)                     
y_train = encoder.transform(train_cat)
y_test = encoder.transform(test_cat)

In [18]:
model = []
model = Sequential()

#First Layer

model.add(Dense(512, input_shape=(vocab_size,)))    
model.add(Activation('relu'))

model.add(Dense((32)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',      
              optimizer='adam',                     
              metrics=['accuracy'])

history = model.fit(x_train, y_train,               
                    batch_size=32, 
                    epochs=2, 
                    verbose=1, 
                    validation_split=0.1)


score = model.evaluate(x_test, y_test,              
                       batch_size=32, verbose=1)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 30388 samples, validate on 3377 samples
Epoch 1/2
Epoch 2/2
Test accuracy: 0.34340203744247894


In [19]:
encoder.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [20]:
test_string = "A killer movie about some stuff done my some amazing people."
test = tokenize.texts_to_matrix(test_string)

In [21]:
test

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
result = model.predict(test)