In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
from math import sqrt
import nltk

import keras
from keras.models import Sequential,Model
from keras.layers import Dense ,Dropout,Input
import tensorflow as tf

In [2]:
data = pd.read_csv("../datasets/processed_data.csv",index_col=0)

In [3]:
data.head()

Unnamed: 0,movie_info,Action & Adventure,Animation,Art House & International,Classics,Comedy,Documentary,Drama,Horror,Kids & Family,Musical & Performing Arts,Mystery & Suspense,Romance,Science Fiction & Fantasy,Special Interest
0,"Always trouble-prone, the life of teenager Per...",1,0,0,0,1,0,1,0,0,0,0,0,1,0
1,Kate (Catherine Keener) and her husband Alex (...,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,"A successful, middle-aged Hollywood songwriter...",0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,Following the closing arguments in a murder tr...,0,0,0,1,0,0,1,0,0,0,0,0,0,0
4,"In 1866, Professor Pierre M. Aronnax (Paul Luk...",1,0,0,0,0,0,1,0,1,0,0,0,0,0


In [4]:
infos = data["movie_info"]
targets = data.drop("movie_info",axis=1)

In [5]:
def clean_text(text):
    text = re.sub("\'", " ", text) 
    text = re.sub("[^a-zA-Z]"," ",text) 
    text = ' '.join(text.split()) 
    text = text.lower() 
    return text

In [6]:
def clean_data(data):
    return data.apply(lambda x:clean_text(x))

In [7]:
def transform_data(data):
    data = clean_data(data)
    tfidf = TfidfVectorizer(stop_words='english')
    matrix = tfidf.fit_transform(data)
    return matrix,tfidf

In [8]:
sparse_matrix,tfdif = transform_data(infos)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(sparse_matrix, targets, test_size=0.2, random_state=1)

In [10]:
x_train.shape

(13889, 42950)

In [11]:
y_train.shape

(13889, 14)

In [12]:
def build_model(nb_inputs,nb_outputs):
    model = Sequential()
    model.add(Dense(1024,activation="relu",input_dim=nb_inputs))
    model.add(Dense(512,activation="relu"))
    model.add(Dense(256,activation="relu"))
    model.add(Dense(128,activation="relu"))
    model.add(Dense(64,activation="relu"))
    model.add(Dense(32,activation="relu"))
    model.add(Dense(nb_outputs,activation="sigmoid"))
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001), 
              loss='binary_crossentropy', 
              metrics=['categorical_accuracy'])
    return model

In [13]:
def build_model(nb_inputs,nb_outputs):
    
    inputs = tf.keras.Input(shape=(nb_inputs,))

    dense_layer1 = Dense(1024, activation='relu')
    x = dense_layer1(inputs)

    dropout_layer = Dropout(0.3)
    x = dropout_layer(x)
    
    dense_layer2 = Dense(512, activation='relu')
    x = dense_layer2(x)

    predictions_layer = Dense(nb_outputs, activation='sigmoid')
    predictions = predictions_layer(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=predictions)
    
    model.summary()
    
    model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['categorical_accuracy', 
                           tf.keras.metrics.Precision(0.5),
                           tf.keras.metrics.Recall(0.5),])
    return model

In [14]:
model = build_model(x_train.shape[1],y_train.shape[1])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 42950)]           0         
                                                                 
 dense (Dense)               (None, 1024)              43981824  
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 14)                7182      
                                                                 
Total params: 44,513,806
Trainable params: 44,513,806
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(x_train.todense(),y_train,epochs=10,batch_size=200,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2518007b8b0>

In [16]:
model.evaluate(x_train.todense(),y_train)



[0.176031693816185, 0.6124991178512573, 0.9329525828361511, 0.8931636810302734]

In [17]:
model.evaluate(x_test.todense(),y_test)



[0.8637751340866089,
 0.3786351978778839,
 0.5984872579574585,
 0.49050381779670715]

In [18]:
y_pred= model.predict(x_test.todense()) 

In [19]:
def to_score(y_pred,p):
    y_pred_scores = y_pred.copy()
    for row in y_pred_scores:
        m = row.max()
        row[row>=m*p]=1
        row[row!=1]=0
    return y_pred_scores

In [20]:
y_pred_scores = to_score(y_pred,0.9)

In [21]:
genres = data.columns[1:]

In [22]:
for i,genre in enumerate(genres):
    accuracy = accuracy_score(y_test.iloc[:,i],y_pred_scores[:,i])
    precision = precision_score(y_test.iloc[:,i],y_pred_scores[:,i])
    recall = recall_score(y_test.iloc[:,i],y_pred_scores[:,i])
    f1 = f1_score(y_test.iloc[:,i],y_pred_scores[:,i])
    print(genre)
    print("Accuracy : ",accuracy)
    print("Precision : ",precision)
    print("Recall : ",recall)
    print("f1 : ",f1,"\n")

Action & Adventure
Accuracy :  0.844514828678376
Precision :  0.7190265486725663
Recall :  0.44037940379403795
f1 :  0.546218487394958 

Animation
Accuracy :  0.9680391592283328
Precision :  0.7441860465116279
Recall :  0.24242424242424243
f1 :  0.3657142857142857 

Art House & International
Accuracy :  0.8358767636049524
Precision :  0.35511363636363635
Recall :  0.2670940170940171
f1 :  0.30487804878048774 

Classics
Accuracy :  0.9107399942412899
Precision :  0.6201550387596899
Recall :  0.23460410557184752
f1 :  0.34042553191489366 

Comedy
Accuracy :  0.7457529513389001
Precision :  0.6089494163424124
Recall :  0.5654923215898826
f1 :  0.5864168618266978 

Documentary
Accuracy :  0.9308954794126115
Precision :  0.8
Recall :  0.531328320802005
f1 :  0.6385542168674699 

Drama
Accuracy :  0.6567808810826374
Precision :  0.7070707070707071
Recall :  0.6299629433562731
f1 :  0.6662933930571108 

Horror
Accuracy :  0.9116038007486323
Precision :  0.6981132075471698
Recall :  0.37851662

In [23]:
test_movies={
    "Breaking Bad":"When Walter White, a New Mexico chemistry teacher, is diagnosed with Stage III cancer and given a prognosis\
    of only two years left to live. He becomes filled with a sense of fearlessness and an unrelenting desire to secure his \
    family's financial future at any cost as he enters the dangerous world of drugs and crime.",
    "Game of thrones":"Seven noble families fight for control of the mythical land of Westeros. Friction between the houses \
    leads to full-scale war. All while a very ancient evil awakens in the farthest north. Amidst the war, a neglected military \
    order of misfits, the Night's Watch, is all that stands between the realms of men and icy horrors beyond."}

In [28]:
score = model.predict(tfdif.transform([test_movies["Game of thrones"]]).todense())

In [29]:
list(genres[(to_score(score,0.95)==1)[0]])

['Science Fiction & Fantasy']