In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
from math import sqrt

import keras
from keras.models import Sequential
from keras.layers import Dense 
import tensorflow as tf

In [2]:
data = pd.read_csv("processed_data.csv",index_col=0)

In [3]:
data.head()

Unnamed: 0,overview,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,taisto kasurinen finnish coal miner whose fath...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,episod life nikand,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ted bellhop first night job hotel unusu guest ...,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,princess leia captur held hostag evil imperi f...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,nemo,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
infos = data["movie_info"]
targets = data.drop("movie_info",axis=1)

KeyError: 'movie_info'

In [5]:
def clean_text(text):
    text = re.sub("\'", "", text) 
    text = re.sub("[^a-zA-Z]"," ",text) 
    text = ' '.join(text.split()) 
    text = text.lower() 
    
    return text

In [6]:
def clean_data(data):
    return data.apply(lambda x:clean_text(x))

In [7]:
def transform_data(data):
    data = clean_data(data)
    tfidf = TfidfVectorizer(stop_words='english')
    matrix = tfidf.fit_transform(data)
    return matrix,tfidf

In [8]:
sparse_matrix,tfdif = transform_data(infos)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(sparse_matrix, targets, test_size=0.2, random_state=1)

In [10]:
x_train.shape

(13889, 45160)

In [11]:
y_train.shape

(13889, 14)

In [17]:
Sequential().

In [181]:
def build_model(nb_inputs,nb_outputs):
    model = Sequential()
    model.add(Dense(1024,activation="relu",input_dim=nb_inputs))
    model.add(Dense(512,activation="relu"))
    model.add(Dense(256,activation="relu"))
    model.add(Dense(128,activation="relu"))
    model.add(Dense(64,activation="relu"))
    model.add(Dense(32,activation="relu"))
    model.add(Dense(nb_outputs,activation="sigmoid"))
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001), 
              loss='binary_crossentropy', 
              metrics=['categorical_accuracy'])
    return model

In [182]:
model = build_model(x_train.shape[1],y_train.shape[1])

In [183]:
model.fit(x_train.todense(),y_train,epochs=10,batch_size=50,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x211ffa00d00>

In [184]:
model.evaluate(x_test.todense(),y_test)



[0.9960055351257324, 0.3924560844898224]

In [185]:
y_pred= model.predict(x_test.todense()) 

In [186]:
def to_score(y_pred,p):
    y_pred_scores = y_pred.copy()
    for row in y_pred_scores:
        m = row.max()
        row[row>=m*p]=1
        row[row!=1]=0
    return y_pred_scores

In [208]:
y_pred_scores = to_score(y_pred,0.9)

In [209]:
genres = data.columns[1:]

In [210]:
for i,genre in enumerate(genres):
    accuracy = accuracy_score(y_test.iloc[:,i],y_pred_scores[:,i])
    precision = precision_score(y_test.iloc[:,i],y_pred_scores[:,i])
    recall = recall_score(y_test.iloc[:,i],y_pred_scores[:,i])
    f1 = f1_score(y_test.iloc[:,i],y_pred_scores[:,i])
    print(genre)
    print("Accuracy : ",accuracy)
    print("Precision : ",precision)
    print("Recall : ",recall)
    print("f1 : ",f1,"\n")

Action & Adventure
Accuracy :  0.8260869565217391
Precision :  0.5856777493606138
Recall :  0.6205962059620597
f1 :  0.6026315789473685 

Animation
Accuracy :  0.964295997696516
Precision :  0.5769230769230769
Recall :  0.22727272727272727
f1 :  0.3260869565217391 

Art House & International
Accuracy :  0.8568960552836165
Precision :  0.4161849710982659
Recall :  0.15384615384615385
f1 :  0.22464898595943839 

Classics
Accuracy :  0.9101641232363951
Precision :  0.5935483870967742
Recall :  0.2697947214076246
f1 :  0.3709677419354838 

Comedy
Accuracy :  0.7636049524906421
Precision :  0.6710526315789473
Recall :  0.5067750677506775
f1 :  0.577457539886773 

Documentary
Accuracy :  0.9277281888856896
Precision :  0.7341772151898734
Recall :  0.581453634085213
f1 :  0.648951048951049 

Drama
Accuracy :  0.683270947307803
Precision :  0.7180762852404643
Recall :  0.6876654314452091
f1 :  0.7025419145484045 

Horror
Accuracy :  0.9087244457241578
Precision :  0.6637168141592921
Recall :  

In [205]:
test_movies={
    "Breaking Bad":"When Walter White, a New Mexico chemistry teacher, is diagnosed with Stage III cancer and given a prognosis\
    of only two years left to live. He becomes filled with a sense of fearlessness and an unrelenting desire to secure his \
    family's financial future at any cost as he enters the dangerous world of drugs and crime.",
    "Game of thrones":"Seven noble families fight for control of the mythical land of Westeros. Friction between the houses \
    leads to full-scale war. All while a very ancient evil awakens in the farthest north. Amidst the war, a neglected military \
    order of misfits, the Night's Watch, is all that stands between the realms of men and icy horrors beyond."}

In [206]:
score = model.predict(tfdif.transform([test_movies["Game of thrones"]]).todense())

In [207]:
list(genres[(to_score(score,0.95)==1)[0]])

['Action & Adventure', 'Science Fiction & Fantasy']