In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import regex as re
from nltk.corpus import stopwords

In [2]:
movie = pd.read_csv('..\Data\movie_info.csv')

In [3]:
movie = movie[['Rating', 'Synopsis']].dropna().reset_index(drop=True)

In [4]:
for i in range(len(movie)):
    movie['Rating'].iloc[i] = movie['Rating'].iloc[i].split(' ')[0]

In [5]:
for i in range(len(movie)):
    text = movie['Synopsis'].iloc[i]

    text = text.lower()
    text = re.sub(r'[^\w\s]',' ',text)
    text = re.sub("\d+", " ", text)
    stop = stopwords.words('english')
    text = " ".join(text for text in text.split() if text not in stop)

    movie['Synopsis'].iloc[i] = text

In [6]:
movie

Unnamed: 0,Rating,Synopsis
0,R,allan karlsson year old explosives expert esca...
1,PG,frank morris clint eastwood hardened con histo...
2,PG-13,tami oldham richard sharp anticipate would sai...
3,PG,billy crystal stars sammy agitated agent lost ...
4,PG-13,challenges impending parenthood turn lives fiv...
...,...,...
15654,PG-13,last summer middle school comes close four bes...
15655,PG-13,buffy summers kristy swanson nothing meets mer...
15656,R,clown killed party mishap comes back dead seek...
15657,PG,sequel dreamworks animation oscar nominated bl...


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
labels = encoder.fit_transform(movie['Rating'].values)
labels

array([4, 2, 3, ..., 4, 2, 4])

In [8]:
encoder.classes_

array(['G', 'NC-17', 'PG', 'PG-13', 'R', 'TV14', 'TVG', 'TVMA', 'TVPG',
       'TVY7'], dtype=object)

In [9]:
movie['label'] = labels
movie

Unnamed: 0,Rating,Synopsis,label
0,R,allan karlsson year old explosives expert esca...,4
1,PG,frank morris clint eastwood hardened con histo...,2
2,PG-13,tami oldham richard sharp anticipate would sai...,3
3,PG,billy crystal stars sammy agitated agent lost ...,2
4,PG-13,challenges impending parenthood turn lives fiv...,3
...,...,...,...
15654,PG-13,last summer middle school comes close four bes...,3
15655,PG-13,buffy summers kristy swanson nothing meets mer...,3
15656,R,clown killed party mishap comes back dead seek...,4
15657,PG,sequel dreamworks animation oscar nominated bl...,2


## Chia tập train và test

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(movie['Synopsis'].values, movie['label'].values, test_size = 0.3, random_state = 0)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(x_train) # training vector TF-IDF pada tiap data
features_test = vectorizer.transform(x_test)

In [12]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Softmax, Dropout

model = Sequential()
model.add(Dense(256, input_shape = (features.shape[1],), activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(128, activation = 'sigmoid'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(128, activation = 'sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(19))
model.add(Softmax())

In [13]:
callback = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.2, patience=2, restore_best_weights=True)

model.compile(optimizer = 'adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               9199104   
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dense_3 (Dense)             (None, 128)               16512     
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 19)                2

In [15]:
model.fit(features.todense(), y_train, validation_data = (features_test.todense(), y_test), epochs = 10, callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x1bf4e431df0>

In [16]:
t = """Simu Liu (Shang-Chi And The Legend Of The Ten Rings), Phillipa Soo (Hamilton), and Luke Bracey (Point Break) star in this modern twist on a classic love story from NY Times bestselling author Taylor Jenkins Reid. Emma and Jesse are living the perfect life together, until Jesse disappears in a tragic helicopter crash on their first wedding anniversary. Four years later, Emma has found happiness again and is about to marry her best friend when Jesse resurfaces, turning her world upside down and leaving her torn between two great loves."""

text = t.lower()
text = re.sub(r'[^\w\s]',' ',text)
text = re.sub("\d+", " ", text)
stop = stopwords.words('english')
text = " ".join(text for text in text.split() if text not in stop)
text

'simu liu shang chi legend ten rings phillipa soo hamilton luke bracey point break star modern twist classic love story ny times bestselling author taylor jenkins reid emma jesse living perfect life together jesse disappears tragic helicopter crash first wedding anniversary four years later emma found happiness marry best friend jesse resurfaces turning world upside leaving torn two great loves'

In [17]:
t = vectorizer.transform([text])
pre_class = np.argmax(model.predict(t.todense()),axis=1)
encoder.classes_[pre_class][0]



'PG-13'

In [18]:
model_json = model.to_json()

with open("predict_rating.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("predict_rating.h5")