In [1]:
# Read dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Train test split
from sklearn.model_selection import train_test_split
# Build model
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Flatten, Dense, Softmax, Dropout
from nltk.corpus import stopwords
import re

# Đọc dữ liệu

In [2]:
# Load the cleaned Rotten Tomatoes export and keep only the first 20,000 rows.
# The Windows path is written as a raw string so its backslashes are never
# interpreted as escape sequences; in a normal string, pairs such as "\C" or
# "\R" are invalid escapes and produce a warning on recent Python versions.
data = pd.read_csv(r'E:\Crawl_Web\Rotten Tomatoes\Preprocessing\movie_cleaned.csv')[:20000]

In [3]:
# Preview the first three rows of the loaded data.
data.head(3)

Unnamed: 0,Title,Score,Review count,Genre,Rating,Synopsis
0,Paa,58.5,1008,Drama,PG-13,A politician's 13-year-old son has a rare diso...
1,Small Town Wisconsin,85.5,62,"Drama, Comedy",PG-13,"After losing a custody battle, perpetual teena..."
2,The 100-Year-Old Man Who Climbed Out the Windo...,68.5,2581,"Comedy, Adventure",R,"Allan Karlsson, a 100-year-old explosives expe..."


# Tiền xử lí dữ liệu

In [4]:
# Drop the columns that are not used by the model.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises a KeyError
# because the columns have already been removed.
data = data.drop(columns=['Title', 'Review count'], errors='ignore')

In [5]:
# Display the frame after the unused columns have been dropped.
data

Unnamed: 0,Score,Genre,Rating,Synopsis
0,58.5,Drama,PG-13,A politician's 13-year-old son has a rare diso...
1,85.5,"Drama, Comedy",PG-13,"After losing a custody battle, perpetual teena..."
2,68.5,"Comedy, Adventure",R,"Allan Karlsson, a 100-year-old explosives expe..."
3,91.0,"Drama, Mystery & thriller, Action",PG,"Frank Morris (Clint Eastwood), a hardened con ..."
4,67.0,"Drama, Adventure, Romance",PG-13,Tami Oldham and Richard Sharp couldn't anticip...
...,...,...,...,...
19995,69.5,"Drama, Lgbtq+",PG-13,"Ashraf (Yousef ""Joe"" Sweid), a Palestinian, me..."
19996,84.0,Documentary,R,Terrorist suspect Abu Zubaydah becomes the fir...
19997,18.5,Comedy,PG,"Two friends from Brooklyn (Jerry O'Connell), (..."
19998,89.0,"Drama, Comedy",R,A young Englishman dreams of escaping from his...


In [6]:
# Build one free-text field per movie: genre, rating and synopsis joined by
# single spaces.
data['Info'] = data['Genre'].str.cat([data['Rating'], data['Synopsis']], sep=' ')

In [7]:
# Inspect the combined text column.
data['Info']

0        Drama PG-13 A politician's 13-year-old son has...
1        Drama, Comedy PG-13 After losing a custody bat...
2        Comedy, Adventure R Allan Karlsson, a 100-year...
3        Drama, Mystery & thriller, Action PG Frank Mor...
4        Drama, Adventure, Romance PG-13 Tami Oldham an...
                               ...                        
19995    Drama, Lgbtq+ PG-13 Ashraf (Yousef "Joe" Sweid...
19996    Documentary R Terrorist suspect Abu Zubaydah b...
19997    Comedy PG Two friends from Brooklyn (Jerry O'C...
19998    Drama, Comedy R A young Englishman dreams of e...
19999    Drama PG-13 In a provincial Iranian town, the ...
Name: Info, Length: 20000, dtype: object

In [8]:
# Cache of stop-word sets keyed by language, so the NLTK word list is loaded
# once instead of on every preprocess_text call.
_STOP_WORDS_CACHE = {}


def _load_stop_words(language='english'):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


# Text preprocessing function
def preprocess_text(text, stop_words=None):
    """Normalize a piece of text before it is vectorized.

    Steps, in order: lowercase the text, replace punctuation with spaces,
    replace runs of digits with spaces, then drop stop words and collapse
    all whitespace to single spaces.

    Args:
        text: The input string.
        stop_words: Optional collection of lowercase words to remove. When
            None (the default), the NLTK English stop-word list is used.

    Returns:
        The cleaned text with the remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _load_stop_words('english')
    text = text.lower()  # Lowercase the text
    # Replace punctuation with spaces; "_" counts as a word character and is kept.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)  # Replace digits with spaces
    # Remove stop words; split() also collapses the extra spaces added above.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [9]:
# Clean every combined text entry with preprocess_text.
data['Info'] = data['Info'].map(preprocess_text)

In [10]:
# Divide Score by 10 (for example, 58.5 becomes 5.85).
# NOTE(review): this overwrites the column in place, so re-running the cell
# divides the scores by 10 again.
data['Score'] = data['Score']/10

# Xây dựng mô hình

## Vector hóa dữ liệu đầu vào bằng TF-IDF

In [11]:
# Fit a TF-IDF vocabulary on the cleaned text and turn each row into a feature vector.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so the vocabulary and IDF weights also reflect the test
# rows — consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data['Info'])

# Chọn biến X và Y

In [12]:
# Inputs are the TF-IDF features; the target is the Score column.
X, Y = features, data['Score']

# Chia tập dữ liệu thành 2 tập: train (80%) và test (20%)

In [13]:
# Hold out 20% of the rows for testing; random_state is fixed at 42 for the split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# Build model

In [14]:
# Seed TensorFlow's global random generator so that weight initialisation and
# dropout start from a fixed state, making training runs more repeatable.
# The value matches the random_state used for the train/test split.
tf.random.set_seed(42)

# Fully connected regression network: four ReLU hidden layers, each followed
# by dropout, and a single linear output unit for the score.
model = Sequential()
model.add(Dense(64, input_dim = x_train.shape[1], activation = 'relu'))  # input width = number of TF-IDF features
model.add(Dropout(0.15))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(40, activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(54, activation = 'relu'))
model.add(Dropout(0.18))
model.add(Dense(1))  # no activation: unbounded scalar prediction

In [15]:
# Train with the Adam optimizer on mean-squared-error loss; MSE and MAE are
# also reported as metrics.
model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mse', 'mae'])

In [16]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                3002880   
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 40)                2600      
                                                                 
 dropout_2 (Dropout)         (None, 40)                0         
                                                                 
 dense_3 (Dense)             (None, 54)                2

In [17]:
# Train for 20 epochs, holding back 10% of the training rows for validation.
# The sparse TF-IDF matrix is densified before being handed to Keras:
#   * astype('float32') is applied while the matrix is still sparse, so the
#     dense copy takes half the memory of the default float64 TF-IDF values;
#   * toarray() returns a regular ndarray, whereas todense() returns the
#     discouraged numpy.matrix type.
model.fit(x_train.astype('float32').toarray(), y_train, validation_split=0.1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ea72433580>

# Dự đoán điểm phim

In [31]:
# Movie genre
genre = input('Thể loại phim:')
# NOTE(review): genre_vector is not used in any of the visible cells — remove
# it if nothing later in the notebook depends on it.
genre_vector = []

# Movie rating
rating = input('Rating phim:')


# Movie synopsis
synopsis = input('Tóm tắt phim:')

In [32]:
# Predict
# Join the inputs in the same "genre rating synopsis" order used to build the
# training text, clean them with the same function, and vectorize them with
# the already-fitted TF-IDF vocabulary.
text = genre + ' ' + rating + ' ' + synopsis
info_vector = vectorizer.transform([preprocess_text(text)])

# toarray() yields a regular ndarray (todense() returns the discouraged
# numpy.matrix type). The model was trained on Score / 10, so the printed
# prediction is on that rescaled range.
predict = model.predict(info_vector.toarray())
print(predict)

[[9.206136]]
