In [1]:
# Read dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Scale data
from sklearn.preprocessing import StandardScaler
# Train test split
from sklearn.model_selection import train_test_split
# Build model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Flatten, Dense, Softmax, Dropout
from nltk.corpus import stopwords
import re

# Đọc dữ liệu

In [2]:
data = pd.read_csv('E:\Crawl_Web\Rotten Tomatoes\Preprocessing\movie_preprocessing4.csv')

In [3]:
data.head(3)

Unnamed: 0,Title,Tomatometer state,Audience state,Genre,Original Language,Director,Producer,Writer,Release Date (Theaters),Box Office (Gross USA),...,Distributor,Synopsis,Cast,Release Date (Streaming),Rating,Aspect Ratio,Sound Mix,View the collection,Score,Review count
0,Paa,rotten,upright,Drama,Hindi,R. Balki,Sunil Manchanda,R. Balki,"Dec 4, 2009 limited",$199.2K,...,Big Pictures,A politician's 13-year-old son has a rare diso...,"Amitabh Bachchan, Abhishek Bachchan, Vidya Bal...",,PG-13,,,,58.5,1008
1,Small Town Wisconsin,fresh,upright,"Drama, Comedy",English,Niels Mueller,"Scott K. Foley, Hongtao Liu, Niels Mueller, Jo...",Jason Naczek,"Jun 3, 2022 limited",,...,Quiver Distribution,"After losing a custody battle, perpetual teena...","David Sullivan, Bill Heck, Kristen Johnston, T...","Jun 10, 2022",PG-13,,,,85.5,62
2,The 100-Year-Old Man Who Climbed Out the Windo...,fresh,upright,"Comedy, Adventure",Swedish,Felix Herngren,"Malte Forssell, Felix Herngren, Henrik Jansson...","Felix Herngren, Hans Ingemansson","May 1, 2015 limited",$923.9K,...,Music Box Films,"Allan Karlsson, a 100-year-old explosives expe...","Robert Gustafsson, Iwar Wiklander, David Wiber...","Aug 18, 2015",R,,,,68.5,2581


# Tiền xử lí dữ liệu

In [4]:
# Drop các cột không dùng trong model
data.drop(columns=['Title', 'Tomatometer state', 'Audience state', 'Original Language', 
                   'Director', 'Producer', 'Writer', 'Release Date (Theaters)', 'Box Office (Gross USA)',
                    'Aspect Ratio', 'Sound Mix', 'View the collection', 'Distributor', 'Cast', 'Release Date (Streaming)', 'Review count'], inplace=True)

In [5]:
data

Unnamed: 0,Genre,Runtime,Synopsis,Rating,Score
0,Drama,133.0,A politician's 13-year-old son has a rare diso...,PG-13,58.5
1,"Drama, Comedy",109.0,"After losing a custody battle, perpetual teena...",PG-13,85.5
2,"Comedy, Adventure",114.0,"Allan Karlsson, a 100-year-old explosives expe...",R,68.5
3,"Drama, Mystery & thriller, Action",107.0,"Frank Morris (Clint Eastwood), a hardened con ...",PG,91.0
4,"Drama, Adventure, Romance",120.0,Tami Oldham and Richard Sharp couldn't anticip...,PG-13,67.0
...,...,...,...,...,...
32551,Drama,136.0,Encouraged by his editor to seek 'sexy stories...,R,42.0
32552,Drama,89.0,A distraught man (James Coburn) embarks on a c...,R,40.5
32553,Documentary,80.0,Three brothers discover the late-night scene o...,PG-13,75.5
32554,"Drama, Comedy, Documentary",79.0,"Jeb Berrier, a regular American man, makes a p...",PG-13,90.5


In [6]:
list_genre = []
for g in data['Genre']:
    try:
        g = g.split(', ')
        for sub_g in g:
            if sub_g not in list_genre:
                list_genre.append(sub_g)
    except:
        continue

In [7]:
for g in list_genre:
    data[g] = 0

for i in range(len(data)):
    for g in list_genre:
        if not pd.isna(data['Genre'].iloc[i]) and g in data['Genre'].iloc[i]:
            data[g].iloc[i] = 1
            
data.drop(columns=['Genre'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[g].iloc[i] = 1


In [8]:
data['Score'] = data['Score']/10

In [9]:
data = data[list_genre + ['Runtime', 'Rating', 'Synopsis',	'Score']]

In [10]:
# Mã hóa cột "Rating"
data = pd.get_dummies(data, columns=['Rating'], drop_first=True)

In [11]:
# Hàm tiền xử lí dữ liệu text
def preprocess_text(text):
    text = text.lower() # Chuyển văn bản thành chữ thường
    text = re.sub(r'[^\w\s]',' ',text) # Xóa dấu câu
    text = re.sub("\d+", " ", text) # Xóa chữ số
    stop = stopwords.words('english') # Xóa các từ không có nghĩa (Stop Words)
    text = " ".join(text for text in text.split() if text not in stop)
    return text

In [12]:
for i in range(len(data)):
    data['Synopsis'].iloc[i] = preprocess_text(data['Synopsis'].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Synopsis'].iloc[i] = preprocess_text(data['Synopsis'].iloc[i])


In [13]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data['Synopsis']) # training vector TF-IDF pada tiap data
data.drop(columns=['Synopsis'], inplace=True)

In [14]:
data.columns

Index(['Drama', 'Comedy', 'Adventure', 'Mystery & thriller', 'Action',
       'Romance', 'Crime', 'Documentary', 'Horror', 'Kids & family', 'Fantasy',
       'Biography', 'Western', 'Musical', 'Music', 'Sci-fi', 'History',
       'Animation', 'Lgbtq+', 'Anime', 'Holiday', 'War', 'Stand-up', 'Nature',
       'Sports & fitness', 'Sports', 'Faith & spirituality', 'Short',
       'Variety', 'Foreign', 'Special interest', 'Entertainment', 'News',
       'Other', 'Gay & lesbian', 'Health & wellness', 'Runtime', 'Score',
       'Rating_NC-17', 'Rating_PG', 'Rating_PG-13', 'Rating_R', 'Rating_TV14',
       'Rating_TVG', 'Rating_TVMA', 'Rating_TVPG', 'Rating_TVY7'],
      dtype='object')

# Xây dựng mô hình

In [15]:
# scaler = StandardScaler()
# data_sc = data.copy().drop(columns='Score')
# data_y = data['Score']
# data_scales = scaler.fit_transform(data_sc)
# data = pd.DataFrame(data_scales, index=data_sc.index, columns=data_sc.columns)
# data['Score'] = data_y

# Chọn biến X và Y

In [16]:
# X = data[feature_selection]
X = np.concatenate((data.drop(columns=['Score', 'Runtime']).values, features.toarray()), axis=1)
Y = data['Score']

# Chia tập dữ liệu thành 2 tập train(80%) và test(20%)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# Build model

In [18]:
model = Sequential()
model.add(Dense(64, input_dim = x_train.shape[1], activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(40, activation = 'relu'))
model.add(Dropout(0.15))
model.add(Dense(54, activation = 'relu'))
model.add(Dropout(0.18))
model.add(Dense(1))

In [19]:
model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mse', 'mae'])

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                3765184   
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 40)                2600      
                                                                 
 dropout_2 (Dropout)         (None, 40)                0         
                                                                 
 dense_3 (Dense)             (None, 54)                2

In [21]:
model.fit(x_train, y_train, validation_split=0.1, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1aae0c02be0>

# Dự đoán điểm phim

In [22]:
data.columns

Index(['Drama', 'Comedy', 'Adventure', 'Mystery & thriller', 'Action',
       'Romance', 'Crime', 'Documentary', 'Horror', 'Kids & family', 'Fantasy',
       'Biography', 'Western', 'Musical', 'Music', 'Sci-fi', 'History',
       'Animation', 'Lgbtq+', 'Anime', 'Holiday', 'War', 'Stand-up', 'Nature',
       'Sports & fitness', 'Sports', 'Faith & spirituality', 'Short',
       'Variety', 'Foreign', 'Special interest', 'Entertainment', 'News',
       'Other', 'Gay & lesbian', 'Health & wellness', 'Runtime', 'Score',
       'Rating_NC-17', 'Rating_PG', 'Rating_PG-13', 'Rating_R', 'Rating_TV14',
       'Rating_TVG', 'Rating_TVMA', 'Rating_TVPG', 'Rating_TVY7'],
      dtype='object')

In [23]:
# Thể loại phim
genre = input('Thể loại phim:')
genre_vector = []

for g in list_genre:
    if g in genre:
        genre_vector.append(1)
    else:
        genre_vector.append(0)

# Thời lượng phim
# runtime = int(input('Thời lượng phim:'))

# Rating phim
rating = input('Rating phim:')
list_rating = ['NC-17', 'PG', 'PG-13', 'R', 'TV14', 'TVG', 'TVMA', 'TVPG', 'TVY7']
rating_vector = []

for r in list_rating:
    if r == 'G':
        continue
    if r == rating:
        rating_vector.append(1)
    else:
        rating_vector.append(0)

# Tóm tắt phim
synopsis = input('Tóm tắt phim:')

synopsis_vector = vectorizer.transform([preprocess_text(synopsis)])

In [24]:
# Dự đoán
info_vector = genre_vector + rating_vector + list(synopsis_vector.toarray()[0])

predict = model.predict([info_vector])
print(predict)

[[6.12737]]
