In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, from_json
from cassandra.cluster import Cluster
from time import sleep
from IPython.display import display, clear_output
import uuid


# Versions used to build the Maven coordinates below; adjust them to the
# local Scala / Spark / Kafka installation.
scala_version = '2.12'  # your scala version
spark_version = '3.3.0' # your spark version
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:3.5.0' #your kafka version
]

# Local-mode session with the Kafka connector jars listed above.
spark = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)
spark

In [2]:
import pandas as pd
import tensorflow as tf
from transformers import AutoModel, AutoTokenizer # Thư viện BERT
import numpy as np
import torch
from pathlib import Path
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Embedding, Input, Reshape, Concatenate
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings("ignore")

#### PreProcessing

In [3]:
def user2coder(stng):
    """Abbreviate a customer name to its first word plus initials.

    A leading ``Mr.`` and then a leading ``Ms.`` are dropped, the first
    remaining word is kept verbatim and every following word is reduced to
    its upper-cased first character followed by a dot, e.g.
    ``"Mr. Nguyen Hoang Tuan"`` -> ``"Nguyen H. T."``.

    Args:
        stng: Raw display name; words are separated by whitespace.

    Returns:
        The abbreviated name, or ``""`` when nothing is left once the
        honorifics are removed (blank input or a name made only of titles).
    """
    lst = stng.split()
    # Check for an empty list before looking at lst[0]: blank names and bare
    # titles used to raise IndexError here.
    for title in ('Mr.', 'Ms.'):
        if lst and lst[0] == title:
            lst.pop(0)
    if not lst:
        return ''
    return ' '.join([lst[0]] + [word[0].upper() + '.' for word in lst[1:]])

In [11]:
import regex as re

# Two lookup strings: accented Vietnamese characters and unaccented ASCII letters.
# NOTE(review): neither name is referenced by the code visible in this
# notebook, and the two strings should be checked for equal length before
# being used as a positional mapping.
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"

def loaddicchar():
    """Build a character-sequence lookup table.

    Returns:
        dict: maps the i-th ``|``-separated entry of ``char1252`` to the
        i-th ``|``-separated entry of ``charutf8``.
    """
    dic = {}
    # NOTE(review): the two literals below render identically; presumably
    # char1252 holds decomposed spellings (base letter + combining mark) and
    # charutf8 the precomposed ones - confirm at the byte level.
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    # Pair the entries of the two lists by position.
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic

# Module-level lookup table, built once when this cell runs.
dicchar = loaddicchar()

# Đưa toàn bộ dữ liệu qua hàm này để chuẩn hóa lại
def convert_unicode(txt):
    """Normalise text to precomposed (NFC) Unicode.

    Vietnamese text can encode an accented letter either as one precomposed
    code point or as a base letter followed by combining marks. Downstream
    regexes and lookup tables in this notebook are written with precomposed
    characters, so all text is composed here first.

    The previous implementation substituted a fixed list of character
    sequences through a hand-written table; Unicode NFC composition is the
    standard-library equivalent and also covers sequences the table did not
    list.
    NOTE(review): this assumes the table's keys were decomposed spellings of
    its values - confirm against the original literals.

    Args:
        txt: Input string.

    Returns:
        ``txt`` in Unicode Normalization Form C.
    """
    import unicodedata  # local import: only this function needs it

    return unicodedata.normalize('NFC', txt)

def no_accent_vietnamese(s):
    """Strip Vietnamese diacritics and return the unaccented spelling.

    Args:
        s: Text that may contain accented Vietnamese letters, in either
            precomposed or decomposed Unicode form.

    Returns:
        The NFC-normalised text with every accented Vietnamese vowel replaced
        by its base letter and ``Đ``/``đ`` replaced by ``D``/``d``.
    """
    import unicodedata  # local import: only this function needs it

    # The original called convert_unicode(s) and discarded the result, so
    # decomposed input (base letter + combining mark) slipped past the
    # precomposed character classes below. Compose first and keep the result.
    s = unicodedata.normalize('NFC', s)
    s = re.sub(u'[àáạảãâầấậẩẫăằắặẳẵ]', 'a', s)
    s = re.sub(u'[ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ]', 'A', s)
    s = re.sub(u'[èéẹẻẽêềếệểễ]', 'e', s)
    s = re.sub(u'[ÈÉẸẺẼÊỀẾỆỂỄ]', 'E', s)
    s = re.sub(u'[òóọỏõôồốộổỗơờớợởỡ]', 'o', s)
    s = re.sub(u'[ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ]', 'O', s)
    s = re.sub(u'[ìíịỉĩ]', 'i', s)
    s = re.sub(u'[ÌÍỊỈĨ]', 'I', s)
    s = re.sub(u'[ùúụủũưừứựửữ]', 'u', s)
    s = re.sub(u'[ƯỪỨỰỬỮÙÚỤỦŨ]', 'U', s)
    s = re.sub(u'[ỳýỵỷỹ]', 'y', s)
    s = re.sub(u'[ỲÝỴỶỸ]', 'Y', s)
    s = re.sub(u'Đ', 'D', s)
    s = re.sub(u'đ', 'd', s)

    return s

# Vowel table: one row per base vowel. Columns 0-5 hold that vowel with each
# tone mark (column 0 = no tone mark); the last column is an ASCII code for
# the vowel (presumably its Telex spelling - confirm).
bang_nguyen_am = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
]
# One entry per tone column of the table above (presumably the Telex tone
# keys - confirm); not referenced by the code visible in this notebook.
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse index: character -> (row, column) in bang_nguyen_am. The last
# column of each row (the ASCII code) is deliberately left out.
nguyen_am_to_ids = {
    vowel: (row_no, col_no)
    for row_no, row in enumerate(bang_nguyen_am)
    for col_no, vowel in enumerate(row[:-1])
}

def is_valid_vietnam_word(word):
    """Check that all vowels of ``word`` form one contiguous run.

    Args:
        word: A single word.

    Returns:
        ``False`` as soon as a vowel (any key of ``nguyen_am_to_ids``) is
        found that does not sit directly after the previous vowel; ``True``
        otherwise, including for words with no vowel at all.
    """
    previous_vowel_pos = -1
    for pos, ch in enumerate(list(word)):
        row, _ = nguyen_am_to_ids.get(ch, (-1, -1))
        if row == -1:
            # Not a vowel: nothing to check.
            continue
        if previous_vowel_pos != -1 and pos - previous_vowel_pos != 1:
            return False
        previous_vowel_pos = pos
    return True

def chuan_hoa_dau_tu_tieng_viet(word):
    """Re-position the tone mark of a single word.

    The tone mark is stripped from whichever vowel carries it and re-applied
    to one vowel chosen by these rules, in order:

    * a word with a "qu"/"gi" prefix and fewer than two remaining vowels gets
      the tone on the vowel after the prefix (or on the prefix's own i/u when
      no vowel follows);
    * otherwise, if any vowel is in table row 4 or 8 (ê, ơ), that vowel gets it;
    * with exactly two vowels, the first gets it when the word ends in a
      vowel, else the second;
    * with three or more vowels, the second gets it.

    Args:
        word: One word. The lookups use the keys of ``nguyen_am_to_ids``,
            which come from the lower-case table ``bang_nguyen_am``.

    Returns:
        The word with its tone mark re-positioned, or ``word`` unchanged when
        ``is_valid_vietnam_word`` rejects it or when it has fewer than two
        vowels and no "qu"/"gi" prefix.
    """
    # Words whose vowels are not contiguous are returned untouched.
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0  # tone column of the last toned vowel seen; 0 = no tone mark
    nguyen_am_index = []  # positions of vowels that are candidates for the tone
    qu_or_gi = False  # set once a "qu" or "gi" pair has been seen
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            # Not a vowel.
            continue
        elif x == 9:  # row 9 is 'u': treat it as part of "qu" when preceded by 'q'
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # row 5 is 'i': treat it as part of "gi" when preceded by 'g'
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            # Remember the tone and replace the vowel by its toneless form.
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        # Once "qu"/"gi" has been seen, position 1 is not a tone candidate.
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                # The word is just the two-letter prefix: the tone goes on its second letter.
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    # A vowel follows the prefix: it takes the tone.
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    # No vowel after the prefix: the tone goes on the prefix's i (row 5) or u (row 9).
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        # Zero or one vowel and no prefix: the original spelling is returned.
        return word

    # The first vowel found in row 4 or row 8 takes the tone.
    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # rows 4 and 8 of bang_nguyen_am: ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            # The word ends with its second vowel: tone on the first vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
        else:
            # A consonant follows the second vowel: tone on the second vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # Three or more vowels: tone on the second one.
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    return ''.join(chars)

#Chuyển câu tiếng việt về chuẩn gõ dấu kiểu cũ.
def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Lower-case a sentence and normalise the tone placement of each word.

    Each whitespace-separated token is split into leading punctuation, a core
    made of letters (dots allowed inside) and trailing punctuation. Only the
    core is passed to ``chuan_hoa_dau_tu_tieng_viet``; the punctuation is put
    back unchanged. Tokens that do not have this shape are left as they are.

    Two defects of the previous version are fixed:

    * the character class was written ``[p{L}.]`` (the literal characters
      ``p { L } .``) instead of ``[\\p{L}.]``;
    * the three parts were joined with "/" and split again, so any token that
      already contained "/" (e.g. "24/7") lost its slashes and was never
      normalised. The match groups are now used directly.

    Args:
        sentence: Input text.

    Returns:
        The lower-cased sentence with single spaces between tokens.
    """
    words = sentence.lower().split()
    for index, word in enumerate(words):
        # \p{...} needs the third-party `regex` module, which this notebook
        # imports under the name `re`.
        parts = re.match(r'^(\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*)$', word)
        if parts:
            words[index] = parts.group(1) + chuan_hoa_dau_tu_tieng_viet(parts.group(2)) + parts.group(3)
    return ' '.join(words)

#Chuẩn hóa viết tắt
def chuan_hoa_viet_tat_tieng_viet(sentence):
  """Expand common abbreviations and English words into Vietnamese.

  Whole-word abbreviations (``ks``, ``nv``, ``dv``, ``view``, ``ok``, ``vs``,
  ``k``/``ko``, ``nic``/``nice``) are replaced through word-boundary regexes;
  the English terms ``check out``/``checkout``, ``check in``/``checkin``,
  ``delay`` and ``resort`` are replaced wherever they occur as substrings.
  The replacements run in the order written below.

  Args:
      sentence: Text to expand; the patterns are lower-case only.

  Returns:
      The sentence with the replacements applied.
  """
  sentence = re.sub(r'\bks\b', 'khách sạn', sentence)
  sentence = re.sub(r'\bnv\b', 'nhân viên', sentence)
  sentence = re.sub(r'\bdv\b', 'dịch vụ', sentence)
  sentence = re.sub(r'\bview\b', 'khung cảnh', sentence)
  sentence = sentence.replace('check out', 'trả phòng')
  sentence = sentence.replace('checkout', 'trả phòng')
  sentence = sentence.replace('check in', 'nhận phòng')
  sentence = sentence.replace('checkin', 'nhận phòng')
  sentence = sentence.replace('delay', 'trì hoãn')
  # Spelling fixed: "nghỉ dưỡng" (resort); the replacement used to read
  # "nghĩ dưỡng", which is a different word.
  sentence = sentence.replace('resort', 'khu nghỉ dưỡng')

  sentence = re.sub(r'\bok\b', 'tốt', sentence)
  sentence = re.sub(r'\bvs\b', 'với', sentence)
  sentence = re.sub(r'\bko?\b', 'không', sentence)
  sentence = re.sub(r'\bnice?\b', 'tuyệt vời', sentence)
  return sentence

from underthesea import word_tokenize
def text_preprocess(document):
    """Run the full cleaning pipeline on one review text.

    Steps, in order: Unicode normalisation, tone-placement normalisation,
    lower-casing, replacing every character that is not whitespace, a word
    character, ``%`` or ``_`` by a space, collapsing whitespace, expanding
    abbreviations, and word segmentation with ``word_tokenize``.

    Args:
        document: Raw review text.

    Returns:
        The segmented text returned by ``word_tokenize(..., format="text")``.
    """
    # Unicode + tone-placement normalisation, then lower-case.
    normalised = chuan_hoa_dau_cau_tieng_viet(convert_unicode(document)).lower()
    # Drop unwanted characters and squeeze the resulting whitespace.
    without_symbols = re.sub(r'[^\s\w%_]',' ',normalised)
    squeezed = re.sub(r'\s+', ' ', without_symbols).strip()
    # Expand abbreviations and stray English words, then segment into words.
    expanded = chuan_hoa_viet_tat_tieng_viet(squeezed)
    return word_tokenize(expanded, format="text")

#### Các hàm cần dùng

In [5]:
# Load danh sách stopword
def load_stopwords():
    """Read the stop-word list from ``vietnamese-stopwords.txt``.

    The file is opened relative to the current working directory and decoded
    as UTF-8.

    Returns:
        list[str]: one entry per line of the file, with newline characters
        removed (blank lines yield empty strings).
    """
    with open("vietnamese-stopwords.txt", encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return [raw.replace("\n","") for raw in raw_lines]

# Loại bỏ stopword
def remove_stopwords(line, stopwords):
    """Drop every whitespace-separated token of ``line`` found in ``stopwords``.

    Args:
        line: Text to filter.
        stopwords: Container supporting ``in`` (list, set, ...).

    Returns:
        The remaining tokens joined by single spaces.
    """
    return ' '.join(token for token in line.strip().split() if token not in stopwords)

# Tokenize comment
def vietnamese_tokenized(X):
  """Encode every text in ``X`` with the module-level ``tokenizer``.

  Stop words are not removed here.

  Args:
      X: Iterable of texts.

  Returns:
      list: ``tokenizer.encode(text)`` for each text, in input order.
  """
  return [tokenizer.encode(text) for text in X]

# Chèn thêm số 1 vào cuối câu nếu như không đủ 80 từ
def padding(max_len, X):
    """Tokenise ``X`` and force every sequence to exactly ``max_len`` ids.

    Shorter sequences are right-padded with the id 1 (the value ``mask``
    treats as padding); longer ones are truncated.

    Args:
        max_len: Target sequence length.
        X: Iterable of texts, passed to ``vietnamese_tokenized``.

    Returns:
        np.ndarray: one row of ``max_len`` ids per input text.
    """
    padded = []
    for token_ids in vietnamese_tokenized(X):
        # Truncate to max_len. The cut used to be hard-coded at 80, which
        # produced ragged rows for any other max_len.
        row = token_ids[:max_len]
        padded.append(row + [1] * (max_len - len(row)))
    return np.array(padded)

def X_tensor(X):
  """Tokenise and pad ``X`` to 80 ids per text and return a ``torch.long`` tensor."""
  padded_ids = np.array(padding(80, X))
  return torch.tensor(padded_ids).to(torch.long)

def mask(X):
  """Build the attention mask for ``X``: 0 where the padded id equals 1, else 1.

  Returns:
      np.ndarray with the same shape as ``padding(80, X)``.
  """
  padded_ids = np.array(padding(80, X))
  return np.where(padded_ids == 1, 0, 1)

def mask_tensor(X):
  """Return ``mask(X)`` converted to a torch tensor."""
  return torch.tensor(mask(X))

# Lấy features đầu ra từ BERT
def get_features(X):
  """Run the texts in ``X`` through the module-level ``phobert`` model.

  Args:
      X: Iterable of preprocessed texts.

  Returns:
      np.ndarray: the first element of the model output, computed without
      gradient tracking.
  """
  input_ids = X_tensor(X)
  attention = mask_tensor(X)
  with torch.no_grad():
      model_output = phobert(input_ids=input_ids, attention_mask=attention)
  return model_output[0][:, :, :].numpy()

def get_user_id(user_name, df_last_info):
  """Look up the encoded id of ``user_name``.

  Args:
      user_name: Exact value to search for in the ``User_name`` column.
      df_last_info: Frame with ``User_name`` and ``UserId`` columns.

  Returns:
      tuple: ``(user_id, user_id_frame)`` where ``user_id`` is the ``UserId``
      of the first matching row and ``user_id_frame`` is that single row
      restricted to the columns ``UserId`` and ``User_name``.

  Raises:
      ValueError: if no row matches. The previous loop fell through to the
      ``return`` with unassigned locals and raised UnboundLocalError.
  """
  # Positional lookup, so the frame no longer needs a default 0..n-1 index.
  hits = np.flatnonzero((df_last_info["User_name"] == user_name).to_numpy())
  if hits.size == 0:
    raise ValueError(f"user {user_name!r} not found in df_last_info")
  pos = hits[0]
  user_id = df_last_info["UserId"].iloc[pos]
  user_id_frame = df_last_info[["UserId","User_name"]].iloc[pos:pos + 1]
  return user_id, user_id_frame

def get_hotel_id(df_last_info):
  """Look up the encoded id of the hotel returned by ``get_hotel_name()``.

  Args:
      df_last_info: Frame with ``Hotel_name`` and ``HotelId`` columns.

  Returns:
      tuple: ``(hotel_id, hotel_id_frame)`` where ``hotel_id`` is the
      ``HotelId`` of the first matching row and ``hotel_id_frame`` is that
      single row restricted to the columns ``HotelId`` and ``Hotel_name``.

  Raises:
      ValueError: if no row matches. The previous loop fell through to the
      ``return`` with unassigned locals and raised UnboundLocalError.
  """
  hotel_name = get_hotel_name()
  # Positional lookup, so the frame no longer needs a default 0..n-1 index.
  hits = np.flatnonzero((df_last_info["Hotel_name"] == hotel_name).to_numpy())
  if hits.size == 0:
    raise ValueError(f"hotel {hotel_name!r} not found in df_last_info")
  pos = hits[0]
  hotel_id = df_last_info['HotelId'].iloc[pos]
  hotel_id_frame = df_last_info[['HotelId','Hotel_name']].iloc[pos:pos + 1]
  return hotel_id, hotel_id_frame

# Hàm dự đoán
def pred_func_1sample(features):
  """Predict the aspect flags and the attitude class for one sample.

  Args:
      features: Model input passed straight to ``predict_model.predict``.

  Returns:
      pd.DataFrame: one row with columns ``Service``, ``Infrastructure``,
      ``Sanitary``, ``Location`` (1 when the first output's score is >= 0.5,
      else 0) and ``Attitude`` (argmax of the second output).
  """
  outputs = predict_model.predict(features)
  cutoff = 0.5
  # First output: one score per aspect, thresholded to 0/1.
  row = [1 if score >= cutoff else 0 for score in outputs[0][0]]
  # Second output: class scores; keep the index of the largest.
  row.append(np.argmax(outputs[1][0]))

  result = pd.DataFrame(row).T
  result.columns = ['Service', 'Infrastructure','Sanitary','Location','Attitude']
  return result

# Lịch sử feedback người dùng
def feedback_history(X):
  """Extract the feedback columns from a review frame.

  The previous version ignored its argument and always read the module-level
  ``df_last_info``; the argument is now used. Every caller in this notebook
  passes ``df_last_info``, so their results are unchanged.

  Args:
      X: Frame containing at least the columns ``UserId``, ``HotelId``,
          ``Location_hotel``, ``Comment`` and ``Rating``.

  Returns:
      pd.DataFrame: a new frame with exactly those five columns and a fresh
      0..n-1 index.
  """
  # 'Location_hotel' would not be needed if it were stored in the database.
  columns = ['UserId', 'HotelId', 'Location_hotel','Comment', 'Rating']
  return X.loc[:, columns].reset_index(drop=True)

# Lấy kết quả đánh giá
def get_ratings(user_id):
  """Score every hotel the user has not reviewed yet.

  Reads the module-level ``df_last_info``, ``feedback`` and
  ``recommend_model``.

  Args:
      user_id: Encoded user id, compared against the ``UserId`` column.

  Returns:
      tuple: ``(hotels_went_by_user_df, ratings)`` - the user's rows of the
      feedback history, and the flattened model predictions, one per
      unvisited hotel in ``value_counts()`` order.
  """
  df_history = feedback_history(df_last_info)
  # Reviews written by this user (hotels already visited).
  hotels_went_by_user = df_history[df_history.UserId == user_id]
  hotels_went_by_user_df = hotels_went_by_user
  # Reviews of hotels this user has not visited.
  hotels_not_went_by_user = df_history[~df_history["HotelId"].isin(hotels_went_by_user.HotelId.value_counts().index)]
  # Unique ids of those hotels, in value_counts() order.
  id_hotels_not_went_by_user = hotels_not_went_by_user['HotelId']
  id_hotels_not_went_by_user = pd.DataFrame(id_hotels_not_went_by_user).HotelId.value_counts().index
  id_hotels_not_went_by_user = list(id_hotels_not_went_by_user)
  # Wrap each id in its own list so the ids stack as one column below.
  id_hotels_not_went_by_user = [[x] for x in id_hotels_not_went_by_user]
  # Context = row 0 of the module-level `feedback` frame (the latest comment).
  context = feedback.loc[0]

  # One row per candidate hotel: [user_id, hotel_id, *context].
  user_hotel_array = np.hstack(( [[user_id]] * len(id_hotels_not_went_by_user), id_hotels_not_went_by_user,
                               [context] * len(id_hotels_not_went_by_user) ))

  # NOTE(review): `context` is the whole feedback row. With the frame built in
  # the live-view cell (UserId, HotelId, then the five prediction columns),
  # columns 2:6 here would be UserId, HotelId, Service, Infrastructure rather
  # than the four aspect flags - confirm the intended slice.
  userID_hotel_pred = [user_hotel_array[:,:1].astype(int),
                    user_hotel_array[:,1:2].astype(int),
                    user_hotel_array[:,2:6].astype(int),
                    user_hotel_array[:,-1].astype(int)]

  ratings = recommend_model.predict(userID_hotel_pred).flatten()
  return hotels_went_by_user_df, ratings

# Lấy danh sách id khách sạn được gợi ý
def get_recommended_hotel_id(user_id):
  """Return the user's history and the ids of the ten best-scored new hotels.

  Args:
      user_id: Encoded user id.

  Returns:
      tuple: ``(hotels_went_by_user_df, recommended_hotels_ids)`` - the
      history frame produced by ``get_ratings`` and up to ten hotel ids,
      ordered from highest to lowest predicted rating.
  """
  history = feedback_history(df_last_info)
  hotels_went_by_user_df, predicted = get_ratings(user_id)

  # Rebuild the candidate list exactly as get_ratings does, so positions in
  # `predicted` line up with positions in `candidate_ids`.
  visited_ids = history[history.UserId == user_id].HotelId.value_counts().index
  unvisited = history[~history["HotelId"].isin(visited_ids)]
  candidate_ids = list(pd.DataFrame(unvisited['HotelId']).HotelId.value_counts().index)

  # Indices of the ten largest predictions, best first.
  best_first = predicted.argsort()[-10:][::-1]
  recommended_hotels_ids = [candidate_ids[k] for k in best_first]
  return hotels_went_by_user_df, recommended_hotels_ids

def get_hotel_name():
  """Return the ``Hotel_name`` field of the module-level ``df_hotel`` record."""
  return df_hotel.Hotel_name

#### Load model base

In [28]:
# lấy từ Dataset
df_last_info = pd.read_csv('dataset_final.csv')

# Integer-encode User_name and Hotel_name in order of first appearance, and
# keep both directions of each mapping.
user_ids = df_last_info["User_name"].unique().tolist()
user_encoded2user = dict(enumerate(user_ids))
user2user_encoded = {name: code for code, name in enumerate(user_ids)}

hotel_ids = df_last_info["Hotel_name"].unique().tolist()
hotel_encoded2hotel = dict(enumerate(hotel_ids))
hotel2hotel_encoded = {name: code for code, name in enumerate(hotel_ids)}

df_last_info["UserId"] = df_last_info["User_name"].map(user2user_encoded)
df_last_info["HotelId"] = df_last_info["Hotel_name"].map(hotel2hotel_encoded)

num_users, num_hotels = len(user2user_encoded), len(hotel2hotel_encoded)
df_last_info["Rating"] = df_last_info["Rating"].values.astype(np.float32)

# Rating bounds of the dataset, shown in the summary line below.
min_rating = min(df_last_info["Rating"])
max_rating = max(df_last_info["Rating"])

print("Number of users: {}, Number of hotels: {}, Min rating: {}, Max rating: {}".format(num_users, num_hotels, min_rating, max_rating))

# Load bert pretrained
def load_bert():
    """Load the pretrained PhoBERT encoder and its slow tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` for the ``vinai/phobert-base``
        checkpoint.
    """
    # "vinai/phobert-large" is the alternative checkpoint noted by the author.
    checkpoint = "vinai/phobert-base"
    v_phobert = AutoModel.from_pretrained(checkpoint)
    v_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
    return v_phobert, v_tokenizer

def predict_model():
  """Build the two-headed classifier that consumes PhoBERT features.

  Input: a (80, 768) sequence per sample. A bidirectional LSTM and two dense
  layers form the shared trunk; ``output1`` is a 4-unit sigmoid head and
  ``output2`` a 3-unit softmax head.

  NOTE: the model-loading code later in this cell rebinds the module-level
  name ``predict_model`` to the model this function returns, so the function
  can only be called once per definition.

  Returns:
      tf.keras.Model with outputs ``[output1, output2]``. The layer creation
      order must stay as written for the saved weights to load.
  """
  max_len = 80
  text_input = tf.keras.layers.Input(shape=(max_len, 768))
  # Shared trunk.
  net = Bidirectional(LSTM(128, return_sequences = False))(text_input)
  net = tf.keras.layers.Dropout(0.5) (net)
  net = tf.keras.layers.Dense(384, activation='ReLU', kernel_regularizer = regularizers.l2(0.01)) (net)
  net = BatchNormalization() (net)
  net = tf.keras.layers.Dense(192, activation='ReLU', kernel_regularizer = regularizers.l2(0.03)) (net)

  # Head 1: four independent sigmoid scores.
  net_1 = tf.keras.layers.Dense(96, activation='ReLU') (net)
  net_1 = tf.keras.layers.Dropout(0.5) (net_1)
  net_1 = tf.keras.layers.Dense(48, activation='ReLU') (net_1)
  net_1 = tf.keras.layers.Dropout(0.3) (net_1)
  net_output1 = tf.keras.layers.Dense(4, activation='sigmoid', name = 'output1') (net_1)

  # Head 2: three-way softmax.
  net_2 = tf.keras.layers.Dense(32, activation='ReLU') (net)
  net_2 = tf.keras.layers.Dropout(0.4) (net_2)
  net_output2 = tf.keras.layers.Dense(3, activation='softmax', name = 'output2') (net_2)

  output_list = [net_output1, net_output2]
  predict_model = tf.keras.Model(inputs = text_input, outputs = output_list)
  return predict_model

# Embedding widths passed to create_model().
user_embed = 40
hotel_embed = 15
topic_embed = 5

def create_model(user_embed, hotel_embed, topic_embed):
  """Build the context-aware rating model.

  Inputs (in order): ``user`` (1 id), ``hotel`` (1 id), ``topic`` (4 ids) and
  ``sentiment`` (1 value). Users, hotels and topics are embedded; the topic
  embeddings are concatenated with the sentiment and passed through a dense
  layer before being joined with the user and hotel vectors.

  NOTE(review): the embedding vocabulary sizes (500 users, 300 hotels,
  10 topic values) are hard-coded; the dataset summary printed by this
  notebook reports 368 users and 137 hotels - ids beyond these sizes would
  not fit.

  Args:
      user_embed: Width of the user embedding.
      hotel_embed: Width of the hotel embedding.
      topic_embed: Width of each topic embedding.

  Returns:
      Keras ``Model`` with a single linear output unit.
  """
  # Create two input layers
  user_id_input = Input(shape=[1], name='user')
  hotel_id_input = Input(shape=[1], name='hotel')
  # Create separate embeddings for users and hotels
  user_embedding = Embedding( input_dim=500,
                              output_dim=user_embed,
                              input_length=1,
                              name='user_embedding')(user_id_input)
  hotel_embedding = Embedding(input_dim=300,
                              output_dim=hotel_embed,
                              input_length=1,
                              name='hotel_embedding')(hotel_id_input)
  # Context
  topic_input = Input(shape=[4], name='topic')
  topic_embedding = Embedding(input_dim=10,
                              output_dim=topic_embed,
                              input_length=4,
                              name='topic_embedding')(topic_input)
  sentiment_input = Input(shape=[1], name='sentiment')
  # Flatten each embedding to a vector
  user_vectors = Reshape([user_embed])(user_embedding)
  hotel_vectors = Reshape([hotel_embed])(hotel_embedding)
  topic_vectors = Reshape([topic_embed*4])(topic_embedding)
  # Concatenate all layers into one vector
  concatenate_context = Concatenate(name='concatenate_context')([sentiment_input, topic_vectors])
  concatenate_context = Dense(21, activation='relu')(concatenate_context)
  concatenate = Concatenate()([user_vectors, hotel_vectors, concatenate_context])
  # Dense
  dense = Dense(32, activation='relu')(concatenate)
  dense = Dropout(0.2)(dense)
  output = Dense(1)(dense)

  model = Model(inputs=[user_id_input, hotel_id_input, topic_input, sentiment_input], outputs=output)
  return model

# Stop-word list and the PhoBERT encoder/tokenizer used by get_features().
stopwords = load_stopwords()
phobert, tokenizer = load_bert()

# Load the sentiment model.
# NOTE: this rebinds the name `predict_model` from the builder function to the
# built model, so the statement only works right after the function definition.
predict_model = predict_model()
predict_model.load_weights('checkpoint/checkpoint_PhoBERT_bi-acc1-0.81_acc2-0.82.hdf5')

# Load the recommender model.
recommend_model = create_model(user_embed, hotel_embed, topic_embed)
# Forward slash instead of 'checkpoint\checkpoint_...': the backslash form is
# an invalid escape sequence and only resolves on Windows, and it was
# inconsistent with the sentiment checkpoint path above.
recommend_model.load_weights('checkpoint/checkpoint_Recommend_mse-0.0268.hdf5')

Number of users: 368, Number of hotels: 137, Min rating: 4.5, Max rating: 10.0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Code process

In [7]:
topic_name = 'TestTopic'
kafka_server = 'localhost:9092'

# Batch (non-streaming) read of the topic, starting from the earliest offset.
kafkaDf = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic_name)
    .option("startingOffsets", "earliest")
    .load()
)

# Let Spark infer the message schema from one sample record.
schema = spark.read.json(
    spark.sparkContext.parallelize([{"User_name": "string", "Location_hotel": "string",
                                     "Hotel_name": "string", "Rating": "string",
                                     "Comment": "string"}])
).schema

# Parse the Kafka value as JSON and expand its fields into columns.
batch_hotel = (
    kafkaDf
    .select(from_json(kafkaDf.value.cast("string"), schema).alias("data"))
    .select("data.*")
)

In [29]:
# Newest record of the topic, with the comment cleaned and the user name
# reduced to "<first word> <initials>".
df_hotel = batch_hotel.toPandas().iloc[-1]
df_hotel["Comment"] = text_preprocess(df_hotel["Comment"])
df_hotel["User_name"] = user2coder(no_accent_vietnamese(df_hotel["User_name"]))

In [57]:
# cluster = Cluster(['127.0.0.1'], port=9042)
# session = cluster.connect('rrs_hotel')

# num_id = session.execute("SELECT COUNT(*) FROM user_reviews;").one()[0] + 1

In [33]:
# Live view: each iteration re-reads the Kafka topic, takes the newest review,
# preprocesses it and - when both the user and the hotel already exist in
# df_last_info - prints the user's history and the top-10 recommendations.
# Stop it with a keyboard interrupt.
for x in range(0, 2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        print(f"Seconds passed: {x*5}")
        df_hotel = batch_hotel.toPandas().iloc[-1]
        print("Data raw:")
        print("-------------------------------------------------------------------")
        display(df_hotel)

        df_hotel.Comment = text_preprocess(df_hotel.Comment)
        df_hotel.User_name = user2coder(no_accent_vietnamese(df_hotel.User_name))

        # Exact comparison. str.contains() treated the name as a regular
        # expression and matched substrings ("Nguyen H. T." contains regex
        # wildcards), which could disagree with the exact-match lookups in
        # get_user_id()/get_hotel_id() below.
        if df_last_info.User_name.eq(df_hotel.User_name).any():
            if df_last_info.Hotel_name.eq(df_hotel.Hotel_name).any():
                # Aspect/attitude prediction for the new comment. This call had
                # been commented out, so `y_pred` below only existed as stale
                # kernel state and raised NameError on a fresh kernel.
                y_pred = pred_func_1sample(get_features([df_hotel.Comment]))

                user_id, user_id_frame = get_user_id(df_hotel.User_name, df_last_info)
                user_id_frame = user_id_frame.drop(columns=['User_name']).reset_index(drop=True)
                hotel_id, hotel_id_frame = get_hotel_id(df_last_info)
                hotel_id_frame = hotel_id_frame.drop(columns=['Hotel_name']).reset_index(drop=True)

                # get_ratings() reads this module-level frame as its context.
                feedback = pd.concat([user_id_frame, hotel_id_frame, y_pred], axis = 1)

                hotels_went_by_user_df, recommended_hotel_ids = get_recommended_hotel_id(user_id)

                # Decode ids back to names on a new frame instead of writing
                # into a slice of df_last_info's history.
                hotels_went_by_user_df = hotels_went_by_user_df.assign(
                    UserId=hotels_went_by_user_df["UserId"].map(user_encoded2user),
                    HotelId=hotels_went_by_user_df["HotelId"].map(hotel_encoded2hotel),
                )

                recommended_hotel = pd.Series(recommended_hotel_ids).map(hotel_encoded2hotel)

                print("History:")
                display(hotels_went_by_user_df)

                print("Top 10 hotel recommendations:")
                print("----" * 8)
                for row in recommended_hotel:
                    print(row)

        else: print("Sorry, User has no recorded history!!!")

        # session.execute("INSERT INTO user_reviews (r_id, user_name, location_hotel, hotel_name, rating, comment) VALUES (%s, %s, %s, %s, %s, %s)",
                # (num_id + x, df_hotel.User_name, df_hotel.Location_hotel, df_hotel.Hotel_name, float(df_hotel.Rating), df_hotel.Comment))
        sleep(4)
        clear_output(wait=True)

    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 30
Data raw:
-------------------------------------------------------------------


Comment           Không có xe đưa đón sân bay. Khu nghỉ dưỡng ở ...
Hotel_name                           Khu nghỉ dưỡng Fusion Cam Ranh
Location_hotel                                            Nha Trang
Rating                                                          8.0
User_name                                              Nguyen H. T.
Name: 144, dtype: object

History:


Unnamed: 0,UserId,HotelId,Location_hotel,Comment,Rating
6,Nguyen H. T.,Khu nghỉ dưỡng Fusion Cam Ranh,Nha Trang,không_có xe đưa_đón sân_bay khu nghỉ_dưỡng ở m...,8.0
87,Nguyen H. T.,Khu Nghỉ Dưỡng Best Western Premier Sonasea Ph...,Phú Quốc,không_có trái_cây chào_mừng nên hơi buồn xíu đ...,8.0
243,Nguyen H. T.,Khu nghỉ dưỡng Pandanus Phan Thiết,Phan Thiết,mình đã có kỳ nghỉ ở pandanus khu_nghỉ_mát mũi...,8.0
330,Nguyen H. T.,Khách Sạn LADALAT,Đà Lạt,khách_sạn có dịch_vụ tốt mình hài_lòng khi ngh...,10.0
549,Nguyen H. T.,Khu nghỉ dưỡng Movenpick Cam Ranh,Nha Trang,trẻ con nhà mình thích tới nỗi không_muốn về n...,10.0
562,Nguyen H. T.,Khu nghỉ dưỡng Pandanus Phan Thiết,Phan Thiết,mình đã có kỳ nghỉ ở pandanus khu_nghỉ_mát mũi...,8.0
802,Nguyen H. T.,JW Marriott Phu Quoc Emerald Bay Resort & Spa,Phú Quốc,nhân_viên nhiệt_tình giá_cả hợp_lý chắc_chắn t...,10.0
1021,Nguyen H. T.,Khu nghỉ dưỡng Vinpearl Nha Trang,Nha Trang,nhìn chung tôi hài_lòng về khách_sạn này cảm_ơ...,9.0
1342,Nguyen H. T.,Lasenta Boutique Hotel Hoian,Hội An,rất thích mọi thứ thật tuyệt_vời,10.0
1430,Nguyen H. T.,Khách sạn Four Points by Sheraton Đà Nẵng,Đà Nẵng,khách_sạn dịch_vụ tốt nằm giữa trung_tâm thành...,10.0


Top 10 hotel recommendations:
--------------------------------
Dalat Edensee Lake Resort & Spa
InterContinental Đà Nẵng Sun Peninsula Resort
New Style House Hotel
Khu nghỉ dưỡng Amiana Nha Trang
Vinpearl Condotel Beachfront Nha Trang
Khu nghỉ dưỡng Crown Retreat Quy Nhơn
Khu nghỉ dưỡng Holiday Inn Hồ Tràm
Khu nghỉ dưỡng Melia Đà Nẵng Beach
Khu nghỉ dưỡng Seava Hồ Tràm
Khu nghỉ dưỡng The Anam Nha Trang
break
Live view ended...
