In [None]:
# Mount Google Drive at /content/gdrive; later cells read their CSV files from under this path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

import string

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import CSVLogger

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
import nltk

import re
import multiprocessing

from sklearn.model_selection import train_test_split
nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the labelled training reviews and the unlabelled test reviews from Drive.
train_dataset = pd.read_csv('/content/gdrive/MyDrive/MLSentiment/full_train.csv')
test_dataset = pd.read_csv('/content/gdrive/MyDrive/MLSentiment/test.csv')
# Preview the first rows of the training frame.
train_dataset.head()

Unnamed: 0.1,Unnamed: 0,RevId,UserId,Comment,image_urls,Rating
0,0,3839333,10106093.0,"Xôi dẻo, đồ ăn đậm vị. Hộp xôi được lót lá trô...",['https://images.foody.vn/res/g97/966781/s800/...,1.0
1,1,2824877,786914.0,Gọi ship 1 xuất cari gà bánh naan và 3 miếng g...,['https://images.foody.vn/res/g69/688413/s800/...,0.0
2,2,9816702,22467889.0,"Thời tiết lạnh như này, cả nhà rủ nhau đến leg...",['https://images.foody.vn/res/g72/715078/s800/...,1.0
3,3,2684585,1889449.0,Em có đọc review thấy mng bảo trà sữa nướng đề...,['https://images.foody.vn/res/g90/895545/s800/...,0.0
4,4,2737987,8839942.0,"Đồ ăn rất ngon, nhà hàng cũng rất đẹp, tất cả ...",['https://images.foody.vn/res/g4/30186/s800/fo...,1.0


In [None]:
# Show column dtypes and non-null counts of the test frame (requires the loading cell to have run).
test_dataset.info()

NameError: ignored

In [None]:
# Treat the literal string 'nan' as a missing value, then drop every row
# that still has a missing value in any column.
train_dataset = train_dataset.replace('nan', np.nan).dropna()

In [None]:
# (rows, columns) remaining after the missing-value cleanup.
train_dataset.shape

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
train_dataset.describe(include='all')

In [None]:
# Plot how many reviews fall into each Rating value; the trailing ';' hides the Axes repr.
sns.countplot(x=train_dataset['Rating']);

In [None]:
# Remove the identifier and image-URL columns; they are not used further down.
train_dataset = train_dataset.drop(columns=['UserId', 'RevId', 'image_urls'])

In [None]:
# Preview the frame after the column drop.
train_dataset.head()

In [None]:
# Model input: the raw review text (a pandas Series).
data = train_dataset['Comment']
# Model target: the Rating column copied into a NumPy array.
labels = np.array(train_dataset['Rating'])

In [None]:
# Start from NLTK's English stop words, then append the project's own list.
stop = stopwords.words('english')
# Read the extra stop words directly as text instead of parsing the file as a
# one-row-at-a-time CSV: this avoids per-row parser overhead and keeps pandas
# from turning entries such as 'nan' or numeric-looking words into non-strings.
# Assumes one entry per line — TODO confirm against the file.
# 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM if present.
with open('/content/gdrive/MyDrive/MLSentiment/stopwords.txt', encoding='utf-8-sig') as stopword_file:
    add_stopwords = [entry.strip() for entry in stopword_file if entry.strip()]
stop.extend(add_stopwords)
# The ASCII punctuation characters as a list of single-character strings.
punctuation = list(string.punctuation)

def split_into_words(text):
    """Split a text into a list of whitespace-separated tokens.

    Any run of whitespace (spaces, tabs, newlines) acts as one separator,
    so an empty or all-whitespace text yields an empty list.

    The function has no side effects: it does not print the text, because
    it is applied to every row of the dataset.
    """
    return text.split()

def to_lower_case(words):
    """Return a new list holding the lower-cased form of each word."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Delete every ASCII punctuation character from each word.

    Returns a new list of the same length; a word made up only of
    punctuation becomes the empty string.
    """
    # Character class that matches any single character in string.punctuation.
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned = []
    for token in words:
        cleaned.append(punct_pattern.sub('', token))
    return cleaned

def keep_alphabetic(words):
    """Keep only the words for which ``str.isalpha()`` is true.

    Empty strings and words containing digits or other non-letter
    characters are dropped.
    """
    alphabetic = []
    for token in words:
        if not token.isalpha():
            continue
        alphabetic.append(token)
    return alphabetic

def remove_stopwords(words, stop_words=None):
    """Return the words that are not stop words, in their original order.

    Args:
        words: iterable of word strings.
        stop_words: optional iterable of words to remove. When omitted,
            the module-level ``stop`` list is used.

    Returns:
        A new list with every stop word filtered out.
    """
    if stop_words is None:
        stop_words = stop
    # A set gives constant-time membership tests; scanning the stop-word
    # list once per word is slow when applied to a whole dataset.
    blocked = set(stop_words)
    return [w for w in words if w not in blocked]

def to_sentence(words):
    """Join the words back into one string, separated by single spaces."""
    sentence = ' '.join(words)
    return sentence

def review(words):
    """Tokenize a text string with NLTK's word tokenizer.

    Args:
        words: the text to tokenize (a single string, despite the name).

    Returns:
        The list of tokens returned by ``nltk.tokenize.word_tokenize``.

    NOTE(review): ``word_tokenize`` relies on NLTK tokenizer data that the
    visible cells of this notebook do not download (only "stopwords" is
    fetched) — confirm the resource is available before calling this.
    """
    # word_tokenize already returns the token list, so there is no separate
    # tokenizer object to call afterwards.
    return nltk.tokenize.word_tokenize(words, language='english', preserve_line=False)

def denoise_text(text):
    """Clean one comment and return it as a single space-joined string.

    The input is converted with ``str()`` and split into words, then the
    words are lower-cased, stripped of punctuation, filtered to alphabetic
    tokens only, and filtered against the stop-word list, in that order.
    """
    tokens = split_into_words(str(text))
    # Each step takes a list of words and returns a new list of words.
    cleaning_steps = (
        to_lower_case,
        remove_punctuation,
        keep_alphabetic,
        remove_stopwords,
    )
    for step in cleaning_steps:
        tokens = step(tokens)
    return to_sentence(tokens)

In [None]:
# Run the cleaning pipeline on every comment; `data` is overwritten with the cleaned text.
data = data.apply(denoise_text)

In [None]:
# Sanity check: show the first two comments before and after cleaning.
print('Before: {}'. format(list(train_dataset['Comment'][:2])))
print('---')
print('After: {}'. format(list(data[:2])))

In [None]:
# Hold out 20% of the cleaned comments for validation. The split is stratified
# on the labels and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.20, random_state=1, stratify = labels)

In [None]:
# Fit the word-index vocabulary on the training text only; '<OOV>' is the
# token used for words outside the vocabulary.
tokenizer = Tokenizer(num_words=10000, oov_token = '<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Number of entries in the word index plus one.
# NOTE(review): the +1 presumably accounts for index 0 (padding) — confirm.
VOCAB_SIZE = len(word_index)+1
VOCAB_SIZE

9702

In [None]:
# Fixed length that every token sequence is padded or truncated to.
maxlen = 50

In [None]:
# Turn each comment into a list of word indices, then pad with trailing zeros
# or cut off the tail so that every sequence has exactly `maxlen` entries.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded_sequences = pad_sequences(train_sequences,maxlen=maxlen,padding='post',truncating='post')
# The validation split goes through the same tokenizer and the same padding settings.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded_sequences = pad_sequences(test_sequences,maxlen=maxlen,padding='post',truncating='post')

In [None]:
# Check the length of the first padded training sequence.
len(train_padded_sequences[0])

50

In [None]:
# Width of the embedding vectors; also reused as the width of the first Dense layer.
embedding_dim = 64

# Assemble the classifier layer by layer: embedding -> bidirectional LSTM ->
# two regularised blocks (batch norm + dropout) around a Dense layer -> small
# Dense head ending in one sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE+1, embedding_dim, input_length=maxlen))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.30))
model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.30))
model.add(tf.keras.layers.Dense(8, activation='relu'))
# The sigmoid squashes the single output into the (0, 1) range.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 64)            620992    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8

In [None]:
# The model's last layer already applies a sigmoid, so its outputs are
# probabilities, not logits: the loss must be configured with from_logits=False.
model.compile(optimizer='adam',loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),metrics=['accuracy'])

In [None]:
# Train for 10 epochs, evaluating on the held-out split after each epoch;
# the returned object is kept in `history`.
history = model.fit(train_padded_sequences, y_train, validation_data = (test_padded_sequences, y_test), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Decode held-out sample 99 back to text as a spot check. Trailing padding
# entries are rendered as '<OOV>' in the decoded string (see the cell output).
tokenizer.sequences_to_texts(test_padded_sequences)[99]

'hôm tết nem khoai khoai dai rán mẻ cũ nem nhạt ngon <OOV> hiện sâu hix <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>'

In [None]:
# Model output for the same held-out sample (index 99).
model.predict(test_padded_sequences)[99]



array([0.59323364], dtype=float32)

In [None]:
# Keep the review ids so they can be attached to the predictions later.
rev_id_test = test_dataset["RevId"]
# Treat the literal string 'nan' as a missing value.
test_dataset = test_dataset.replace('nan', np.nan)
comment_test = test_dataset["Comment"]
# Missing comments become empty strings so that every test row still gets a prediction.
comment_test = comment_test.replace(np.nan, '')
print(comment_test)

0       Trà táo 35k\nCookie socola 38k \nNước ở đây bì...
1       Hôm rồi trung tâm mình tổ chức noel party ở đâ...
2       Thịt gà của quán là nhất đấy. Đi ăn gọi liền 4...
3       Hai đứa ăn xong đau bụng cả ngày\nChân gà ok n...
4       Mình vừa thử trưa nay. Điểm cộng đầu tiên là b...
                              ...                        
5098    Bún riêu ở đây ngon mà, lúc nào cũng đông, thế...
5099    Quán ngồi thoải mái và cưc thích ❤ menu đa dạn...
5100    Quá thất vọng, chất lượng sản phẩm ngày càng đ...
5101    Giao hàng nhanh,  miếng băm chả đều ngon ko bị...
5102    Nhìn ở bên ngoài trông quán khá bé nhưng vào t...
Name: Comment, Length: 5103, dtype: object


In [None]:
# Clean the test comments with the same pipeline used for the training text.
# Series.apply returns a new Series, so the result has to be assigned back;
# otherwise the raw, uncleaned comments would be tokenized.
comment_test = comment_test.apply(denoise_text)
test_t = tokenizer.texts_to_sequences(comment_test)
# Pad/truncate to `maxlen`, the same sequence length used for the training data.
test_v = pad_sequences(test_t, maxlen=maxlen, padding='post',truncating='post')

In [None]:
# Score every test comment and wrap the scores in a one-column frame named "Rating".
pred = model.predict(test_v)
pred = pd.DataFrame(pred,columns=["Rating"])

# Threshold the scores at 0.5 and store the result as the strings '0' / '1'.
pred = pred.apply(lambda x: ['0' if y < 0.5 else '1' for y in x])
print(pred)
# Place the review ids and the predicted labels side by side, matched on the row index.
sub = pd.concat([rev_id_test, pred],axis=1)

sub["RevId"] = sub["RevId"].astype(int)
# Replace any missing Rating with the string 'NaN'.
sub["Rating"] = sub["Rating"].replace(np.nan,'NaN',regex = True)
print(sub)
# Write the submission file locally, then copy it to the Drive project folder.
sub.to_csv(f"metrics.csv", index=False)
!cp metrics.csv "/content/gdrive/MyDrive/MLSentiment/"


     Rating
0         0
1         0
2         0
3         1
4         1
...     ...
5098      1
5099      1
5100      0
5101      1
5102      0

[5103 rows x 1 columns]
        RevId Rating
0      781115      0
1     1219481      0
2     1703765      0
3     4870346      1
4     2638711      1
...       ...    ...
5098  1025826      1
5099  1278470      1
5100  2565212      0
5101  3766155      1
5102  1070891      0

[5103 rows x 2 columns]
