In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from data_helper import *
from pyvi import ViTokenizer, ViPosTagger
from gensim.models.keyedvectors import KeyedVectors

train_data=pd.read_csv('./data/train.csv').dropna()
test_data=pd.read_csv('./data/test_trung_predict.csv')



In [2]:
authors=train_data.label.unique()
dic={}
for i,label in enumerate(authors):
    dic[label]=i
labels=train_data.label.apply(lambda x:dic[x])

val_data=train_data.sample(frac=0.2,random_state=200)
train_data=train_data.drop(val_data.index)

In [3]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import os
from gensim.models.keyedvectors import KeyedVectors

In [4]:
emoj_vector = KeyedVectors.load_word2vec_format('./data/emoji2vec.bin', binary=True)
word_vectors = KeyedVectors.load_word2vec_format('./data/baomoi.window2.vn.model.bin', binary=True)


In [5]:
word_dict = dict({})

for idx, key in enumerate(word_vectors.wv.vocab):
    word_dict[key] = word_vectors.wv[key]

emo_dict = dict({})

for idx, key in enumerate(emoj_vector.wv.vocab):
    emo_dict[key] = emoj_vector.wv[key]

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  if __name__ == '__main__':


In [6]:
def tokenize(target):
    return ViTokenizer.tokenize(clean_str(target)).split(' ')

In [7]:
EMBEDDING_DIM=300
vocabulary_size=len(word_dict)+len(emo_dict)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
tokenzier_custom={}
i=-1
for word in word_dict.keys():
    i=i+1
    tokenzier_custom[word]=i
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

for word in emo_dict.keys():
    i=i+1
    tokenzier_custom[word]=i
    try:
        embedding_vector = emoj_vector[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)
embedding_matrix[0]=np.zeros(EMBEDDING_DIM)

In [16]:
def vectorize(doc: str) -> np.ndarray:
        doc = doc.lower()
        words = [w for w in doc.split(" ")]
        word_vecs = []
        for word in words:
            try:
                vec = word_vectors[word]
                word_vecs.append(vec)
                print(word)
            except KeyError:
                pass
        vector = np.mean(word_vecs, axis=0)
        return vector

In [21]:
def buildVector(seq_word,tokenzier_custom):
    seq_vec = np.zeros(200,dtype=np.int64)
  # i=-1
    for i,word in enumerate(seq_word):
        if i>=200:
            break;
        if word in tokenzier_custom:
            seq_vec[i]=tokenzier_custom[word] 
        else:
            i=i-1
    return seq_vec 

In [22]:
X_train=[]
for i,word in enumerate(train_data.comment):
    X_train.append(buildVector(tokenize(word),tokenzier_custom))
X_train=np.array(X_train)

X_val=[]
for i,word in enumerate(val_data.comment):
    X_val.append(buildVector(tokenize(word),tokenzier_custom))
X_val=np.array(X_val)

In [23]:
from keras.utils import to_categorical

# X_train = pad_sequences(sequences_train)
# X_val = pad_sequences(sequences_valid,maxlen=X_train.shape[1])
y_train = to_categorical(np.asarray(labels[train_data.index]))
y_val = to_categorical(np.asarray(labels[val_data.index]))
print('Shape of X train and X validation tensor:', X_train.shape,X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

Shape of X train and X validation tensor: (12870, 200) (3217, 200)
Shape of label train and validation tensor: (12870, 2) (3217, 2)


Using TensorFlow backend.


In [24]:
from tensorflow.keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [25]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from tensorflow.keras.layers import Reshape, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
sequence_length = X_train.shape[1]
filter_sizes = [1,2,3]
num_filters = 200
drop = 0.5



inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

conv_0 = Conv2D(num_filters, (filter_sizes[0], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_1 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
reshape = Reshape((3*num_filters,))(flatten)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# this creates a model that includes
model = Model(inputs, output)


In [26]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 300)     132215100   input_1[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 200, 300, 1)  0           embedding[0][0]                  
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 200, 1, 200)  60200       reshape[0][0]                    
______________________________________________________________________________________________

In [29]:
import pickle
# saving
with open('./data/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenzier_custom, handle, protocol=pickle.HIGHEST_PROTOCOL)