In [2]:
import warnings
warnings.filterwarnings("ignore")
import ast
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Conv1D, MaxPooling1D, Flatten, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.metrics import accuracy_score, f1_score, classification_report, recall_score, precision_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf
import tensorflow_decision_forests as tfdf

from gensim.models import FastText

ModuleNotFoundError: No module named 'keras.wrappers'

In [2]:
def get_document_vector1(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

## Khởi tạo các mô hình

In [3]:
def lstm_model(lstm_units=32, dropout_rate=0.2, vocab_size=10000, embedding_dim=50, embedding_matrix=None, max_length=50):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim,
                        weights=[embedding_matrix], 
                        input_length=max_length, trainable=False))
    model.add(LSTM(lstm_units, return_sequences=False))
    model.add(Flatten())
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [4]:
def bilstm_model(lstm_units=32, dropout_rate=0.2, vocab_size=10000, embedding_dim=50, embedding_matrix=None, max_length=50):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim,
                        weights=[embedding_matrix], 
                        input_length=max_length, trainable=False))
    
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Flatten())
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [5]:
def cnn_lstm_model(lstm_units=32, conv_filters=32, kernel_size=3, dropout_rate=0.2, vocab_size=10000, embedding_dim=50, embedding_matrix=None, max_length=50):
    model = Sequential()
    
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim,
                        weights=[embedding_matrix], 
                        input_length=max_length, trainable=False))
    
    model.add(Conv1D(filters=conv_filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(lstm_units))
    model.add(Flatten())
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [6]:
def ann_model(dense_units=32, dropout_rate=0.2, vocab_size=10000, embedding_dim=50, embedding_matrix=None, max_length=50):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim,
                        weights=[embedding_matrix], 
                        input_length=max_length, trainable=False))
    
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

## Build mô hình và in kết quả

In [7]:
def buildDL_fast(X_train, X_test, y_train, y_test, param_grids):
    
    model_fast = FastText(sentences=X_train, vector_size=50, window=8, min_count=1, workers=4)

    #X_train = np.array([get_document_vector1(doc, model_fast) for doc in X_train])
    #X_test = np.array([get_document_vector1(doc, model_fast) for doc in X_test])
    
    #X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    #X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    max_length=50
    vocab_size = len(tokenizer.word_index) + 1

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

    
    embedding_dim = model_fast.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    

    for word, i in tokenizer.word_index.items():
        if word in model_fast.wv:
            embedding_matrix[i] = model_fast.wv[word]
        else:
            embedding_matrix[i] = np.zeros(embedding_dim)

    models = {
        'LSTM': KerasClassifier(build_fn=lstm_model, vocab_size=vocab_size, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix, max_length=max_length),
        'BiLSTM': KerasClassifier(build_fn=bilstm_model, vocab_size=vocab_size, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix, max_length=max_length),
        'CNN-LSTM': KerasClassifier(build_fn=cnn_lstm_model, vocab_size=vocab_size, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix, max_length=max_length),
        'ANN': KerasClassifier(build_fn=ann_model, vocab_size=vocab_size, embedding_dim=embedding_dim, embedding_matrix=embedding_matrix, max_length=max_length)
    }


    res = []
    for key in models.keys():
        model = models[key]
        
        param_grid = param_grids[key]

        tuner = tfdf.tuner.RandomSearch(
            model,
            param_grid,
            max_trials=8,
            objective='accuracy',
            directory='tuner_logs',
            project_name=key
        )

        tuner.search(X_train_pad, y_train, epochs=10, validation_data=(X_test_pad, y_test))

        best_model = tuner.get_best_models()[0]
        y_pred = best_model.predict(X_test_pad)
        y_pred = (y_pred > 0.5).astype(int).flatten()
        
        result = [key, accuracy_score(y_test, y_pred),
                  precision_score(y_test, y_pred),
                  recall_score(y_test, y_pred),
                  f1_score(y_test, y_pred)]
        res.append(result)

    return pd.DataFrame(res, columns=["Model", "Accuracy", "Precision", "Recall", "F1 score"])
    

In [8]:
def sentimentML_evaluate(data_train, data_test, param_grids):
    origin_train_data = data_train[data_train["origin"] == 0]
    augmented_train_data = data_train[data_train["origin"] == 1]

    origin_test_data = data_test[data_test["origin"] == 0]
    augmented_test_data = data_test[data_test["origin"] == 1]
    
    X_train_origin = origin_train_data["Words"]
    X_test_origin = origin_test_data["Words"]

    X_train_augmented = augmented_train_data["Words"]
    X_test_augmented = augmented_test_data["Words"]
    
    le = LabelEncoder()
    y_train_origin = pd.DataFrame(le.fit_transform(origin_train_data.iloc[:, -1]))
    y_test_origin = pd.DataFrame(le.transform(origin_test_data.iloc[:, -1]))

    y_train_augmented = pd.DataFrame(le.transform(augmented_train_data.iloc[:, -1]))
    y_test_augmented = pd.DataFrame(le.transform(augmented_test_data.iloc[:, -1]))

    print("Basic origin - origin")
    res = buildDL_fast(X_train_origin, X_test_origin, y_train_origin, y_test_origin, param_grids)
    print(res)

    print("Augmented - origin")
    res = buildDL_fast(pd.concat((X_train_origin, X_train_augmented), axis=0), X_test_origin, 
                      pd.concat((y_train_origin, y_train_augmented), axis=0), y_test_origin, param_grids)
    print(res)

    print("Augmented - augmented")
    res = buildDL_fast(pd.concat((X_train_origin, X_train_augmented), axis=0), pd.concat((X_test_origin, X_test_augmented), axis=0), 
                      pd.concat((y_train_origin, y_train_augmented), axis=0), pd.concat((y_test_origin, y_test_augmented), axis=0), param_grids)
    print(res)

In [9]:
near_train = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Near_processed_train.csv")[["Words", "origin", "Near"]]
near_test = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Near_processed_test.csv")[["Words", "origin", "Near"]]

mid_train = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Mid_processed_train.csv")[["Words", "origin", "Mid"]]
mid_test = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Mid_processed_test.csv")[["Words", "origin", "Mid"]]

far_train = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Far_processed_train.csv")[["Words", "origin", "Far"]]
far_test = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Far_processed_test.csv")[["Words", "origin", "Far"]]

potential_train = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Potential_processed_train.csv")[["Words", "origin", "Potential"]]
potential_test = pd.read_csv(f"..\\..\\data\\cleaned\\Title\\Potential_processed_test.csv")[["Words", "origin", "Potential"]]

In [10]:

param_grids = {
    'LSTM': {
        'lstm_units': [32, 64, 128],
        'batch_size': [32, 64, 128],
        'epochs': [10, 30, 50]
    },

    'CNN-LSTM': {
        'lstm_units': [24, 48, 64],
        'conv_filters': [32, 64, 128],
        'kernel_size': [1, 3, 5],
        'batch_size': [32, 64, 128],
        'epochs': [10, 30, 50]
    },
    'BiLSTM': {
        'lstm_units': [32, 64, 128],
        'batch_size': [32, 64, 128],
        'epochs': [10, 30, 50]
    },
    'ANN': {
        'dense_units': [32, 64, 128],
        'batch_size': [32, 64, 128],
        'epochs': [10, 30, 50]
    }
}


In [11]:
sentimentML_evaluate(mid_train, mid_test, param_grids)

Basic origin - origin


KeyboardInterrupt: 

In [None]:

sentimentML_evaluate(near_train, near_test, param_grids)

In [None]:

sentimentML_evaluate(far_train, far_test, param_grids)

In [None]:

sentimentML_evaluate(potential_train, potential_test, param_grids)