In [79]:
import warnings
warnings.filterwarnings("ignore")
import ast
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.metrics import accuracy_score, f1_score, classification_report, recall_score, precision_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder


from gensim.models import FastText

In [80]:
options = ["Near", "Mid", "Far", "Potential"]

near_train = pd.read_csv(f"../../data/cleaned/Title/Near_processed_train.csv")[["Words", "origin", "Near"]]
near_test = pd.read_csv(f"../../data/cleaned/Title/Near_processed_test.csv")[["Words", "origin", "Near"]]

mid_train = pd.read_csv(f"../../data/cleaned/Title/Mid_processed_train.csv")[["Words", "origin", "Mid"]]
mid_test = pd.read_csv(f"../../data/cleaned/Title/Mid_processed_test.csv")[["Words", "origin", "Mid"]]

far_train = pd.read_csv(f"../../data/cleaned/Title/Far_processed_train.csv")[["Words", "origin", "Far"]]
far_test = pd.read_csv(f"../../data/cleaned/Title/Far_processed_test.csv")[["Words", "origin", "Far"]]

potential_train = pd.read_csv(f"../../data/cleaned/Title/Potential_processed_train.csv")[["Words", "origin", "Potential"]]
potential_test = pd.read_csv(f"../../data/cleaned/Title/Potential_processed_test.csv")[["Words", "origin", "Potential"]]

In [89]:
near_test.isna().any()

Words     False
origin    False
Near      False
dtype: bool

In [81]:
def get_document_vector1(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

### Khai báo và các tham số tối ưu mô hình học máy

In [82]:
models = [
    SVC(random_state=42),
    LogisticRegression(solver='saga', random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    KNeighborsClassifier(),
    GaussianNB()
]

param_grids = [
    {'C': [0.1, 1, 10, 100, 1000],  #SVC
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
     'kernel': ['rbf', 'sigmoid']},
    {"penalty":["l1","l2"]}, #LogisticRegression
    {'max_features':[1,3,5,7], #RandomForestClassifier
     'min_samples_leaf':[1,2,3],},
    {'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0]}, #AdaBoostClassifier
    {'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0]}, #GradientBoostingClassifier
    {'n_neighbors': [2,3,5,7]}, #KNeighborsClassifier
    {'var_smoothing': [0.00000001, 0.000000001, 0.00000001]} #GaussianNB
] 

In [83]:
def w2v_buildML(X_train, X_test, y_train, y_test):

    model_fast = FastText(sentences=X_train, vector_size=50, window=5, min_count=1, workers=4)

    X_train = np.array([get_document_vector1(doc, model_fast) for doc in X_train])
    X_test = np.array([get_document_vector1(doc, model_fast) for doc in X_test])

    res = []
    for i in range(len(models)):
        grid = GridSearchCV(models[i], param_grids[i], refit = True, verbose = 0) 
        grid.fit(X_train, y_train) 
        model = grid.best_estimator_
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        result = [f"{type(model).__name__}", accuracy_score(y_test, y_pred),
                precision_score(y_test, y_pred),
                recall_score(y_test, y_pred),
                f1_score(y_test, y_pred)]
        res.append(result)
    return pd.DataFrame(res, columns = ["Model", "Accuracy", "Precision", "Recal", "F1 score"])

In [123]:
def sentimentML_evaluate(data_train, data_test, models, param_grids):
    origin_train_data = data_train[data_train["origin"] == 0]
    augmented_train_data = data_train[data_train["origin"] == 1]

    origin_test_data = data_test[data_test["origin"] == 0]
    augmented_test_data = data_test[data_test["origin"] == 1]
    
    X_train_origin = origin_train_data["Words"]
    X_test_origin = origin_test_data["Words"]

    X_train_augmented = augmented_train_data["Words"]
    X_test_augmented = augmented_test_data["Words"]
    
    le = LabelEncoder()
    y_train_origin = pd.DataFrame(le.fit_transform(origin_train_data.iloc[:, -1]))
    y_test_origin = pd.DataFrame(le.transform(origin_test_data.iloc[:, -1]))

    y_train_augmented = pd.DataFrame(le.transform(augmented_train_data.iloc[:, -1]))
    y_test_augmented = pd.DataFrame(le.transform(augmented_test_data.iloc[:, -1]))

    print("Basic origin - origin")
    res = w2v_buildML(X_train_origin, X_test_origin, y_train_origin, y_test_origin)
    print(res)

    print("Augmented - origin")
    res = w2v_buildML(pd.concat((X_train_origin, X_train_augmented), axis=0), X_test_origin, 
                      pd.concat((y_train_origin, y_train_augmented), axis=0), y_test_origin)
    print(res)

    print("Augmented - augmented")
    res = w2v_buildML(pd.concat((X_train_origin, X_train_augmented), axis=0), pd.concat((X_test_origin, X_test_augmented), axis=0), 
                      pd.concat((y_train_origin, y_train_augmented), axis=0), pd.concat((y_test_origin, y_test_augmented), axis=0))
    print(res)

In [124]:
sentimentML_evaluate(near_train, near_test, models, param_grids)

Basic origin - origin
                        Model  Accuracy  Precision   Recal  F1 score
0                         SVC  0.559140   0.000000  0.0000  0.000000
1          LogisticRegression  0.569892   0.000000  0.0000  0.000000
2      RandomForestClassifier  0.451613   0.365854  0.3750  0.370370
3          AdaBoostClassifier  0.424731   0.416149  0.8375  0.556017
4  GradientBoostingClassifier  0.510753   0.442105  0.5250  0.480000
5        KNeighborsClassifier  0.516129   0.391304  0.2250  0.285714
6                  GaussianNB  0.500000   0.417722  0.4125  0.415094
Augmented - origin
                        Model  Accuracy  Precision   Recal  F1 score
0                         SVC  0.510753   0.435294  0.4625  0.448485
1          LogisticRegression  0.559140   0.468750  0.1875  0.267857
2      RandomForestClassifier  0.516129   0.424242  0.3500  0.383562
3          AdaBoostClassifier  0.494624   0.400000  0.3500  0.373333
4  GradientBoostingClassifier  0.462366   0.321429  0.2250  0.