In [133]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier

In [134]:
import gensim
from gensim.models import KeyedVectors

# Location of the word2vec embedding file, relative to the notebook's working directory.
# NOTE(review): the "150" in the file name presumably is the vector dimensionality — TODO confirm.
path = 'word2vec/W2V_150_cleaned.txt'
# binary=False: the file is parsed as the text variant of the word2vec format.
# `model` is the module-level embedding lookup used by the cells below.
model = KeyedVectors.load_word2vec_format(path, binary=False)

In [135]:
def get_word_vector(model, word):
    """Look up the embedding of ``word`` in ``model``.

    Args:
        model: Object supporting ``model[word]`` lookups and exposing a
            ``vector_size`` attribute.
        word: Key to look up.

    Returns:
        ``model[word]`` when the lookup succeeds; otherwise a zero vector of
        length ``model.vector_size`` (created with ``np.zeros``, i.e. NumPy's
        default float dtype).
    """
    try:
        vector = model[word]
    except KeyError:
        # Word is not in the model's vocabulary: fall back to an all-zero vector.
        vector = np.zeros(model.vector_size)
    return vector

In [136]:
# Synonym pairs: positive training examples (label 1).
# The file is read without a header, one space-separated word pair per line.
syn = (
    pd.read_csv("antonym-synonym set/Synonym_vietnamese.txt", sep=' ', header=None)
    .assign(label=1)
)
syn.columns = ['Word1', 'Word2', 'label']
syn = syn.dropna()

# Replace each word by its embedding; out-of-vocabulary words become zero vectors
# (see get_word_vector).
syn = syn.assign(
    Word1=syn['Word1'].apply(lambda token: get_word_vector(model, token)),
    Word2=syn['Word2'].apply(lambda token: get_word_vector(model, token)),
)

# Feature vector of a pair = first word's embedding followed by the second word's.
syn['pair'] = syn.apply(
    lambda pair_row: np.concatenate((pair_row['Word1'], pair_row['Word2'])),
    axis=1,
)
syn = syn.drop(columns=['Word1', 'Word2'])
syn

Unnamed: 0,label,pair
0,1,"[-1.0968530178070068, 0.123888298869133, 1.713..."
1,1,"[-1.096853, 0.1238883, 1.713642, 2.17519, -0.5..."
2,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,"[1.133512020111084, 0.46207061409950256, -0.55..."
...,...,...
11557,1,"[1.175062, 0.5835593, 0.9456611, 1.035395, -0...."
11558,1,"[1.931044, -0.8083969, 1.601383, 1.352047, 0.3..."
11559,1,"[2.697736, -1.991829, -1.202439, 2.340847, 1.5..."
11560,1,"[0.1775007, 0.08910807, -0.5129477, -0.9176681..."


In [137]:
# Antonym pairs: negative training examples (label 0).
# Same layout as the synonym file: no header, one space-separated pair per line.
ant = (
    pd.read_csv("antonym-synonym set/Antonym_vietnamese.txt", sep=' ', header=None)
    .assign(label=0)
)
ant.columns = ['Word1', 'Word2', 'label']
ant = ant.dropna()

# Swap both word columns for their embeddings (zero vector when the word is unknown).
ant = ant.assign(
    Word1=ant['Word1'].apply(lambda token: get_word_vector(model, token)),
    Word2=ant['Word2'].apply(lambda token: get_word_vector(model, token)),
)

# Concatenate the two embeddings into a single feature vector per pair.
ant['pair'] = ant.apply(
    lambda pair_row: np.concatenate((pair_row['Word1'], pair_row['Word2'])),
    axis=1,
)
ant = ant.drop(columns=['Word1', 'Word2'])
ant

Unnamed: 0,label,pair
0,0,"[0.2963356, 2.380905, -0.5782873, 2.021804, 2...."
1,0,"[0.2963356, 2.380905, -0.5782873, 2.021804, 2...."
2,0,"[0.2076982, 1.923723, 2.402213, 0.7915516, 1.6..."
3,0,"[0.1616077, 0.9954004, -0.2775585, 1.630965, 2..."
4,0,"[0.1616077, 0.9954004, -0.2775585, 1.630965, 2..."
...,...,...
1995,0,"[-0.1843596, 1.387974, -0.3645253, 1.54733, 3...."
1996,0,"[0.3057049, 3.127424, -1.285929, 0.6471007, -0..."
1997,0,"[0.3057049, 3.127424, -1.285929, 0.6471007, -0..."
1998,0,"[0.3057049, 3.127424, -1.285929, 0.6471007, -0..."


In [138]:
# Training set = synonym rows followed by antonym rows, re-indexed from 0.
data_train = pd.concat((syn, ant), ignore_index=True)
# Features are the concatenated pair vectors; targets are the 0/1 labels.
X_train, Y_train = data_train['pair'], data_train['label']

In [143]:
# Turn the column of per-pair vectors into one numeric array (one row per pair),
# which is the input form passed to the classifiers' fit() below.
X_train = np.array(X_train.tolist())

In [145]:
def load_data(path):
    """Load a tab-separated word-pair test file as classifier inputs.

    The file is read with its first line as a header and must have exactly
    three columns, which are renamed to ``Word1``, ``Word2`` and ``label``.
    Words are embedded with ``get_word_vector`` using the module-level
    ``model``.

    Args:
        path: Path of the tab-separated test file.

    Returns:
        A tuple ``(X_test, Y_test)``:
        ``X_test`` is an array with one row per pair, holding the embedding of
        the first word followed by the embedding of the second word;
        ``Y_test`` is a Series of labels where ``'ANT'`` is encoded as 0 and
        ``'SYN'`` as 1. Any other label value is passed through unchanged.

    Raises:
        ValueError: If the file does not have exactly three columns (raised
            by pandas when the column names are assigned).
    """
    label_codes = {'ANT': 0, 'SYN': 1}

    test = pd.read_csv(path, sep='\t')
    test.columns = ['Word1', 'Word2', 'label']

    # Element-wise map instead of chained Series.replace(): replace() relied on
    # pandas silently downcasting the object column to integers, which is
    # deprecated and leaves object-dtype labels that sklearn metrics reject.
    # map() re-infers the dtype, so fully mapped labels come out as integers.
    Y_test = test['label'].map(lambda tag: label_codes.get(tag, tag))

    # Build each feature row directly from the two word columns rather than via
    # row-wise DataFrame.apply and temporary embedding columns.
    pair_vectors = [
        np.concatenate((get_word_vector(model, first), get_word_vector(model, second)))
        for first, second in zip(test['Word1'], test['Word2'])
    ]
    X_test = np.array(pair_vectors)
    return X_test, Y_test
    

In [140]:
from sklearn.metrics import precision_score, recall_score, f1_score
def evaluate(y_true, y_pred, model_name):
    """Print precision, recall and F1 of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Text printed in front of the scores.

    Returns:
        None. The scores are only printed, on a single line.
    """
    # The scorers are called with their defaults, in the order they are reported.
    scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = [scorer(y_true, y_pred) for scorer in scorers]
    print(f"{model_name} - Precision: {precision}, Recall: {recall}, F1-Score: {f1}")

In [149]:
# Fixed seed for the MLP (random weight initialisation and batch shuffling), so
# that re-running the notebook reproduces the same fitted model and scores.
RANDOM_STATE = 42

# Linear baseline on the concatenated pair vectors.
# NOTE(review): the training frames show about 11.5k synonym rows versus about
# 2k antonym rows; consider whether class weighting is wanted — TODO confirm.
clf_logistic = LogisticRegression(max_iter=1000)
clf_logistic.fit(X_train, Y_train)

# Two-hidden-layer perceptron (128 and 64 units, ReLU) on the same features.
clf_mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=500,
    activation='relu',
    random_state=RANDOM_STATE,
)
clf_mlp.fit(X_train, Y_train)

In [150]:
# Evaluate both classifiers on the ViCon noun pairs.
# NOTE(review): the recorded MLP scores on the ViCon sets are near-perfect while
# logistic regression shows high recall but low precision. Check whether the
# test pairs also occur in the training files, and whether the synonym-heavy
# training set biases the linear model towards label 1 — TODO confirm.
X_test, Y_test = load_data("datasets/ViCon-400/400_noun_pairs.txt")

# Predict with both models first, then report in the usual order.
y_pred_logistic = clf_logistic.predict(X_test)
y_pred_mlp = clf_mlp.predict(X_test)

evaluate(Y_test, y_pred_logistic, 'Logistic Regression')
evaluate(Y_test, y_pred_mlp, 'MLP')

Logistic Regression - Precision: 0.569364161849711, Recall: 0.985, F1-Score: 0.7216117216117216
MLP - Precision: 0.970873786407767, Recall: 1.0, F1-Score: 0.9852216748768473


In [153]:
# Evaluate both classifiers on the ViCon verb pairs.
X_test, Y_test = load_data("datasets/ViCon-400/400_verb_pairs.txt")

# Predict with both models first, then report in the usual order.
y_pred_logistic = clf_logistic.predict(X_test)
y_pred_mlp = clf_mlp.predict(X_test)

evaluate(Y_test, y_pred_logistic, 'Logistic Regression')
evaluate(Y_test, y_pred_mlp, 'MLP')

Logistic Regression - Precision: 0.598159509202454, Recall: 0.975, F1-Score: 0.7414448669201521
MLP - Precision: 0.9852216748768473, Recall: 1.0, F1-Score: 0.9925558312655087


In [152]:
# Evaluate both classifiers on the ViCon adjective pairs.
X_test, Y_test = load_data("datasets/ViCon-400/600_adj_pairs.txt")

# Predict with both models first, then report in the usual order.
y_pred_logistic = clf_logistic.predict(X_test)
y_pred_mlp = clf_mlp.predict(X_test)

evaluate(Y_test, y_pred_logistic, 'Logistic Regression')
evaluate(Y_test, y_pred_mlp, 'MLP')

Logistic Regression - Precision: 0.6927083333333334, Recall: 0.8866666666666667, F1-Score: 0.7777777777777778
MLP - Precision: 1.0, Recall: 1.0, F1-Score: 1.0
