# Libraries

In [1]:
import numpy as np
np.random.seed(42)

from Modules.SimilarityMeasures import vectorize_pair
import re
import pickle

from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

# Preprocess the VnPara dataset

In [2]:
def read_data_file(dir):
    text_set =[]
    with open(dir, 'r', encoding ='utf-8') as text:
        for line in text.readlines():
            text_set.append(line.strip())
    return text_set

In [3]:
def remove_special_chars(sentence, keep_under_score):
    sentence = sentence.lower()
    sentence = re.sub(r'(v\s\.\sv\s.)', '', sentence, re.UNICODE)
    if keep_under_score:
        sub_string = ''.join(ch for ch in sentence if (ch.isalnum() or ch == ' ' or ch == '_'))
    else:
        sub_string = ''.join(ch for ch in sentence if (ch.isalnum() or ch == ' '))
    sub_string_2 = re.sub('\s{2,}', ' ', sub_string, re.UNICODE)
    return sub_string_2

In [4]:
def create_list_pair(sentences1, sentences2, keep_under_score = True):
    number_sentences = len(sentences1)
    list_sentences = []
    for i in range(number_sentences):
        s1 = remove_special_chars(sentences1[i], keep_under_score)
        s2 = remove_special_chars(sentences2[i], keep_under_score)
        s = {'s1': s1, 's2': s2}
        list_sentences.append(s)
    return list_sentences

In [5]:
train_labels = read_data_file('Data//vnPara//Labels.txt')
for i in range (len(train_labels)):
    if train_labels[i] == '1,':
        train_labels[i] = 1
    else:
        train_labels[i] = int(train_labels[i])

In [6]:
train_data = []

list_sentence_1 = []
list_sentence_2 = []

with open("Data//vnPara//Sentences1.txt", encoding ='utf-8') as fin:
    for line in fin:
        list_sentence_1.append(line)

with open("Data//vnPara//Sentences2.txt", encoding ='utf-8') as fin:
    for line in fin:
        list_sentence_2.append(line)

pairs = create_list_pair(list_sentence_1, list_sentence_2, False)

In [7]:
num_samples = len(train_labels)
for i in range(num_samples):
    s1 = pairs[i]['s1'].split()
    s2 = pairs[i]['s2'].split()
    train_data.append(vectorize_pair(s1,s2))

X = np.array(train_data)
y = np.array(train_labels)

# K-fold evaluation

In [8]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

f1_list = []
acc_list = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    svm_clf= SVC(kernel = 'rbf', probability = True)
    svm_clf.fit(X_train, y_train)

    f1 = f1_score(y_test, svm_clf.predict(X_test))
    f1_list.append(f1)
    acc = accuracy_score(y_test, svm_clf.predict(X_test))
    acc_list.append(acc)
    print(acc,f1)

print("Average 5-fold accuracy:", sum(acc_list)/len(acc_list))
print("Average 5-fold f1:", sum(f1_list)/len(f1_list))

0.9643435980551054 0.9667673716012085
0.9854132901134521 0.9850746268656716
0.9773095623987034 0.9774193548387097
0.9707792207792207 0.9713375796178344
0.9691558441558441 0.9691056910569106
Average 5-fold accuracy: 0.9734003031004651
Average 5-fold f1: 0.9739409247960669


# Final model

Train the final model on the whole dataset.

In [9]:
svm_clf= SVC(kernel = 'rbf', probability = True)
svm_clf.fit(X, y)

print("Final model accuracy:", accuracy_score(y_test, svm_clf.predict(X_test)))
print("Final model f1:", f1_score(y, svm_clf.predict(X)))

Final model accuracy: 0.9691558441558441
Final model f1: 0.9737851662404091


Save the final model

In [10]:
filename = 'Models//SVM//svm_paraphrase_identification_model.sav'
pickle.dump(svm_clf, open(filename, 'wb'))