In [1]:
import ast
import logging
from time import time

import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

import utils as lu

In [None]:
def FP_without_pua(y_pred, y_test):
    """Return the fraction of predictions that are class 2 while the true label is 0.

    The count of (predicted == 2, actual == 0) pairs is divided by the total
    number of predictions, not by the number of actual-0 samples.
    NOTE(review): labels look like 0 = clean, 1 = PUA, 2 = malicious -- confirm.

    Parameters
    ----------
    y_pred : sequence
        Predicted labels; must support ``len()`` and integer indexing.
    y_test : sequence
        True labels, indexed with the same integer positions as ``y_pred``.

    Returns
    -------
    float
        Matching pairs divided by ``len(y_pred)``; ``0.0`` when ``y_pred`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    total = len(y_pred)
    if total == 0:
        # No predictions: there is nothing to count and nothing to divide by.
        return 0.0
    hits = sum(1 for i in range(total) if y_pred[i] == 2 and y_test[i] == 0)
    return hits / total


def FP_with_pua(y_pred, y_test):
    """Return the fraction of predictions that are class 1 or 2 while the true label is 0.

    The count of (predicted in {1, 2}, actual == 0) pairs is divided by the
    total number of predictions, not by the number of actual-0 samples.
    NOTE(review): labels look like 0 = clean, 1 = PUA, 2 = malicious -- confirm.

    Parameters
    ----------
    y_pred : sequence
        Predicted labels; must support ``len()`` and integer indexing.
    y_test : sequence
        True labels, indexed with the same integer positions as ``y_pred``.

    Returns
    -------
    float
        Matching pairs divided by ``len(y_pred)``; ``0.0`` when ``y_pred`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    total = len(y_pred)
    if total == 0:
        # No predictions: there is nothing to count and nothing to divide by.
        return 0.0
    hits = sum(
        1
        for i in range(total)
        if (y_pred[i] == 1 or y_pred[i] == 2) and y_test[i] == 0
    )
    return hits / total

In [2]:
# Input files, given as relative paths (resolved against the kernel's working directory).
# Note that `embeddings` holds a file name, not the embedding vectors themselves.
all_labels_file = 'all_labels.txt'
embeddings = 'gexf_graphs_dims_300_epochs_30_lr_0.3_embeddings.txt'
# The helper returns two objects that the later cells split into train/test sets.
# Presumably X is the feature matrix and Y the label vector -- TODO confirm in utils.
X, Y = lu.X_Y_from_embeddings(all_labels_file, embeddings)
# Presumably reports the class balance of Y -- confirm against utils.test_proportions.
lu.test_proportions(Y)

In [17]:
def _fit_and_report(title, label, estimator, param_grid, out_prefix,
                    X_train, Y_train, X_test, Y_test):
    """Grid-search one estimator, log its test-set metrics and save its predictions.

    Parameters
    ----------
    title : str
        Text placed in the banner line that opens this model's log section.
    label : str
        Model name used in the accuracy log line.
    estimator : scikit-learn estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : list of dict
        Hyper-parameter grid(s) to search.
    out_prefix : str
        Prefix of the two text files written by this call:
        ``<out_prefix>_pred.txt`` and ``<out_prefix>_test.txt``.
    X_train, Y_train, X_test, Y_test
        The train/test split to fit on and evaluate against.

    Returns
    -------
    tuple
        ``(search, y_pred, acc)``: the fitted ``GridSearchCV`` object, its
        predictions for ``X_test`` and the resulting accuracy score.
    """
    logging.info('===================== {} ====================='.format(title))

    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', verbose=1)
    start = time()
    search.fit(X_train, Y_train)
    end = time()

    logging.info('fit time is %.1f min' % ((end - start) / 60))
    # The %s placeholder is required: without it the logging module cannot merge
    # the argument into the message and the parameters never reach the log.
    logging.info('best classifier model\'s hyperparamters: %s', search.best_params_)

    y_pred = search.predict(X_test)
    acc = accuracy_score(Y_test, y_pred)

    logging.info('{} accuracy: {}'.format(label, acc))
    logging.info(classification_report(Y_test, y_pred))
    # FP_without_pua / FP_with_pua are looked up in the notebook namespace at call time.
    logging.info('FPR without pua: {}'.format(FP_without_pua(y_pred, Y_test)))
    logging.info('FPR with pua: {}'.format(FP_with_pua(y_pred, Y_test)))
    np.savetxt('{}_pred.txt'.format(out_prefix), y_pred)
    np.savetxt('{}_test.txt'.format(out_prefix), Y_test)
    logging.info('======================================================')
    return search, y_pred, acc


# Drop handlers left over from earlier cells so basicConfig below takes effect
# (basicConfig does nothing when the root logger already has handlers).
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
# NOTE(review): machine-specific absolute path; the directory must already exist.
# Both the SVM and the k-NN sections are written to this one file.
log_path = '/home/pogorelov/work/classification_logs_and_results/svm_lin_not_equal_dim300.log'
logging.basicConfig(filename=log_path, filemode="w", level=logging.INFO)


# Fixed random_state keeps the 75/25 split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=12345)
logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

# ---- SVM -------------------------------------------------------------------
# One grid per kernel so that kernel-specific parameters are only combined with
# the kernel that uses them. The polynomial kernel is spelled 'poly' in SVC.
params = [
    {'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10]},
    {'kernel': ['rbf'], 'gamma': [1e-4, 1e-3, 1e-2], 'C': [0.01, 0.1, 1, 10]},
    {'kernel': ['poly'], 'degree': [2, 3], 'coef0': [0, 1], 'C': [0.01, 0.1, 1, 10]},
]
# Output prefix keeps the historical "svm_lin" name so existing result files match.
classifier, Y_pred, acc = _fit_and_report(
    'SVM [linear, rbf, poly]', 'SVM', SVC(), params,
    'svm_lin_not_equal_dim300',
    X_train, Y_train, X_test, Y_test,
)

# ---- k-NN ------------------------------------------------------------------
params = [{'n_neighbors': [5, 7, 10]}]
# After this call the notebook-level classifier / Y_pred / acc refer to the k-NN run.
classifier, Y_pred, acc = _fit_and_report(
    'k-NN', 'k-NN', KNeighborsClassifier(), params,
    'kNN_not_equal_dim300',
    X_train, Y_train, X_test, Y_test,
)
