In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
path = 'CORD19_GraphOfDocs/datasets'

# Numbers embedded in the train/test CSV file names of dataset1..dataset9,
# listed in dataset order as (train file number, test file number).
_balanced_file_numbers = [
    (668, 840),
    (858, 1566),
    (1726, 2636),
    (3346, 7798),
    (5042, 12976),
    (5296, 16276),
    (6210, 25900),
    (8578, 34586),
    (13034, 49236),
]

# Each entry is a [train_csv_path, test_csv_path] pair.
datasets = [
    [
        f'{path}/dataset{number}/fully_balanced/train_balanced_{train_number}.csv',
        f'{path}/dataset{number}/fully_balanced/test_balanced_{test_number}.csv',
    ]
    for number, (train_number, test_number) in enumerate(_balanced_file_numbers, 1)
]

# The four feature columns that the first two combinations have in common.
_shared_feature_columns = ['adamic_adar', 'common_neighbors', 'preferential_attachment', 'total_neighbors']

# Column subsets evaluated for every classifier, in a fixed order.
features_combinations = [
    list(_shared_feature_columns),
    [*_shared_feature_columns, 'similarity'],
    ['adamic_adar', 'similarity'],
    ['adamic_adar'],
]

In [3]:
def neural_network():
    """Return a new, unfitted MLP classifier.

    Configuration: two hidden layers (100 and 50 units), the 'adam' solver,
    and a fixed seed of 0.
    """
    hidden_layers = (100, 50)
    return MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        solver='adam',
        random_state=0,
    )

def logistic_regression():
    """Return a new, unfitted logistic-regression classifier.

    Configuration: 'lbfgs' solver, one-vs-rest multi-class mode, seed 0.
    """
    return LogisticRegression(
        solver='lbfgs',
        multi_class='ovr',
        random_state=0,
    )

def knn(n_neighbors):
    """Return a new, unfitted k-nearest-neighbours classifier.

    Args:
        n_neighbors: value forwarded to KNeighborsClassifier's
            ``n_neighbors`` argument.

    All neighbours are weighted equally (``weights='uniform'``).
    """
    return KNeighborsClassifier(weights='uniform', n_neighbors=n_neighbors)

def knn_5():
    """Zero-argument factory: a new classifier from ``knn`` with 5 neighbours."""
    return knn(n_neighbors=5)

def knn_10():
    """Zero-argument factory: a new classifier from ``knn`` with 10 neighbours."""
    return knn(n_neighbors=10)

def knn_20():
    """Zero-argument factory: a new classifier from ``knn`` with 20 neighbours."""
    return knn(n_neighbors=20)

def knn_30():
    """Zero-argument factory: a new classifier from ``knn`` with 30 neighbours."""
    return knn(n_neighbors=30)

def knn_40():
    """Zero-argument factory: a new classifier from ``knn`` with 40 neighbours."""
    return knn(n_neighbors=40)

def knn_50():
    """Zero-argument factory: a new classifier from ``knn`` with 50 neighbours."""
    return knn(n_neighbors=50)

def knn_60():
    """Zero-argument factory: a new classifier from ``knn`` with 60 neighbours."""
    return knn(n_neighbors=60)

def knn_70():
    """Zero-argument factory: a new classifier from ``knn`` with 70 neighbours."""
    return knn(n_neighbors=70)

def knn_100():
    """Zero-argument factory: a new classifier from ``knn`` with 100 neighbours."""
    return knn(n_neighbors=100)

def knn_1():
    """Zero-argument factory: a new classifier from ``knn`` with 1 neighbour."""
    return knn(n_neighbors=1)

def linear_svm():
    """Return a new, unfitted linear support-vector classifier.

    The seed is pinned to 0, like the other seeded factories in this notebook
    (neural_network, logistic_regression, decision_tree). LinearSVC's
    ``random_state`` controls the random numbers its dual solver uses, so
    leaving it unset can make repeated runs report slightly different scores.
    """
    return LinearSVC(random_state=0)

def svm():
    """Return a new, unfitted SVC built with the library's default settings."""
    classifier = SVC()
    return classifier

def decision_tree():
    """Return a new, unfitted decision tree limited to depth 5, seeded with 0."""
    return DecisionTreeClassifier(
        random_state=0,
        max_depth=5,
    )

def get_normalizer():
    """Return a new, unfitted MinMaxScaler used to scale the feature columns."""
    scaler = MinMaxScaler()
    return scaler

def calculate_scores(classifier, selected_features, train_df, test_df):
    """Fit ``classifier`` on the training frame and score it on the test frame.

    Args:
        classifier: an unfitted estimator exposing ``fit`` and ``predict``.
        selected_features: list of column names used as model inputs.
        train_df: training DataFrame; must contain the selected columns and
            a 'label' column.
        test_df: test DataFrame with the same columns.

    Returns:
        ``[accuracy, precision, recall]`` computed on the test frame.
    """
    # The scaler is fitted on the training columns only and then applied to
    # both splits, so no test-set statistics enter the scaling.
    scaler = get_normalizer()
    scaler.fit(train_df[selected_features])
    scaled_train = scaler.transform(train_df[selected_features])
    scaled_test = scaler.transform(test_df[selected_features])

    classifier.fit(scaled_train, train_df['label'])
    predicted_labels = classifier.predict(scaled_test)

    metric_functions = (accuracy_score, precision_score, recall_score)
    return [metric(test_df['label'], predicted_labels) for metric in metric_functions]

def evaluate_dataset(train_path, test_path, classifier):
    """Score one train/test CSV pair for every entry of ``features_combinations``.

    Args:
        train_path: path of the training CSV.
        test_path: path of the test CSV.
        classifier: zero-argument callable returning a fresh estimator; it is
            called once per feature combination.

    Returns:
        One row per feature combination, in ``features_combinations`` order:
        ``[feature_columns, accuracy, precision, recall]``.

    Side effect:
        Appends this dataset's list of accuracies (one per combination) to
        the module-level ``clf_accuracy_scores`` list, which must already
        exist when this function is called.
    """
    global clf_accuracy_scores

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    scores = []
    for feature_columns in features_combinations:
        metrics = calculate_scores(classifier(), feature_columns, train_df, test_df)
        scores.append([feature_columns, *metrics])

    # Column 1 of every row is the accuracy; record them only after all
    # combinations have been evaluated.
    clf_accuracy_scores.append([row[1] for row in scores])

    return scores

def evaluate_datasets(classifier_fun):
    """Evaluate one classifier factory on every pair listed in ``datasets``.

    Prints ``classifier_fun`` first, so the output shows which factory the
    numbers belong to.

    Args:
        classifier_fun: zero-argument callable returning a fresh estimator.

    Returns:
        The dictionary produced by ``generate_statistics`` for the collected
        per-dataset scores.
    """
    print(classifier_fun)
    per_dataset_scores = [
        evaluate_dataset(train_path, test_path, classifier_fun)
        for train_path, test_path in datasets
    ]
    return generate_statistics(per_dataset_scores)

def print_scores(scores):
    """Print score rows, highest accuracy first.

    Args:
        scores: iterable of ``[features, accuracy, precision, recall]`` rows.
    """
    ranked = list(scores)
    ranked.sort(key=lambda row: row[1], reverse=True)

    for row in ranked:
        features, accuracy, precision, recall = row
        print(features)
        print('A:', accuracy, 'P:', precision, 'R:', recall, '\n')
        
def print_statistics(statistics, score):
    """Print the summary rows of one metric, highest average first.

    Args:
        statistics: mapping from metric name to a list of rows laid out as
            ``[identifier, average, max, min, std]``.
        score: key of the metric to print, e.g. 'accuracy'.
    """
    print(score, '-'*10)
    ranked_rows = list(statistics[score])
    ranked_rows.sort(key=lambda entry: entry[1], reverse=True)
    for entry in ranked_rows:
        # Rows store max before min; the printed order is AVG, MIN, MAX, STD.
        label, average, maximum, minimum, spread = entry[0], entry[1], entry[2], entry[3], entry[4]
        print(f'{label}|AVG:{average:.3f}|MIN:{minimum:.3f}|MAX:{maximum:.3f}|STD:{spread:.3f}')

def _summarize_metric(all_scores, metric_index):
    """Summarise one metric across datasets for every feature combination.

    Args:
        all_scores: list indexed as ``[dataset][feature combination]``, each
            innermost row being ``[features, accuracy, precision, recall]``.
        metric_index: position of the metric inside a row
            (1: accuracy, 2: precision, 3: recall).

    Returns:
        One ``[identifier, average, max, min, std]`` row per entry of
        ``features_combinations``, in that order.
    """
    summary = []
    for combination_index, combination in enumerate(features_combinations):
        # Iterate over the datasets actually present in all_scores rather
        # than over the global `datasets` list, so the two cannot disagree.
        values = np.array([
            dataset_scores[combination_index][metric_index]
            for dataset_scores in all_scores
        ])
        summary.append([
            '-'.join(combination),
            np.mean(values),
            np.max(values),
            np.min(values),
            np.std(values),
        ])
    return summary


def generate_statistics(all_scores):
    """Aggregate per-dataset scores into per-feature-combination statistics.

    Args:
        all_scores: list indexed as ``[dataset][feature combination]``; each
            innermost row is ``[features, accuracy, precision, recall]``.
            Every dataset entry must hold one row per element of
            ``features_combinations``.

    Returns:
        Dict with keys 'accuracy', 'precision' and 'recall'. Each value is a
        list with one ``[identifier, average, max, min, std]`` row per
        feature combination, where ``identifier`` is the combination's
        column names joined with '-'. ``std`` is ``np.std`` with its
        default arguments.

    Raises:
        ValueError: from NumPy's max/min if ``all_scores`` is empty.
    """
    return {
        'accuracy': _summarize_metric(all_scores, 1),
        'precision': _summarize_metric(all_scores, 2),
        'recall': _summarize_metric(all_scores, 3),
    }

In [4]:
# Reset the module-level accumulator that evaluate_dataset appends to, so it
# only holds the accuracies of the classifier evaluated in this cell.
clf_accuracy_scores = []
statistics = evaluate_datasets(logistic_regression)

for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function logistic_regression at 0x7fbfabd39d90>
accuracy ----------
adamic_adar-similarity|AVG:0.973|MIN:0.964|MAX:0.981|STD:0.005
adamic_adar|AVG:0.971|MIN:0.964|MAX:0.978|STD:0.005
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.959|MIN:0.942|MAX:0.968|STD:0.007
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.955|MIN:0.938|MAX:0.965|STD:0.007
##########
precision ----------
adamic_adar-similarity|AVG:0.977|MIN:0.962|MAX:0.991|STD:0.011
adamic_adar|AVG:0.968|MIN:0.945|MAX:0.985|STD:0.014
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.961|MIN:0.909|MAX:0.984|STD:0.023
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.955|MIN:0.907|MAX:0.981|STD:0.022
##########
recall ----------
adamic_adar|AVG:0.974|MIN:0.964|MAX:0.986|STD:0.008
adamic_adar-similarity|AVG:0.969|MIN:0.955|MAX:0.981|STD:0.007
adamic_adar-common_neighbors-preferential_attachment-total_neighbor

In [5]:
# One row per evaluated dataset, one column per feature combination
# (columns follow the order of features_combinations).
tmp_df = pd.DataFrame(
    clf_accuracy_scores,
    columns=[f'logistic_comb_{position}' for position in range(1, 5)],
)

In [6]:
# Start the combined accuracy table from the logistic-regression frame.
# Plain assignment: both names refer to the same DataFrame object (no copy).
df_accuracy_scores = tmp_df

In [7]:
# Reset the module-level accumulator that evaluate_dataset appends to, so it
# only holds the accuracies of the classifier evaluated in this cell.
clf_accuracy_scores = []
statistics = evaluate_datasets(knn_50)

for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function knn_50 at 0x7fbfabd39b70>
accuracy ----------
adamic_adar-similarity|AVG:0.967|MIN:0.948|MAX:0.979|STD:0.010
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.959|MIN:0.925|MAX:0.976|STD:0.015
adamic_adar|AVG:0.957|MIN:0.941|MAX:0.969|STD:0.010
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.956|MIN:0.925|MAX:0.975|STD:0.014
##########
precision ----------
adamic_adar-similarity|AVG:0.944|MIN:0.908|MAX:0.969|STD:0.018
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.933|MIN:0.872|MAX:0.966|STD:0.026
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.928|MIN:0.872|MAX:0.965|STD:0.025
adamic_adar|AVG:0.927|MIN:0.895|MAX:0.947|STD:0.019
##########
recall ----------
adamic_adar-similarity|AVG:0.993|MIN:0.990|MAX:0.996|STD:0.002
adamic_adar|AVG:0.992|MIN:0.987|MAX:0.999|STD:0.004
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|

In [8]:
# One row per evaluated dataset, one column per feature combination.
tmp_df = pd.DataFrame(
    clf_accuracy_scores,
    columns=[f'knn_50_comb_{position}' for position in range(1, 5)],
)
# Drop columns left by an earlier run of this cell before appending, so
# re-executing it replaces the knn_50 columns instead of duplicating them.
df_accuracy_scores = pd.concat(
    [df_accuracy_scores.drop(columns=tmp_df.columns, errors='ignore'), tmp_df],
    axis=1,
)

In [9]:
# Reset the module-level accumulator that evaluate_dataset appends to, so it
# only holds the accuracies of the classifier evaluated in this cell.
clf_accuracy_scores = []
statistics = evaluate_datasets(linear_svm)

for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function linear_svm at 0x7fbfabd39400>
accuracy ----------
adamic_adar-similarity|AVG:0.972|MIN:0.957|MAX:0.981|STD:0.009
adamic_adar|AVG:0.968|MIN:0.953|MAX:0.980|STD:0.009
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.963|MIN:0.936|MAX:0.976|STD:0.012
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.959|MIN:0.931|MAX:0.971|STD:0.012
##########
precision ----------
adamic_adar-similarity|AVG:0.957|MIN:0.926|MAX:0.977|STD:0.017
adamic_adar|AVG:0.952|MIN:0.920|MAX:0.975|STD:0.018
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.946|MIN:0.888|MAX:0.975|STD:0.026
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.941|MIN:0.884|MAX:0.972|STD:0.026
##########
recall ----------
adamic_adar-similarity|AVG:0.989|MIN:0.985|MAX:0.994|STD:0.003
adamic_adar|AVG:0.987|MIN:0.981|MAX:0.994|STD:0.004
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similar

In [10]:
# One row per evaluated dataset, one column per feature combination.
tmp_df = pd.DataFrame(
    clf_accuracy_scores,
    columns=[f'linear_svm_comb_{position}' for position in range(1, 5)],
)
# Drop columns left by an earlier run of this cell before appending, so
# re-executing it replaces the linear_svm columns instead of duplicating them.
df_accuracy_scores = pd.concat(
    [df_accuracy_scores.drop(columns=tmp_df.columns, errors='ignore'), tmp_df],
    axis=1,
)

In [11]:
# Reset the module-level accumulator that evaluate_dataset appends to, so it
# only holds the accuracies of the classifier evaluated in this cell.
clf_accuracy_scores = []
statistics = evaluate_datasets(svm)

for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function svm at 0x7fbfabd39510>
accuracy ----------
adamic_adar-similarity|AVG:0.966|MIN:0.950|MAX:0.979|STD:0.010
adamic_adar|AVG:0.961|MIN:0.937|MAX:0.977|STD:0.012
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.953|MIN:0.871|MAX:0.976|STD:0.030
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.946|MIN:0.826|MAX:0.974|STD:0.043
##########
precision ----------
adamic_adar-similarity|AVG:0.941|MIN:0.911|MAX:0.969|STD:0.019
adamic_adar|AVG:0.935|MIN:0.890|MAX:0.968|STD:0.024
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.923|MIN:0.797|MAX:0.964|STD:0.047
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.915|MIN:0.743|MAX:0.965|STD:0.063
##########
recall ----------
adamic_adar-similarity|AVG:0.994|MIN:0.990|MAX:0.996|STD:0.002
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.993|MIN:0.989|MAX:0.996|STD:0.002
adamic_adar|AVG

In [12]:
# One row per evaluated dataset, one column per feature combination.
tmp_df = pd.DataFrame(
    clf_accuracy_scores,
    columns=[f'svm_comb_{position}' for position in range(1, 5)],
)
# Drop columns left by an earlier run of this cell before appending, so
# re-executing it replaces the svm columns instead of duplicating them.
df_accuracy_scores = pd.concat(
    [df_accuracy_scores.drop(columns=tmp_df.columns, errors='ignore'), tmp_df],
    axis=1,
)

In [13]:
# Reset the module-level accumulator that evaluate_dataset appends to, so it
# only holds the accuracies of the classifier evaluated in this cell.
clf_accuracy_scores = []
statistics = evaluate_datasets(decision_tree)

for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function decision_tree at 0x7fbfabd39488>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.933|MIN:0.837|MAX:0.980|STD:0.046
adamic_adar-similarity|AVG:0.931|MIN:0.836|MAX:0.972|STD:0.045
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.922|MIN:0.826|MAX:0.979|STD:0.057
adamic_adar|AVG:0.879|MIN:0.660|MAX:0.955|STD:0.094
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.891|MIN:0.755|MAX:0.969|STD:0.070
adamic_adar-similarity|AVG:0.887|MIN:0.754|MAX:0.955|STD:0.068
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.878|MIN:0.742|MAX:0.967|STD:0.084
adamic_adar|AVG:0.825|MIN:0.595|MAX:0.922|STD:0.108
##########
recall ----------
adamic_adar-similarity|AVG:0.995|MIN:0.991|MAX:0.999|STD:0.002
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.994|MIN:0.991|MAX:0.999|STD:0.003
adami

In [14]:
# One row per evaluated dataset, one column per feature combination.
tmp_df = pd.DataFrame(
    clf_accuracy_scores,
    columns=[f'dt_comb_{position}' for position in range(1, 5)],
)
# Drop columns left by an earlier run of this cell before appending, so
# re-executing it replaces the dt columns instead of duplicating them.
df_accuracy_scores = pd.concat(
    [df_accuracy_scores.drop(columns=tmp_df.columns, errors='ignore'), tmp_df],
    axis=1,
)

In [15]:
# Reset the module-level accumulator that evaluate_dataset appends to, so it
# only holds the accuracies of the classifier evaluated in this cell.
clf_accuracy_scores = []
statistics = evaluate_datasets(neural_network)

for metric_name in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric_name)
    print('#'*10)

<function neural_network at 0x7fbfabd39f28>




accuracy ----------
adamic_adar-similarity|AVG:0.965|MIN:0.943|MAX:0.979|STD:0.012
adamic_adar|AVG:0.956|MIN:0.899|MAX:0.976|STD:0.024
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.938|MIN:0.807|MAX:0.979|STD:0.054
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.928|MIN:0.801|MAX:0.982|STD:0.061
##########
precision ----------
adamic_adar-similarity|AVG:0.941|MIN:0.898|MAX:0.968|STD:0.022
adamic_adar|AVG:0.928|MIN:0.834|MAX:0.968|STD:0.041
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.902|MIN:0.721|MAX:0.971|STD:0.078
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.888|MIN:0.715|MAX:0.975|STD:0.087
##########
recall ----------
adamic_adar-similarity|AVG:0.994|MIN:0.992|MAX:0.999|STD:0.002
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.993|MIN:0.988|MAX:0.999|STD:0.003
adamic_adar-common_neighbors-preferential_attachment-total_

In [16]:
# One row per evaluated dataset, one column per feature combination.
tmp_df = pd.DataFrame(
    clf_accuracy_scores,
    columns=[f'nn_comb_{position}' for position in range(1, 5)],
)
# Drop columns left by an earlier run of this cell before appending, so
# re-executing it replaces the nn columns instead of duplicating them.
df_accuracy_scores = pd.concat(
    [df_accuracy_scores.drop(columns=tmp_df.columns, errors='ignore'), tmp_df],
    axis=1,
)