In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Root folder that holds one sub-folder per dataset.
path = 'CORD19_GraphOfDocs/datasets'

# Numeric suffixes of the train/test file names for dataset1 ... dataset9
# (presumably the number of rows in each file -- TODO confirm).
_split_suffixes = [
    (687, 849),
    (878, 1573),
    (1761, 2662),
    (3425, 7869),
    (5165, 13107),
    (5415, 16434),
    (6347, 26144),
    (8745, 34965),
    (13276, 49747),
]

# One [train_csv, test_csv] pair per dataset, in dataset order.
datasets = [
    [
        f'{path}/dataset{number}/train_{train_suffix}.csv',
        f'{path}/dataset{number}/test_{test_suffix}.csv',
    ]
    for number, (train_suffix, test_suffix) in enumerate(_split_suffixes, start=1)
]

# The four features shared by the first two combinations below.
_shared_features = ['adamic_adar', 'common_neighbors', 'preferential_attachment', 'total_neighbors']

# Every entry is one set of column names that the classifiers are evaluated on.
features_combinations = [
    list(_shared_features),
    _shared_features + ['similarity'],
    ['adamic_adar', 'similarity'],
    ['adamic_adar'],
]

In [3]:
def logistic_regression():
    """Build a fresh, unfitted logistic-regression classifier.

    The `multi_class` argument is intentionally not passed: it is deprecated
    in recent scikit-learn releases, and the evaluation in this notebook is
    binary-only (precision/recall are computed with their default binary
    averaging), where the default behaves the same as the former 'ovr'.

    Returns:
        LogisticRegression: seeded with `random_state=0`, using the 'lbfgs' solver.
    """
    return LogisticRegression(random_state=0, solver='lbfgs')

def knn():
    """Build a fresh, unfitted k-nearest-neighbours classifier.

    Returns:
        KNeighborsClassifier: votes among 5 neighbours with uniform weights.
    """
    classifier = KNeighborsClassifier(weights='uniform', n_neighbors=5)
    return classifier

def linear_svm():
    """Build a fresh, unfitted linear support-vector classifier.

    The estimator is seeded like the other seeded factories in this notebook
    so that repeated runs produce the same fitted model.

    Returns:
        LinearSVC: default settings apart from `random_state=0`.
    """
    return LinearSVC(random_state=0)

def decision_tree():
    """Build a fresh, unfitted decision-tree classifier.

    Returns:
        DecisionTreeClassifier: limited to depth 5 and seeded with `random_state=0`.
    """
    classifier = DecisionTreeClassifier(random_state=0, max_depth=5)
    return classifier

def get_normalizer():
    """Create the feature scaler used before fitting any classifier.

    Returns:
        MinMaxScaler: a new, unfitted scaler with default settings.
    """
    scaler = MinMaxScaler()
    return scaler

def calculate_scores(classifier, selected_features, train_df, test_df):
    """Fit `classifier` on the training frame and score it on the test frame.

    Args:
        classifier: unfitted estimator exposing `fit(X, y)` and `predict(X)`.
        selected_features: column names selected from both frames as inputs.
        train_df: training frame containing the selected columns and 'label'.
        test_df: test frame containing the selected columns and 'label'.

    Returns:
        list: `[accuracy, precision, recall]` of the predictions on `test_df`.
    """
    scaler = get_normalizer()
    # The scaler learns its range from the training columns only; the same
    # fitted scaler is then applied to the test columns.
    scaled_train = scaler.fit_transform(train_df[selected_features])
    scaled_test = scaler.transform(test_df[selected_features])

    classifier.fit(scaled_train, train_df['label'])
    predicted = classifier.predict(scaled_test)
    expected = test_df['label']

    metrics = (accuracy_score, precision_score, recall_score)
    return [metric(expected, predicted) for metric in metrics]

def evaluate_dataset(train_path, test_path, classifier):
    """Score one train/test pair for every entry of `features_combinations`.

    Args:
        train_path: path of the training CSV file.
        test_path: path of the test CSV file.
        classifier: zero-argument factory; it is called once per feature
            combination so each evaluation starts from a new estimator.

    Returns:
        list: one `[feature_columns, accuracy, precision, recall]` row per
        feature combination, in the order of `features_combinations`.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    return [
        [feature_columns, *calculate_scores(classifier(), feature_columns, train_df, test_df)]
        for feature_columns in features_combinations
    ]

def evaluate_datasets(classifier_fun):
    """Evaluate a classifier factory on every entry of `datasets`.

    Prints the factory's name as a header (falling back to the object itself
    when it has no `__name__`), so the output does not contain a memory
    address that changes between runs.

    Args:
        classifier_fun: zero-argument callable returning an unfitted classifier.

    Returns:
        dict: the result of `generate_statistics` over the per-dataset scores.
    """
    print(getattr(classifier_fun, '__name__', classifier_fun))
    all_scores = [
        evaluate_dataset(dataset[0], dataset[1], classifier_fun)
        for dataset in datasets
    ]
    return generate_statistics(all_scores)

def print_scores(scores):
    """Print score rows ordered by accuracy, highest first.

    Args:
        scores: iterable of `[features, accuracy, precision, recall]` rows.
    """
    ranked = list(scores)
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    for entry in ranked:
        features, accuracy, precision, recall = entry
        print(features)
        print('A:', accuracy, 'P:', precision, 'R:', recall, '\n')
        
def print_statistics(statistics, score):
    """Print the summary rows of one metric, highest average first.

    Args:
        statistics: mapping from metric name to a list of
            `[identifier, average, maximum, minimum, std]` rows.
        score: key of the metric to print; also printed as the header.
    """
    print(score, '-'*10)
    ordered_rows = sorted(statistics[score], key=lambda entry: entry[1], reverse=True)
    for entry in ordered_rows:
        # Rows store the maximum before the minimum, but MIN is printed first.
        identifier = entry[0]
        average = entry[1]
        minimum = entry[3]
        maximum = entry[2]
        deviation = entry[4]
        print(f'{identifier}|AVG:{average:.3f}|MIN:{minimum:.3f}|MAX:{maximum:.3f}|STD:{deviation:.3f}')

def _summarize_metric(all_scores, metric_index):
    """Summarize one metric column across datasets for each feature combination.

    Returns one `[identifier, mean, max, min, std]` row per feature combination,
    or an empty list when `all_scores` is empty.
    """
    if not all_scores:
        return []

    rows = []
    # The first dataset's rows supply the feature lists used as identifiers.
    for combination_index, reference_row in enumerate(all_scores[0]):
        values = np.array([
            dataset_scores[combination_index][metric_index]
            for dataset_scores in all_scores
        ])
        rows.append([
            '-'.join(reference_row[0]),
            np.mean(values),
            np.max(values),
            np.min(values),
            np.std(values),
        ])
    return rows


def generate_statistics(all_scores):
    """Aggregate per-dataset scores into per-feature-combination statistics.

    The number of datasets, the number of feature combinations and the
    identifiers are all taken from `all_scores` itself, so the result does not
    depend on module-level configuration staying in sync with the input.

    Args:
        all_scores: nested list indexed as
            `[dataset][feature combination][field]`, where field 0 is the
            list of feature names, 1 is accuracy, 2 is precision and 3 is
            recall. Every dataset is expected to list the feature
            combinations in the same order.

    Returns:
        dict: keys 'accuracy', 'precision' and 'recall'; each value is a list
        of `[identifier, average, maximum, minimum, std]` rows, where
        `identifier` is the feature names joined with '-'. The lists are empty
        when `all_scores` is empty.
    """
    return {
        'accuracy': _summarize_metric(all_scores, 1),
        'precision': _summarize_metric(all_scores, 2),
        'recall': _summarize_metric(all_scores, 3),
    }

In [4]:
# Evaluate logistic regression on all datasets and report each metric in turn.
statistics = evaluate_datasets(logistic_regression)

for metric in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric)
    print('#'*10)

<function logistic_regression at 0x7f9b19369ea0>
accuracy ----------
adamic_adar-similarity|AVG:0.973|MIN:0.967|MAX:0.982|STD:0.005
adamic_adar|AVG:0.971|MIN:0.963|MAX:0.979|STD:0.005
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.959|MIN:0.942|MAX:0.968|STD:0.007
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.955|MIN:0.938|MAX:0.964|STD:0.008
##########
precision ----------
adamic_adar-similarity|AVG:0.975|MIN:0.950|MAX:0.991|STD:0.013
adamic_adar|AVG:0.968|MIN:0.945|MAX:0.986|STD:0.013
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.961|MIN:0.910|MAX:0.984|STD:0.023
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.955|MIN:0.907|MAX:0.980|STD:0.022
##########
recall ----------
adamic_adar|AVG:0.975|MIN:0.966|MAX:0.986|STD:0.007
adamic_adar-similarity|AVG:0.972|MIN:0.957|MAX:0.986|STD:0.008
adamic_adar-common_neighbors-preferential_attachment-total_neighbor

In [5]:
# Evaluate k-nearest neighbours on all datasets and report each metric in turn.
statistics = evaluate_datasets(knn)

for metric in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric)
    print('#'*10)

<function knn at 0x7f9add918510>
accuracy ----------
adamic_adar-similarity|AVG:0.954|MIN:0.914|MAX:0.973|STD:0.017
adamic_adar|AVG:0.938|MIN:0.908|MAX:0.960|STD:0.018
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.938|MIN:0.847|MAX:0.974|STD:0.036
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.936|MIN:0.847|MAX:0.973|STD:0.035
##########
precision ----------
adamic_adar-similarity|AVG:0.921|MIN:0.859|MAX:0.956|STD:0.029
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.899|MIN:0.768|MAX:0.960|STD:0.054
adamic_adar|AVG:0.897|MIN:0.850|MAX:0.935|STD:0.029
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.896|MIN:0.768|MAX:0.959|STD:0.053
##########
recall ----------
adamic_adar-similarity|AVG:0.995|MIN:0.992|MAX:0.999|STD:0.002
adamic_adar|AVG:0.993|MIN:0.989|MAX:0.999|STD:0.003
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG

In [6]:
# Evaluate the linear SVM on all datasets and report each metric in turn.
statistics = evaluate_datasets(linear_svm)

for metric in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric)
    print('#'*10)

<function linear_svm at 0x7f9add918598>
accuracy ----------
adamic_adar-similarity|AVG:0.972|MIN:0.956|MAX:0.981|STD:0.009
adamic_adar|AVG:0.969|MIN:0.954|MAX:0.980|STD:0.009
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.963|MIN:0.936|MAX:0.976|STD:0.012
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.959|MIN:0.933|MAX:0.971|STD:0.012
##########
precision ----------
adamic_adar-similarity|AVG:0.957|MIN:0.925|MAX:0.977|STD:0.017
adamic_adar|AVG:0.953|MIN:0.920|MAX:0.976|STD:0.018
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.946|MIN:0.889|MAX:0.976|STD:0.026
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.942|MIN:0.886|MAX:0.972|STD:0.026
##########
recall ----------
adamic_adar-similarity|AVG:0.989|MIN:0.986|MAX:0.994|STD:0.002
adamic_adar|AVG:0.987|MIN:0.982|MAX:0.995|STD:0.004
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similar

In [7]:
# Evaluate the decision tree on all datasets and report each metric in turn.
statistics = evaluate_datasets(decision_tree)

for metric in ('accuracy', 'precision', 'recall'):
    print_statistics(statistics, metric)
    print('#'*10)

<function decision_tree at 0x7f9add918730>
accuracy ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.930|MIN:0.839|MAX:0.980|STD:0.047
adamic_adar-similarity|AVG:0.928|MIN:0.837|MAX:0.972|STD:0.046
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.922|MIN:0.828|MAX:0.979|STD:0.056
adamic_adar|AVG:0.880|MIN:0.663|MAX:0.958|STD:0.093
##########
precision ----------
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.887|MIN:0.759|MAX:0.969|STD:0.071
adamic_adar-similarity|AVG:0.885|MIN:0.758|MAX:0.956|STD:0.068
adamic_adar-common_neighbors-preferential_attachment-total_neighbors|AVG:0.878|MIN:0.746|MAX:0.968|STD:0.081
adamic_adar|AVG:0.827|MIN:0.600|MAX:0.929|STD:0.107
##########
recall ----------
adamic_adar-similarity|AVG:0.995|MIN:0.991|MAX:0.999|STD:0.002
adamic_adar-common_neighbors-preferential_attachment-total_neighbors-similarity|AVG:0.995|MIN:0.991|MAX:0.999|STD:0.003
adami