# Evaluate predictions
This notebook serves as a general evaluation of the predictions made on the test dataset.

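The prediction file should have the following columns:
- contributor
- type (Human, Bot; rows typed Unknown or Invalid are removed before evaluation)
- confidence (discarded before evaluation)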

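We will compare the predictions made with the real labels and compute the following metrics:
- recall (weighted, plus per-class bot_recall and human_recall)
- precision (weighted, plus per-class bot_precision and human_precision)
- weighted f1-score
- accuracy
- roc-auc
- num_contrib (number of contributors)
- tn_fp_fn_tp (confusion-matrix counts: true negatives, false positives, false negatives, true positives)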

In [23]:
import pandas as pd

from sklearn.metrics import confusion_matrix, make_scorer, precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import precision_recall_curve, auc

## Parameters
The metrics will be appended to the given evaluation file. The "NAME" variable identifies the evaluation in the file; if a row with the same name already exists, it is replaced.

In [24]:
# Directory layout: every input and output lives under the resources folder.
RESOURCE_FOLDER = '../resources'
DATA_FOLDER = RESOURCE_FOLDER + '/data'    # ground-truth labels
EVAL_FOLDER = RESOURCE_FOLDER + '/evals'   # prediction files and the evaluation summary

# File names, resolved against the folders above.
LABELS_FILE = 'contributors.csv'      # read from DATA_FOLDER
PRED_FILE = 'ghmap_predictions.csv'   # read from EVAL_FOLDER
EVAL_FILE = 'evaluation.csv'          # read from and written to EVAL_FOLDER

# Index label of this run's row in the evaluation file.
NAME = 'ghmap'

# 1 - Load data

## 1.1 - Load predictions

In [25]:
# Read the predictions to evaluate from the evals folder.
df_prediction = pd.read_csv(f"{EVAL_FOLDER}/{PRED_FILE}")
# Only `contributor` and `type` are used below. `confidence` is discarded when
# present; errors='ignore' keeps prediction files without that column loadable
# instead of failing with a KeyError.
df_prediction = df_prediction.drop(columns=['confidence'], errors='ignore')
df_prediction.head()

Unnamed: 0,contributor,type
0,otc-zuul[bot],Bot
1,ljharb,Human
2,juancarlospaco,Human
3,tldr-bot,Bot
4,jakirkham,Human


Remove the unknown (< 5 events) and invalid (not found) contributors.

In [26]:
# Remove the 'Unknown' and 'Invalid' types: they have no Bot/Human encoding.
# .copy() detaches the result from the frame it was filtered from, so assigning
# to its columns afterwards does not trigger pandas' SettingWithCopyWarning.
excluded_types = ['Unknown', 'Invalid']
df_prediction = df_prediction[~df_prediction['type'].isin(excluded_types)].copy()

Convert Bot and Human to 1 and 0 respectively.

In [27]:
contributor_map = {
    'Bot': 1,
    'Human': 0
}

# Fail with an explicit message when a label is not covered by the mapping.
# Otherwise .map() silently produces NaN and the integer cast below raises an
# error that does not say which label was unexpected.
known_type = df_prediction['type'].isin(list(contributor_map))
if not known_type.all():
    unexpected = sorted(df_prediction.loc[~known_type, 'type'].astype(str).unique())
    raise ValueError(
        f"Cannot encode prediction types {unexpected}; "
        f"expected one of {list(contributor_map)}"
    )

# Encode the labels as integers (Bot -> 1, Human -> 0). assign() builds a new
# frame instead of writing into a frame that may be a slice of another one.
df_prediction = df_prediction.assign(
    type=df_prediction['type'].map(contributor_map).astype(int)
)
df_prediction.head()

Unnamed: 0,contributor,type
0,otc-zuul[bot],1
1,ljharb,0
2,juancarlospaco,0
3,tldr-bot,1
4,jakirkham,0


## 1.2 - Load labels

In [28]:
# Ground-truth labels; the `bot` column is exposed as `true_type`.
df_labels = pd.read_csv(f"{DATA_FOLDER}/{LABELS_FILE}").rename(columns={'bot': 'true_type'})

# Inner join on contributor: keeps only contributors present in both frames.
df_compare = pd.merge(df_prediction, df_labels, on='contributor', how='inner')

df_compare.head()

Unnamed: 0,contributor,type,true_type
0,otc-zuul[bot],1,1
1,ljharb,0,0
2,juancarlospaco,0,0
3,tldr-bot,1,1
4,jakirkham,0,0


# 2 - Evaluate predictions

## 2.1 - Define the metrics

In [29]:
# Taken from RABBIT

def botrecall(y_true, y_pred):
    """Return the recall of the bot class (label 1).

    labels=[0, 1] pins the order of the per-class results to [human, bot], so
    index 1 is the bot class even when one of the two classes is absent from
    both y_true and y_pred (without it, the lookup raises IndexError in that
    case). A class with no true samples gets 0.0 through zero_division.
    """
    _, recall, _, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], zero_division=0.0
    )
    return recall[1]
bot_recall = make_scorer(botrecall, greater_is_better=True)

def humanrecall(y_true, y_pred):
    """Return the recall of the human class (label 0).

    labels=[0, 1] pins the order of the per-class results to [human, bot], so
    index 0 is the human class even when no human appears in y_true or y_pred
    (without it, index 0 would silently hold the bot class in that case).
    A class with no true samples gets 0.0 through zero_division.
    """
    _, recall, _, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], zero_division=0.0
    )
    return recall[0]
human_recall = make_scorer(humanrecall, greater_is_better=True)

def botprecision(y_true, y_pred):
    """Return the precision of the bot class (label 1).

    labels=[0, 1] pins the order of the per-class results to [human, bot], so
    index 1 is the bot class even when one of the two classes is absent from
    both y_true and y_pred (without it, the lookup raises IndexError in that
    case). A class that is never predicted gets 0.0 through zero_division.
    """
    precision, _, _, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], zero_division=0.0
    )
    return precision[1]
bot_precision = make_scorer(botprecision, greater_is_better=True)

def humanprecision(y_true, y_pred):
    """Return the precision of the human class (label 0).

    labels=[0, 1] pins the order of the per-class results to [human, bot], so
    index 0 is the human class even when no human appears in y_true or y_pred
    (without it, index 0 would silently hold the bot class in that case).
    A class that is never predicted gets 0.0 through zero_division.
    """
    precision, _, _, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], zero_division=0.0
    )
    return precision[0]
human_precision = make_scorer(humanprecision, greater_is_better=True)

def wpscore(y_true, y_pred):
    """Return the precision averaged over classes, weighted by class support."""
    weighted_precision, _, _, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0.0
    )
    return weighted_precision
wprecision_score = make_scorer(wpscore, greater_is_better=True)

def wrscore(y_true, y_pred):
    """Return the recall averaged over classes, weighted by class support."""
    _, weighted_recall, _, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0.0
    )
    return weighted_recall
wrecall_score = make_scorer(wrscore, greater_is_better=True)

def aucpr(y_true, y_pred):
    """Return the area under the precision-recall curve of y_pred against y_true."""
    # The thresholds returned by precision_recall_curve are not needed here.
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

# The metric function and its scorer get distinct names, like the sibling pairs
# (botrecall -> bot_recall), so the scorer no longer overwrites the function.
# NOTE(review): make_scorer is used with its defaults here; confirm whether the
# wrapped estimator should feed scores/probabilities rather than hard labels.
auc_pr = make_scorer(aucpr, greater_is_better=True)

## 2.2 - Evaluate predictions

In [30]:
y_pred = df_compare['type']
y_test = df_compare['true_type']

# Every value is a one-element list so that the frame built below has one row.
perf = {
    'precision': [precision_score(y_test, y_pred, average='weighted', zero_division=0.0)],
    'bot_precision': [botprecision(y_test, y_pred)],
    'human_precision': [humanprecision(y_test, y_pred)],
    'recall': [recall_score(y_test, y_pred, average='weighted', zero_division=0.0)],
    'bot_recall': [botrecall(y_test, y_pred)],
    'human_recall': [humanrecall(y_test, y_pred)],
    'weighted_f1': [f1_score(y_test, y_pred, average='weighted', zero_division=0.0)],
    'accuracy': [accuracy_score(y_test, y_pred)],
    # labels=[0, 1] forces a 2x2 matrix, so ravel() always yields the four
    # counts tn, fp, fn, tp even when one class is missing from the data.
    'tn_fp_fn_tp': [confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()],
    # Count the contributors that were actually scored, i.e. those left after
    # merging with the labels, so this matches the data behind the other metrics.
    'num_contrib': [df_compare['contributor'].nunique()],
    'roc_auc': [roc_auc_score(y_test, y_pred)]
}

# Single-row table indexed by the evaluation name.
performance = pd.DataFrame.from_dict(perf)
performance.index = [NAME]

In [31]:
# Show the single-row metrics table computed for this evaluation.
display(performance)

Unnamed: 0,precision,bot_precision,human_precision,recall,bot_recall,human_recall,weighted_f1,accuracy,tn_fp_fn_tp,num_contrib,roc_auc
ghmap,0.894093,0.897638,0.890585,0.894057,0.888312,0.899743,0.894051,0.894057,"[350, 39, 43, 342]",774,0.894027


## 2.3 - Save the evaluation

In [32]:
# Load the evaluations recorded so far, indexed by evaluation name.
try:
    df_eval = pd.read_csv(f"{EVAL_FOLDER}/{EVAL_FILE}", index_col=0)
except FileNotFoundError:
    # First run: no evaluation file exists yet, so start from an empty table
    # instead of aborting the notebook.
    print(f"{EVAL_FOLDER}/{EVAL_FILE} not found; starting from an empty evaluation table.")
    df_eval = pd.DataFrame()

In [33]:
# Discard any earlier row stored under this evaluation name (a no-op when there
# is none), append the fresh metrics, then write the table back to disk.
previous_rows = df_eval.drop(index=NAME, errors='ignore')
df_eval = pd.concat([previous_rows, performance], axis=0)
df_eval.to_csv(f"{EVAL_FOLDER}/{EVAL_FILE}", index=True)
