In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.ensemble import VotingClassifier
from ipywidgets import interact, interactive
from preprocessing import VandalismEncoder, RevisionGrouper, FeatureSelector
from plotting import plot_confusion_matrix, plot_scores, plot_precision_recall, plot_roc, plot_information_gain, plot_multilabel_precision_recall, plot_multilabel_roc
from persistence import load_all_classifiers
from baseline import BASELINE_FEATURE_COLUMNS

  from numpy.core.umath_tests import inner1d


In [2]:
# Load classifiers
# The returned object is consumed below through `.items()` as a mapping of
# tag id -> model, where each model exposes its estimator under the 'clf' key.
classifiers = load_all_classifiers('models-all-0c25f9f')

Loading model tag_1
Loading model tag_11
Loading model tag_12
Loading model tag_2
Loading model tag_3
Loading model tag_5
Loading model tag_6
Loading model tag_9


---
## Import Dataset

In [3]:
# Feature set used as training data further down (indexed by revision id).
df = pd.read_csv('./data/features-0c25f9f.csv', index_col='revision_id')
# Second feature export ("changedtables"), used below as test / threshold data.
unbiased_df = pd.read_csv('./data/features-0c25f9f-changedtables.csv', index_col='revision_id')
df.head()

Unnamed: 0_level_0,isContributorAnonymous,isBot,timeOfDay,localizedTimeOfDay,dayOfWeek,localizedDayOfWeek,isMinorEdit,hasPreviousSameContributor,timeSinceLastArticleEdit,timeSinceLastArticleEditBySameContributor,...,userCommentLength,hasNumericOutlierInColumns,hasNumericOutlierInRows,hasNumericOutlierInChangedCellValues,tableDataTypeInformationGain,templateUseFlags,templateUseFifaFlags,templateUsePageLink,templateUseYesNo,tag_id
revision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
228189784,0.0,0.0,13.0,-1.0,7.0,-1.0,0.0,0.0,-1.0,-1.0,...,34.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,12
730449352,0.0,0.0,2.0,-1.0,2.0,-1.0,0.0,1.0,25.0,25.0,...,0.0,1.0,0.0,0.0,0.134129,0.0,0.0,0.0,0.0,12
216224416,0.0,0.0,17.0,-1.0,6.0,-1.0,1.0,0.0,7837.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
539338621,0.0,1.0,1.0,-1.0,4.0,-1.0,1.0,0.0,2108.0,-1.0,...,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,12
746191686,1.0,0.0,20.0,23.0,2.0,2.0,0.0,0.0,4077.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12


In [4]:
# Reviewed prediction results; the "Results" section reads its
# 'proba_vandalism' and 'martins_tag (0 = NV)' columns.
result_df = pd.read_csv('./data/test_evaluation_revisions_0.81_martin_reviewed.csv')

In [5]:
# Preview the first rows of the second feature set.
unbiased_df.head()

Unnamed: 0_level_0,isContributorAnonymous,isBot,timeOfDay,localizedTimeOfDay,dayOfWeek,localizedDayOfWeek,isMinorEdit,hasPreviousSameContributor,timeSinceLastArticleEdit,timeSinceLastArticleEditBySameContributor,...,userCommentLength,hasNumericOutlierInColumns,hasNumericOutlierInRows,hasNumericOutlierInChangedCellValues,tableDataTypeInformationGain,templateUseFlags,templateUseFifaFlags,templateUsePageLink,templateUseYesNo,tag_id
revision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
621617689,0.0,1.0,12.0,-1.0,7.0,-1.0,1.0,0.0,54787.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
261968370,0.0,0.0,22.0,-1.0,7.0,-1.0,0.0,1.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04918,0.0,
789282625,0.0,0.0,13.0,-1.0,4.0,-1.0,1.0,0.0,205.0,-1.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
193783324,0.0,0.0,21.0,-1.0,7.0,-1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
258489681,0.0,0.0,2.0,-1.0,3.0,-1.0,1.0,0.0,6.0,12662.0,...,0.0,0.0,0.0,1.0,0.002788,0.0,0.0,0.0,0.0,


In [6]:
# Reduce size of test set:
# NOTE: statement order matters -- `threshold_df` has to be sliced from the
# full frame BEFORE `unbiased_df` is overwritten with its first 3000 rows.
# Re-running this cell without reloading the CSV slices the already
# truncated frame instead.
# NOTE(review): the two slices are only disjoint if the loaded frame has at
# least 11000 rows -- confirm.
threshold_df = unbiased_df.tail(8000).head(3000)
unbiased_df = unbiased_df.head(3000)

---
## Preprocessing

In [7]:
def preprocessing(frame, features, tags):
    """Run the project's preprocessing pipeline on ``frame``.

    Parameters
    ----------
    frame
        Feature frame handed to the pipeline's ``transform``.
    features
        Feature configuration. ``'baseline'`` restricts the feature selector
        to ``BASELINE_FEATURE_COLUMNS``; any other value passes ``None`` to
        ``FeatureSelector``.
    tags
        ``'all vandalism'`` selects the tag ids 1, 2, 3, 5, 6, 9 and 11; any
        other value is used as the single selected tag.

    Returns
    -------
    The transformed frame with duplicate rows dropped and missing values
    replaced by 0.
    """
    feature_columns = BASELINE_FEATURE_COLUMNS if features == 'baseline' else None
    selected_tags = [1, 2, 3, 5, 6, 9, 11] if tags == 'all vandalism' else [tags]

    steps = [
        ('vandalism_encoder', VandalismEncoder(selected_tags)),
        ('revision_grouper', RevisionGrouper()),
        ('feature_selector', FeatureSelector(feature_columns)),
    ]
    preprocess_pipeline = Pipeline(steps)

    print('Feature config:', features, 'Selected tags:', selected_tags)

    transformed = preprocess_pipeline.transform(frame)
    deduplicated = transformed.drop_duplicates()
    return deduplicated.fillna(0)

In [8]:
# Apply the same preprocessing configuration (all features, all vandalism
# tags) to the training, test and threshold frames, in that order.
processed_df, processed_unbiased_df, processed_threshold_df = [
    preprocessing(source_frame, 'all', 'all vandalism')
    for source_frame in (df, unbiased_df, threshold_df)
]

Feature config: all Selected tags: [1, 2, 3, 5, 6, 9, 11]
Feature config: all Selected tags: [1, 2, 3, 5, 6, 9, 11]
Feature config: all Selected tags: [1, 2, 3, 5, 6, 9, 11]


In [9]:
# X, y split: 'is_vandalism' is the target, every other column is a feature.
X_train, y_train = (
    processed_df.drop('is_vandalism', axis=1),
    processed_df['is_vandalism'],
)

X_test, y_test = (
    processed_unbiased_df.drop('is_vandalism', axis=1),
    processed_unbiased_df['is_vandalism'],
)

X_test_threshold, y_test_threshold = (
    processed_threshold_df.drop('is_vandalism', axis=1),
    processed_threshold_df['is_vandalism'],
)

In [10]:
# Report the shapes of the train / test splits, one line per split.
for _label, _part in (('X train', X_train),
                      ('y train', y_train),
                      ('X test', X_test),
                      ('y test', y_test)):
    print(_label, _part.shape)

X train (4960, 121)
y train (4960,)
X test (3000, 121)
y test (3000,)


In [11]:
# (name, estimator) pairs, named after the stringified tag id.
estimators = [(str(key), entry['clf']) for key, entry in classifiers.items()]

# Every estimator except tag '12' goes into a soft-voting ensemble ...
vandalism_estimators = [pair for pair in estimators if pair[0] != '12']
vandalism_clf = VotingClassifier(vandalism_estimators, voting='soft')

# ... while the tag '12' estimator is kept on its own.
no_vandalism_clf = [pair for pair in estimators if pair[0] == '12'][0][1]

---
## Evaluate classifier and choose threshold

In [None]:
# Evaluate classifier
# Soft-voting combination of the tag-12 estimator and the vandalism ensemble.
clf = VotingClassifier([('0', no_vandalism_clf), ('1', vandalism_clf)], voting='soft')

# 10-fold cross-validated predictions on the training data: once as class
# labels and once as class probabilities.
# NOTE(review): this runs the full cross-validation twice; if the underlying
# estimators are not seeded, labels and probabilities may come from
# differently fitted models -- confirm.
y_train_predict = cross_val_predict(clf, X_train, y_train, cv=10, n_jobs=-1, method='predict')
y_train_predict_proba = cross_val_predict(clf, X_train, y_train, cv=10, n_jobs=-1, method='predict_proba')

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [None]:
# `y_train_predict` holds the cross-validated predictions for `X_train`, so
# it has to be compared against the training labels (not `y_test`, which
# belongs to a different set of rows).
plot_confusion_matrix(y_train, y_train_predict)

In [None]:
# Precision-recall curve of the cross-validated scores; column 1 of the
# probability matrix is used as the positive-class score.
plot_precision_recall(y_train, y_train_predict_proba[:, 1])

In [None]:
# ROC curve of the cross-validated scores (column 1 = positive-class score).
# NOTE(review): no output path is passed here, so this depends on the
# `plot_roc` currently in scope accepting a two-argument call -- confirm.
plot_roc(y_train, y_train_predict_proba[:, 1])

In [None]:
# Choose Threshold
# Pick the first point on the precision-recall curve whose precision reaches
# `wanted_precision` and use the matching decision threshold.
wanted_precision = 0.85
precision, recall, thresholds = precision_recall_curve(y_train, y_train_predict_proba[:, 1])

# `precision` / `recall` contain one more entry than `thresholds`: the final
# point (precision 1, recall 0) has no threshold. It is excluded from the
# search so that the chosen index is always valid for `thresholds`.
indexed_precision = [(i, p) for i, p in enumerate(precision[:-1]) if p >= wanted_precision]
if not indexed_precision:
    raise ValueError('No threshold reaches the wanted precision of %0.2f' % wanted_precision)

threshold_index, reached_precision = indexed_precision[0]
# Kept under its original name for compatibility; despite the name this is
# the index into the precision/recall/threshold arrays, not a sample count.
no_vandalism_count = threshold_index
threshold = thresholds[threshold_index]

print('Precision:', reached_precision)
print('Recall:', recall[threshold_index])
print('Threshold:', threshold)

---
## Predict vandalism

In [None]:
# Train classifier on the entire training set (X_train / y_train)
clf.fit(X_train, y_train)

In [None]:
# Manually fixed decision threshold used for the selection below.
# NOTE: this replaces any value previously stored in `threshold`.
threshold = 0.65

In [None]:
# Predict probabilities for threshold dataset
# (columns 0 and 1 are read below as "no vandalism" / "vandalism").
y_test_predict_proba = clf.predict_proba(X_test_threshold)

In [None]:
# The probabilities were predicted for `X_test_threshold`, so they must be
# attached to the labels / revision ids of that same set. Building the frame
# from `y_test` would pair each probability with an unrelated revision.
y_result = y_test_threshold.to_frame()
y_result['proba_no_vandalism'] = y_test_predict_proba[:, 0]
y_result['proba_vandalism'] = y_test_predict_proba[:, 1]

In [None]:
# Keep only the revisions whose vandalism probability exceeds the threshold
# and drop the label column from the selection.
_is_flagged = y_result['proba_vandalism'] > threshold
y_vandalism = y_result.loc[_is_flagged].drop('is_vandalism', axis=1)

vandalism_count = len(y_vandalism)
all_count = len(y_result)

print('Num. predicted vandalism', vandalism_count)
print('Samples No Vandalism:', all_count - vandalism_count, '/', all_count)
print('Samples Vandalism:', vandalism_count, '/', all_count)
print('Vandalism rate:', (vandalism_count / all_count))

In [None]:
# Display the revisions predicted as vandalism (full frame, not a preview).
y_vandalism

In [None]:
# Save predicted vandalism revisions to disk
# y_vandalism.to_csv('./data/test_evaluation_revisions_%0.2f.csv' % wanted_precision)

---
## Results

In [None]:
# Display the reviewed evaluation results (full frame, not a preview).
result_df

In [None]:
# Plotting
# Ranks the reviewed revisions by predicted vandalism probability and draws
# (1) the precision-recall curve of that ranking and (2) the cumulative number
# of vandalized revisions found per rank.
import seaborn as sns

# Axis limit and diagonal length of the ranking plot.
# NOTE(review): presumably the number of reviewed revisions in `result_df` -- confirm.
RANKING_PLOT_LIMIT = 197

ranked = result_df.sort_values(['proba_vandalism'], ascending=[False])

# Normalise the reviewer tag: '(1)' -> 1, '(0)' -> 0, everything else is cast with int().
ranked['tag'] = ranked['martins_tag (0 = NV)'].map(
    lambda x: int({'(1)': 1, '(0)': 0}.get(x, x)))

relevant_docs = int((ranked['tag'] == 1).sum())
if relevant_docs == 0:
    raise ValueError('No revision is tagged as vandalism (tag == 1); cannot compute precision/recall')

found_docs = 0
precisions = []
recalls = []
average_precisions = []
heat = []
indices = []

for i, tag_value in enumerate(ranked['tag']):
    indices.append(i)
    # Number of relevant revisions found before the current rank.
    heat.append(found_docs)

    if tag_value == 1:
        found_docs = found_docs + 1
        # Precision at each rank that holds a relevant revision.
        average_precisions.append(found_docs / (i + 1))
    precisions.append(found_docs / (i + 1))
    recalls.append(found_docs / relevant_docs)

# Fixed: the original referenced the undefined name `average_prevision`.
average_precision = sum(average_precisions) / len(average_precisions)


plt.figure(figsize=(6, 6))
# Keyword arguments: newer seaborn releases no longer accept x / y positionally.
ax = sns.lineplot(x=recalls, y=precisions, legend='full',
                  label='Average Precision = %0.2f' % average_precision)
ax.set(xlabel='Recall', ylabel='Precision')
ax.legend(loc=9, bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.savefig('figures/precision_recall_ranking.pdf', bbox_inches='tight')
plt.show()


ax = sns.lineplot(x=indices, y=heat)
ax.set(xlabel='Rank', ylabel='Number of vandalized revisions')
# Dashed gray diagonal as reference; the colour is given only once (the
# original combined the 'k--' format string with a conflicting `color=`).
plt.plot([0, RANKING_PLOT_LIMIT], [0, RANKING_PLOT_LIMIT], linestyle='--', color='gray', alpha=0.2)
plt.xlim([0, RANKING_PLOT_LIMIT])
plt.ylim([0, RANKING_PLOT_LIMIT])
plt.savefig('figures/ranking.pdf', bbox_inches='tight')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, average_precision_score, auc, roc_curve
from sklearn.feature_selection import mutual_info_classif
from scipy import interp

def plot_confusion_matrix(y_true, y_pred):
    """Draw the raw and the row-normalised confusion matrix side by side.

    ``y_true`` must support ``value_counts()`` and contain both ``True`` and
    ``False`` labels; ``y_pred`` holds the predicted labels. After the plots a
    classification report is printed.
    """
    class_counts = y_true.value_counts()
    vandalism_count = class_counts[True]
    no_vandalism_count = class_counts[False]

    cfn_matrix = confusion_matrix(y_true, y_pred)
    # Each row is scaled by the number of true samples of its class.
    row_scale = np.array([
        [1.0 / no_vandalism_count] * 2,
        [1.0 / vandalism_count] * 2,
    ])
    norm_cfn_matrix = cfn_matrix * row_scale

    fig = plt.figure(figsize=(15, 5))
    panels = (
        (1, cfn_matrix, 'Confusion Matrix', {'fmt': "d"}),
        (2, norm_cfn_matrix, 'Normalized Confusion Matrix', {}),
    )
    for position, matrix, title, extra_options in panels:
        ax = fig.add_subplot(1, 2, position)
        sns.heatmap(matrix, annot=True, linewidths=0.5, ax=ax, **extra_options)
        plt.title(title)
        plt.ylabel('Real Classes')
        plt.xlabel('Predicted Classes')
    plt.show()

    print('Classification Report')
    report = classification_report(y_true, y_pred, target_names=['No Vandalism', 'Vandalism'])
    print(report)
    
def plot_scores(scores):
    """Plot per-fold cross-validation scores, one line per metric.

    ``scores`` is a mapping of name -> sequence of per-fold values. The first
    two entries are skipped (presumably the timing entries of a
    ``cross_validate`` result -- confirm against the caller).

    The x axis is derived from the number of values of each metric, so any
    fold count works (the original hard-coded 10 folds).
    """
    print('Cross validation scores')
    metric_scores = list(scores.items())[2:]

    for name, values in metric_scores:
        # Keyword arguments: newer seaborn releases reject positional x / y.
        sns.lineplot(x=np.arange(len(values)), y=values, label=name)
        plt.legend(bbox_to_anchor=(1.1, 1.05), frameon=False)
    plt.show()

def plot_precision_recall(y_true, y_predict_proba):
    """Plot the precision-recall curve for binary labels and scores.

    ``y_true`` are the true labels and ``y_predict_proba`` the scores of the
    positive class. The legend shows the area under the plotted curve.
    The plot is drawn with pyplot on the current figure.
    """
    # The thresholds are not needed for the plot. The original additionally
    # computed an average-precision score that was never used; it is dropped.
    precision, recall, _ = precision_recall_curve(y_true, y_predict_proba)
    pr_auc = auc(recall, precision)

    plt.title('Precision-Recall')
    plt.step(recall, precision, color='b', where='post', label='AUC = %0.2f' % pr_auc)
    plt.xticks(np.arange(0, 1.1, 0.1))
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.legend(loc='lower right', frameon=False)
    plt.show()
    
def plot_roc(Y_true, Y_predict_proba, output_path=None):
    """Plot the ROC curve for binary labels and positive-class scores.

    Parameters
    ----------
    Y_true
        True binary labels.
    Y_predict_proba
        Scores / probabilities of the positive class.
    output_path
        Optional file path; when given, the figure is saved there before it
        is shown. Omitting it only displays the figure, which allows plain
        ``plot_roc(y, scores)`` calls.
    """
    fpr, tpr, _ = roc_curve(Y_true, Y_predict_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(4, 4))
    plt.plot(fpr, tpr, label='vandalism (AUC = %0.2f)' % roc_auc)
    # Diagonal = reference line where TPR equals FPR.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc=9, bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False)
    if output_path:
        plt.savefig(output_path, bbox_inches='tight')
    plt.show()


def plot_feature_importance(clf, feature_labels):
    """Bar chart of ``clf.feature_importances_`` labelled with ``feature_labels``.

    ``clf`` must expose a ``feature_importances_`` attribute (e.g. the weights
    of a random forest); ``feature_labels`` supplies one tick label per bar.
    """
    importances = clf.feature_importances_
    bar_positions = np.arange(len(importances))

    figure = plt.gcf()
    figure.set_size_inches(14, 6)
    plt.bar(bar_positions, importances, align='center')
    plt.xticks(bar_positions, feature_labels, rotation='vertical')
    plt.ylabel('Feature Importance')
    plt.show()


def plot_information_gain(X, y):
    """Bar chart of the mutual information between each column of ``X`` and ``y``.

    ``X`` must provide ``.columns`` (used as tick labels); the scores come
    from ``mutual_info_classif(X, y)``.
    """
    column_names = X.columns
    gains = mutual_info_classif(X, y)
    bar_positions = np.arange(len(gains))

    figure = plt.gcf()
    figure.set_size_inches(14, 6)
    plt.bar(bar_positions, gains, align='center')
    plt.xticks(bar_positions, column_names, rotation='vertical')
    plt.ylabel('Information Gain')
    plt.show()


def plot_multilabel_classification_report(Y_true, Y_predict, tag_names):
    """Print the classification report, using ``tag_names`` as target names."""
    report = classification_report(Y_true, Y_predict, target_names=tag_names)
    print(report)


def plot_multilabel_precision_recall(Y_true, Y_predict_proba, tag_names, output_path=''):
    """Plot one precision-recall curve per label column, with iso-F1 guides.

    Parameters
    ----------
    Y_true
        2-D array of true labels, one column per label.
    Y_predict_proba
        2-D array of scores with the same column layout as ``Y_true``.
    tag_names
        Display name per label column, used in the legend.
    output_path
        Optional file path. The figure is saved only when a non-empty path is
        given; the original passed the empty default straight to ``savefig``.
    """
    precision = dict()
    recall = dict()
    average_precision = dict()

    for i in range(Y_true.shape[1]):
        precision[i], recall[i], _ = precision_recall_curve(Y_true[:, i], Y_predict_proba[:, i])
        average_precision[i] = average_precision_score(Y_true[:, i], Y_predict_proba[:, i])
        print('precision', precision[i])
        print('recall', recall[i])

    plt.figure(figsize=(6, 6))
    f_scores = np.linspace(0.2, 0.8, num=4)
    lines = []
    labels = []
    for f_score in f_scores:
        # Iso-F1 curve: all (recall, precision) pairs that yield this F1 score.
        x = np.linspace(0.01, 1)
        y = f_score * x / (2 * x - f_score)
        iso_line, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
        plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

    # Only the last iso-F1 line is registered, giving a single legend entry
    # for the whole family of guide curves.
    lines.append(iso_line)
    labels.append('iso-f1 curves')

    for i in range(len(tag_names)):
        curve, = plt.plot(recall[i], precision[i], lw=2)
        lines.append(curve)
        labels.append('{0} (AUC = {1:0.2f})'
                    ''.format(tag_names[i], average_precision[i]))

    fig = plt.gcf()
    fig.subplots_adjust(bottom=0.25)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    plt.legend(lines, labels, loc=9, bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False)
    if output_path:
        plt.savefig(output_path, bbox_inches='tight')
    plt.show()


In [None]:
# Show the two probability columns of the reviewed results. Reads from
# `result_df` directly: `raised` is only bound (as an alias of `result_df`)
# in a later cell, so referencing it here fails on a fresh kernel.
result_df[['proba_no_vandalism', 'proba_vandalism']]

In [None]:
# NOTE: `raised` is an alias of `result_df` (no copy), so the 'tag' column
# added below also appears on `result_df`.
raised = result_df
# Normalise the reviewer tag: '(1)' -> 1, '(0)' -> 0, everything else is cast with int().
raised['tag'] = raised['martins_tag (0 = NV)'].map(
    lambda x: int({'(1)': 1, '(0)': 0}.get(x, x)))
proba_vandalism = raised[['proba_vandalism']]
tag = raised['tag']

# Single-column boolean label frame named 'vandalism'.
y_test_van = tag.astype(bool).rename('vandalism')
y_test_multi = y_test_van.to_frame()

# `.values` replaces `DataFrame.as_matrix()`, which was removed from pandas.
# NOTE(review): 'figures/precision_recall_ranking.pdf' is also written by the
# ranking plot in the Results section -- confirm that overwriting is intended.
plot_multilabel_precision_recall(y_test_multi.values,
                                 proba_vandalism.values,
                                 ['vandalism'],
                                 'figures/precision_recall_ranking.pdf')