In [None]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# project-local packages imported below can be resolved from the notebook.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [None]:
import pandas as pd
from classifier.single_label_classifier import SingleLabelClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve, average_precision_score, precision_recall_fscore_support
from preprocessing.features import BASELINE_FEATURE_COLUMNS
from preprocessing.preprocessor import Preprocessor
from preprocessing.tags import Tags
from evaluation.utils import create_confusion_matrix, create_classification_report, create_precision_recall_graph, create_receiver_operation_characteristic_graph, create_error_matrix_graph, create_feature_importance_graph, false_positives, false_negatives, create_revision_link_html, create_revisions_by_tag_graph
import csv

## Preprocess dataset and evaluate undersampling rates

In [None]:
# --- Inputs -----------------------------------------------------------------
# Path of the feature CSV handed to the Preprocessor, plus the tag table.
features = '../data/features-total-80c4772.csv'
tag_df = pd.read_csv('../data/tag.csv')

# Tags passed to the Preprocessor for every run of the sweep.
all_tags = [
    Tags.BLANKING, Tags.NONSENSE, Tags.QUALITY_ISSUE, Tags.PERSONAL_STORY,
    Tags.FALSE_FACT, Tags.SYNTAX, Tags.MERGE_CONFLICT, Tags.TEMPLATE,
    Tags.EDIT_WARS, Tags.SEO, Tags.INTENTION,
]

# --- Sweep definition -------------------------------------------------------
# Candidate rates 0.05, 0.10, ..., 0.95 (19 values). Integer percent / 100
# yields exactly the same floats as writing the decimal literals out by hand.
undersampling_rates = [percent / 100 for percent in range(5, 100, 5)]

# Metric and class labels used to build the report column names.
scores = ['precision', 'recall', 'f1', 'support']
classes = ['no_vandalism', 'vandalism']

# Accumulates one result dict per evaluated rate.
report = []

# Sweep the vandalism sample rate. For each candidate rate the dataset is
# re-preprocessed, a new SingleLabelClassifier is built, and per-class
# precision / recall / F1 / support are recorded for both the training and the
# test split. One dict per rate is appended to `report`.
for rate in undersampling_rates:
    # Pass the swept rate to the preprocessor. The value used to be hard-coded
    # to 0.3, so every iteration evaluated the same sampling rate while the
    # report row was labelled with a different one.
    preprocessor = Preprocessor(features, all_tags, vandalism_sample_rate=rate)
    output = preprocessor.run()
    clf = SingleLabelClassifier(output, 300)

    row = {'rate': rate}

    print(f'Rate: {rate}, Training')
    y_train_predict, y_train_predict_proba = clf.train_predict()
    y_train = output.y_train
    # precision_recall_fscore_support returns the tuple
    # (precision, recall, fscore, support), each indexable by class, which is
    # why `prfs[s][c]` lines up with `scores[s]` and `classes[c]`.
    # NOTE(review): assumes the label order is no_vandalism first, vandalism
    # second, and that both classes occur in the data — confirm.
    prfs = precision_recall_fscore_support(y_train, y_train_predict)
    for s, score_name in enumerate(scores):
        for c, class_name in enumerate(classes):
            row[f'train_{score_name}_{class_name}'] = prfs[s][c]

    print(f'Rate: {rate}, Test')
    y_predict, y_predict_proba = clf.test_predict()
    y = output.y_test
    prfs = precision_recall_fscore_support(y, y_predict)
    print(create_classification_report(y, y_predict))

    for s, score_name in enumerate(scores):
        for c, class_name in enumerate(classes):
            row[f'test_{score_name}_{class_name}'] = prfs[s][c]

    print(row)
    report.append(row)

# Write the collected rows to CSV: a header row followed by one row per rate.
# The file has to be opened for writing (open() defaults to read mode), and
# newline='' lets the csv module control line endings itself. The rows are
# dicts, so csv.DictWriter is required; csv.writer has no dict-writing method.
# Column order follows the key order of the first row; every row is built with
# the same keys in the same order.
fieldnames = list(report[0]) if report else []
with open('undersampling_report.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(report)