# Distinguishing MHPs and Peers

This is a simple binary classifier using unigrams. 

In [2]:
""" imports """
import pandas as pd 
import random

from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
import time
from sklearn import svm, linear_model, naive_bayes



In [3]:
""" parameters """
KFOLDS = 10 # set desired num folds
test_proportion = 1 / KFOLDS

SHUFFLE_FIRST = True # true if shuffling before test splits
random_seed = 20

min_count = 5 # min occurences of word

LabelIDX = {'mhp': 0, 'peer': 1}
idx2Label = {v:k for k,v in LabelIDX.items()}


classifier = 'naive_bayes' # LinearSVC, LogisticRegression


In [4]:
""" load dataset """

datapath = "path to the dataset" # the data was in the same order as ../Data/task_data.csv

df = pd.read_pickle(datapath)

mhp_df = df[df['author-type'] == 'mhp']
peer_df = df[df['author-type'] == 'non-mhp']

mhp_text = list(mhp_df['top-reply-text'].values)
peer_text = list(peer_df['top-reply-text'].values)
mhp_index = list(mhp_df['top-reply-text'].index)
peer_index = list(peer_df['top-reply-text'].index) 

In [5]:
""" downsample """
random.seed(random_seed)

peer_sample = random.sample(list(zip(peer_text, peer_index)), k=len(mhp_df))
peer_text_sample, peer_index_sample = zip(*peer_sample)

In [6]:
""" determine train and test indices for the k-folds """

n_samples = len(mhp_df) # n_samples / class, based on size of smaller class for even split
master_indices = list(range(n_samples))

if SHUFFLE_FIRST:
    random.seed(random_seed)
    random.shuffle(master_indices)


fold_indices = {}
for fold in range(KFOLDS):

    test_idx_start = int((fold * test_proportion) * n_samples)
    test_idx_end = int(test_idx_start + test_proportion * n_samples)

    # get indices for fold
    test_range = list(range(test_idx_start, test_idx_end))
    train_range = list(range(0,test_idx_start)) + list(range(test_idx_end, n_samples))

    # shuffle indices
    test_range = [master_indices[i] for i in test_range]
    train_range = [master_indices[i] for i in train_range]

    # save indices
    fold_indices[fold] = {'train':train_range, 'test':test_range}

In [7]:
""" load functions """

def preprocess(sentence):
    return [i.lower() for i in word_tokenize(sentence)]

def get_vocab(text_list, min_count=5):
    """
    text_list: list of text instances
    """
    vocab_count = defaultdict(lambda: 0)
    for text in tqdm(text_list):
        tokens = preprocess(text)
        for t in tokens:
            vocab_count[t] += 1

    vocab_set = set()
    for k,v in vocab_count.items():
        if v >= min_count:
            vocab_set.add(k)
    return list(vocab_set)

def map_instance_features(text, vocab2index):
    tokens = preprocess(text)
    feature_map = {} # maps features (tokens) to indices in the feature vector
    for t in tokens:
        if t in feature_map:
            feature_map[vocab2index[t]] += 1
        elif t in vocab2index:
            feature_map[vocab2index[t]] = 1
    return feature_map

def vectorize(text_datapoints, vocab):
    vocab2index = {vocab[i]: i for i in range(len(vocab))} #vc

    feature_matrix_indices = [] # j_indices
    indptr = [0] 
    values = []
    
    for text_instance in tqdm(text_datapoints):
        feature_map = map_instance_features(text_instance, vocab2index)
        
        # feature map indices for this instance
        fm_idx = list(feature_map.keys())

        # feature map values
        fm_values = list(feature_map.values())

        feature_matrix_indices.extend(fm_idx)
        values.extend(fm_values)
        indptr.append(len(feature_matrix_indices))

    feature_matrix_indices = np.asarray(feature_matrix_indices)
    indptr = np.asarray(indptr)
    values = np.asarray(values)

    return sp.csr_matrix((values, feature_matrix_indices, indptr), shape=(len(indptr) - 1, len(vocab)))

In [8]:
""" run k-fold experiments """
start = time.time()

fold_vector_df = defaultdict(lambda:[])
results_df = defaultdict(lambda:[])


for fold in fold_indices:
    print("\nFold {}".format(fold + 1))
    print('-'*80)

    """
    1. Train / Test Split
    """

    train_range = fold_indices[fold]['train']
    test_range = fold_indices[fold]['test']

    train = [[mhp_text[i], 'mhp'] for i in train_range] + [[peer_text_sample[i], 'peer'] for i in train_range]
    train_index = [mhp_index[i] for i in train_range] + [peer_index_sample[i] for i in train_range]

    test = [[mhp_text[i], 'mhp'] for i in test_range] + [[peer_text_sample[i], 'peer'] for i in test_range]
    test_index = [mhp_index[i] for i in test_range] + [peer_index_sample[i] for i in test_range]

    points, labels = zip(*train)
    train_label_counts = Counter(labels)

    z = [i for i in range(len(points))]
    random.shuffle(z)
    p2 = [points[i] for i in z]
    l2 = [labels[i] for i in z]
    train_index = [train_index[i] for i in z]
    points = p2
    labels = l2

    """
    2. Get Vocab
    """
    vocab = get_vocab(points, min_count=min_count)
    vocabsize = len(vocab)


    """
    3. Vectorize Data
    """
    train_vecs = vectorize(points, vocab)

    tp, tl = zip(*test)
    test_vecs = vectorize(tp, vocab)

    """
    4. Get Labels
    """
    train_labels = [LabelIDX[i] for i in labels]
    train_label_counts = Counter(train_labels)


    test_labels = [LabelIDX[i] for i in tl]
    test_label_counts = Counter(test_labels)


    """
    5. Save a random basline
    """
    # a vector of labels that matches the distribution of the test labels
    test_label_distribution = [] 
    for label, count in test_label_counts.items():
        for i in range(count):
            test_label_distribution.append(label)

    random_baseline = [random.choice(test_label_distribution) for i in test_labels]

    random_baseline_correct = np.sum(np.array(random_baseline) == test_labels)
    random_baseline_accuracy = random_baseline_correct / len(test_labels)


    """
    6. Fit Classifier
    """
    if classifier == 'LinearSVC':
        print('Fitting LinearSVC classifier...')
        clf = svm.LinearSVC(verbose=1, max_iter=10000)
    elif classifier == 'LogisticRegression':
        print('Fitting LogisticRegression classifier...')
        clf = linear_model.LogisticRegression()
    else:
        print('Fitting naive_bayes classifier...')
        # defaults to naive_bayes
        clf = naive_bayes.MultinomialNB()

    clf.fit(train_vecs, train_labels)


    """
    7. Predict
    """
    predictions = clf.predict(test_vecs)
    total_correct_predictions = np.sum(predictions == test_labels)
    accuracy = total_correct_predictions / len(test_labels)


    """
    8. Output accuracy
    """
    print("\nFOLD {} RESULTS".format(fold + 1))
    print("+", "-" * 50, "+")
    print('Random baseline accuracy: {:.2%}'.format(random_baseline_accuracy))
    print('Model accuracy: {:.2%}'.format(accuracy))
    print("+", "-" * 50, "+", '\n')


    """
    9. Save info for analysis
    """
    fold_vector_df['seed'].extend([random_seed for i in test_labels])
    fold_vector_df['fold'].extend([fold for i in test_labels])
    fold_vector_df['predictions'].extend([idx2Label[pred] for pred in predictions])
    fold_vector_df['random-baseline'].extend([idx2Label[pred] for pred in random_baseline])
    fold_vector_df['actuals'].extend([idx2Label[pred] for pred in test_labels])
    fold_vector_df['index'].extend(test_index)


    results_df['seed'].append(random_seed)
    results_df['fold'].append(fold)
    results_df['accuracy'].append(accuracy)
    results_df['random_baseline_accuracy'].append(random_baseline_accuracy)
    results_df['vocabsize'].append(len(vocab))
    results_df['len(train)'].append(len(train))
    results_df['len(test)'].append(len(test))
    results_df['% train'].append(len(train) / (len(train) + len(test)))
    results_df['% test'].append(len(test) / (len(train) + len(test)))
    results_df['test label distribution'].append(test_label_counts)

results_df = pd.DataFrame(results_df, columns=results_df.keys())
fold_vector_df = pd.DataFrame(fold_vector_df, columns=fold_vector_df.keys())

end = time.time()

total_time = end - start

print(f"Complete. Execution time: {total_time}s")


  1%|          | 165/17434 [00:00<00:10, 1641.82it/s]


Fold 1
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2429.31it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2445.34it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2504.07it/s]
  2%|▏         | 285/17434 [00:00<00:06, 2836.01it/s]

Fitting naive_bayes classifier...

FOLD 1 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 49.74%
Model accuracy: 69.37%
+ -------------------------------------------------- + 


Fold 2
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2455.61it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2454.67it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2404.42it/s]
  1%|▏         | 253/17434 [00:00<00:06, 2528.62it/s]

Fitting naive_bayes classifier...

FOLD 2 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 50.46%
Model accuracy: 71.28%
+ -------------------------------------------------- + 


Fold 3
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2449.15it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2434.46it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2449.43it/s]
  1%|▏         | 229/17434 [00:00<00:07, 2284.73it/s]

Fitting naive_bayes classifier...

FOLD 3 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 50.72%
Model accuracy: 72.31%
+ -------------------------------------------------- + 


Fold 4
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2410.30it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2449.97it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2482.10it/s]
  1%|▏         | 241/17434 [00:00<00:07, 2402.67it/s]

Fitting naive_bayes classifier...

FOLD 4 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 49.54%
Model accuracy: 71.44%
+ -------------------------------------------------- + 


Fold 5
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2421.19it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2389.41it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2399.95it/s]
  1%|▏         | 260/17434 [00:00<00:06, 2591.12it/s]

Fitting naive_bayes classifier...

FOLD 5 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 51.08%
Model accuracy: 68.80%
+ -------------------------------------------------- + 


Fold 6
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2384.45it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2374.04it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2444.82it/s]
  1%|▏         | 246/17434 [00:00<00:06, 2456.35it/s]

Fitting naive_bayes classifier...

FOLD 6 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 49.33%
Model accuracy: 72.16%
+ -------------------------------------------------- + 


Fold 7
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2426.39it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2412.88it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2300.06it/s]
  1%|▏         | 253/17434 [00:00<00:06, 2513.47it/s]

Fitting naive_bayes classifier...

FOLD 7 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 50.31%
Model accuracy: 72.83%
+ -------------------------------------------------- + 


Fold 8
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2428.15it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2417.47it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2365.08it/s]
  1%|▏         | 232/17434 [00:00<00:07, 2317.83it/s]

Fitting naive_bayes classifier...

FOLD 8 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 50.93%
Model accuracy: 69.89%
+ -------------------------------------------------- + 


Fold 9
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2380.16it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2395.82it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2376.95it/s]
  2%|▏         | 268/17434 [00:00<00:06, 2678.38it/s]

Fitting naive_bayes classifier...

FOLD 9 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 51.45%
Model accuracy: 69.99%
+ -------------------------------------------------- + 


Fold 10
--------------------------------------------------------------------------------


100%|██████████| 17434/17434 [00:07<00:00, 2420.64it/s]
100%|██████████| 17434/17434 [00:07<00:00, 2409.14it/s]
100%|██████████| 1936/1936 [00:00<00:00, 2353.51it/s]

Fitting naive_bayes classifier...

FOLD 10 RESULTS
+ -------------------------------------------------- +
Random baseline accuracy: 49.69%
Model accuracy: 69.89%
+ -------------------------------------------------- + 

Complete.





In [14]:
""" save experiment info to file """

results_df.to_pickle('./clf-results/results_df.pickle')
fold_vector_df.to_pickle('./clf-results/fold_vector_df.pickle')

## Analysis

In [16]:
""" feature space """

print("Num Features (= |V|)")
print("+", "-" * 50, "+")

print("Minimum features across folds:", min(results_df['vocabsize'].values))
print("Maximum features across folds:", max(results_df['vocabsize'].values))

print("+", "-" * 50, "+")

Num Features (= |V|)
+ -------------------------------------------------- +
Minimum features across folds: 8668
Maximum features across folds: 8703
+ -------------------------------------------------- +


In [15]:
""" accuracy avg over folds """

correct = np.sum(fold_vector_df['random-baseline'].values == fold_vector_df['actuals'].values)
accuracy = correct / len(fold_vector_df['random-baseline'].values)
print("Random baseline accuracy across all folds: {:.2%}".format(accuracy))


correct = np.sum(fold_vector_df['predictions'].values == fold_vector_df['actuals'].values)
accuracy = correct / len(fold_vector_df['predictions'].values)
print("Model accuracy across all folds: {:.2%}".format(accuracy))

results_df

Random baseline accuracy across all folds: 50.33%
Model accuracy across all folds: 70.80%


Unnamed: 0,seed,fold,accuracy,random_baseline_accuracy,vocabsize,len(train),len(test),% train,% test,test label distribution
0,20,0,0.693698,0.497417,8672,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
1,20,1,0.71281,0.504649,8694,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
2,20,2,0.72314,0.507231,8692,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
3,20,3,0.71436,0.495351,8697,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
4,20,4,0.688017,0.510847,8696,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
5,20,5,0.721591,0.493285,8703,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
6,20,6,0.728306,0.503099,8686,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
7,20,7,0.698864,0.509298,8668,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
8,20,8,0.699897,0.514463,8672,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"
9,20,9,0.698864,0.496901,8683,17434,1936,0.900052,0.099948,"{0: 968, 1: 968}"


In [20]:
""" precision and recall """

def clf_performance(fold_vector_df, metric='accuracy', label='o5'):
    """
    metric: string specifying the performance metric (default='accuracy'
            other options: 'precision', 'recall', 'f1_score')
    class_type: default='mhp'
                other options: 'peer'
    """

    class_df = fold_vector_df[fold_vector_df['predictions'] == label]
    other_df = fold_vector_df[fold_vector_df['predictions'] != label]

    tp = np.sum(class_df['predictions'].values == class_df['actuals'].values)
    tn = np.sum(other_df['predictions'].values == other_df['actuals'].values)
    fp = np.sum(class_df['predictions'].values != class_df['actuals'].values)
    fn = np.sum(other_df['predictions'].values != other_df['actuals'].values)
        
    if metric == 'accuracy':
        return (tp + tn) / (tp + fn + fp + tn)
    elif metric == 'precision':
        return tp / (tp + fp)
    elif metric == 'recall':
        return tp / (tp + fn)
    elif metric == 'f1_score':
        prec = tp / (tp + fp)
        recall = tp / (tp + fn)
        return 2 * prec * recall / (prec + recall)
    else:
        return -1




table = defaultdict(lambda:[])
table['Model'].append(classifier)

labels = ['mhp', 'peer']

for label in labels:
    precision = clf_performance(fold_vector_df, label=label, metric='precision')
    recall = clf_performance(fold_vector_df, label=label, metric='recall')
    f1 = clf_performance(fold_vector_df, label=label, metric='f1_score')

    column = '{}: '.format(label)
    table[column + 'P'].append("{:.3f}".format(precision))
    table[column + 'R'].append("{:.3f}".format(recall))
    table[column + 'F1'].append("{:.3f}".format(f1))

table = pd.DataFrame(table, columns=table.keys())

print("Table: Precision and Recall results.")
print("+", "-" * 100, "+")
print(table.to_markdown(index=None))
print("+", "-" * 100, "+")



Table: Precision and Recall results.
+ ---------------------------------------------------------------------------------------------------- +
| Model       |   mhp: P |   mhp: R |   mhp: F1 |   peer: P |   peer: R |   peer: F1 |
|:------------|---------:|---------:|----------:|----------:|----------:|-----------:|
| naive_bayes |    0.718 |    0.685 |     0.701 |     0.699 |     0.731 |      0.715 |
+ ---------------------------------------------------------------------------------------------------- +


In [17]:
""" statistical significance test using non-parametric bootstrap resampling """

def prediction_differences(model_predictions, baseline_predictions, actuals):
    differences = []
    for model_pred, baseline_pred, actual in zip(model_predictions, baseline_predictions, actuals):

        # both correct
        if model_pred == actual and baseline_pred == actual:
            difference = 0
        # both incorrect
        elif model_pred != actual and baseline_pred != actual:
            difference = 0
        # model correct but baseline is not
        elif model_pred == actual and baseline_pred != actual:
            difference = 1
        # baseline correct but model is not
        elif model_pred != actual and baseline_pred == actual:
            difference = -1

        differences.append(difference)
    return differences

def bootstrap_resampling_test(differences, N=10000):
    # 1 = helped, 0 = not helped
    resample_results = []
    K = len(differences)
    print("Beginning bootstrap resampling")
    for i in tqdm(range(N)):
        samples = random.choices(differences, k=K)
        summed_differences = sum(samples)
        helped = summed_differences > 0
        resample_results.append(helped)

    helped_counts = Counter(resample_results)

    p_value = (N - helped_counts[True]) / N
    return p_value, helped_counts


# get difference between predictions and random baseline
prediction_differences = prediction_differences(fold_vector_df['predictions'].values, fold_vector_df['random-baseline'].values, fold_vector_df['actuals'].values)

# run test
N = 10000
p_value, helped_counts = bootstrap_resampling_test(prediction_differences, N=N)



print("\nStatistical significance test results")
print("+", "-" * 50, "+")

print("Model helps in {}/{} or {:.2%} of bootstrap resamples".format(helped_counts[True], sum(helped_counts.values()), helped_counts[True]/sum(helped_counts.values())))

print("p-value < {}".format((N - helped_counts[True] + 1) / N ))

correct = np.sum(fold_vector_df['predictions'].values == fold_vector_df['actuals'].values)
accuracy = correct / len(fold_vector_df['predictions'].values)
print("Accuracy across all folds: {:.2%}".format(accuracy))
print("+", "-" * 50, "+")


  0%|          | 49/10000 [00:00<00:20, 485.49it/s]

Beginning bootstrap resampling


100%|██████████| 10000/10000 [00:19<00:00, 511.51it/s]


Statistical significance test results
+ -------------------------------------------------- +
Model helps in 10000/10000 or 100.00% of bootstrap resamples
p-value < 0.0001
Accuracy across all folds: 70.80%
+ -------------------------------------------------- +



