<a href="https://colab.research.google.com/github/GoldPapaya/info256-applied-nlp/blob/main/6.tests/ParametricTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dbamman/anlp25/blob/main/6.tests/ParametricTest.ipynb)

This notebook explores a simple hypothesis test checking whether the accuracy of a trained model for binary classification is meaningfully different from a majority class baseline.  We test this making a parametric assumption: we assume that the binary correct/incorrect results follow a binomial distribution (and approximate the binomial with a normal distribution).

In [1]:
import sys
from collections import Counter
from math import sqrt

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.stats import norm
from sklearn import linear_model, preprocessing

Download the Convote dataset.

In [2]:
# get Convote data
!wget --no-check-certificate https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/train.tsv -O convote_train.tsv
!wget --no-check-certificate https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/dev.tsv -O convote_dev.tsv
!wget --no-check-certificate https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/test.tsv -O convote_test.tsv

--2025-09-25 23:31:14--  https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4660140 (4.4M) [text/plain]
Saving to: ‘convote_train.tsv’


2025-09-25 23:31:14 (107 MB/s) - ‘convote_train.tsv’ saved [4660140/4660140]

--2025-09-25 23:31:14--  https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 351382 (343K) [text/plain]
Saving to: ‘convote_dev.tsv’


2025-09-25 23:31:14

In [3]:
def read_data(filename):
    df = pd.read_csv(filename, names=["label", "text"], sep="\t")

    return df.text.to_list(), df.label.to_list()

In [4]:
x_train, y_train = read_data("convote_train.tsv")
x_dev, y_dev = read_data("convote_dev.tsv")

In [5]:
def majority_class(y_train, y_dev):
    label_counts = Counter(y_train)
    majority = label_counts.most_common(1)[0][0]

    correct = 0.
    for label in y_dev:
        if label == majority:
            correct += 1

    print("%s\t%.3f" % (majority, correct/len(y_dev)))
    return correct / len(y_dev)

In [6]:
# Here's a sample dictionary we can create by inspecting the output of the Mann-Whitney test (in 2.compare/)

dem_dictionary = set(["republican","cut", "opposition"])
repub_dictionary = set(["growth","economy"])

def political_dictionary_feature(tokens):
    feats = {}
    for word in tokens:
        if word in dem_dictionary:
            feats["word_in_dem_dictionary"] = 1
        if word in repub_dictionary:
            feats["word_in_repub_dictionary"] = 1
    return feats

In [7]:
def unigram_feature(tokens):
    feats = {}
    for word in tokens:
        feats["UNIGRAM_%s" % word] = 1
    return feats

In [8]:
def build_features(x_train, feature_functions):
    data = []
    for doc in x_train:
        feats = {}
        tokens = doc.split(" ")

        for function in feature_functions:
            feats.update(function(tokens))

        data.append(feats)
    return data

In [9]:
# This helper function converts a dictionary of feature names to unique numerical ids
def create_vocab(data):
    feature_vocab = {}
    idx = 0
    for doc in data:
        for feat in doc:
            if feat not in feature_vocab:
                feature_vocab[feat] = idx
                idx += 1

    return feature_vocab

In [10]:
# This helper function converts a dictionary of feature names to a sparse representation
# that we can fit in a scikit-learn model.  This is important because almost all feature
# values will be 0 for most documents (note: why?), and we don't want to save them all in
# memory.

def features_to_ids(data, feature_vocab):
    new_data = sparse.lil_matrix((len(data), len(feature_vocab)))
    for idx,doc in enumerate(data):
        for f in doc:
            if f in feature_vocab:
                new_data[idx, feature_vocab[f]] = doc[f]
    return new_data

In [11]:
# This function trains a model and returns the predicted and true labels for test data
def evaluate(x_train, x_dev, y_train, y_dev, feature_functions):
    x_train_feat = build_features(x_train, feature_functions)
    x_dev_feat = build_features(x_dev, feature_functions)

    # just create vocabulary from features in *training* data
    feature_vocab = create_vocab(x_train_feat)

    x_train_ids = features_to_ids(x_train_feat, feature_vocab)
    x_dev_ids = features_to_ids(x_dev_feat, feature_vocab)

    logreg = linear_model.LogisticRegression(C=1.0, solver='lbfgs', penalty='l2', max_iter=10000)
    logreg.fit(x_train_ids, y_train)
    predictions = logreg.predict(x_dev_ids)
    return (predictions, y_dev)

In [12]:
baseline = majority_class(y_train, y_dev)

R	0.506


In [13]:
def binomial_test(predictions, targets, baseline, significance_level=0.95):
    correct = [int(prediction == target) for prediction, target in zip(predictions, targets)]

    success_rate = np.mean(correct)

    # two-tailed test
    critical_value = (1 - significance_level) / 2
    # ppf finds z such that p(X < z) = critical_value
    z_alpha = -1 * norm.ppf(critical_value)
    print("Critical value: %.3f\tz_alpha: %.3f" % (critical_value, z_alpha))

    # the standard error is the square root of (the variance/sample size)
    # the variance for a binomial test is p*(1-p)
    standard_error = sqrt((success_rate * (1-success_rate)) / len(correct))

    Z = (success_rate - baseline) / standard_error
    lower = success_rate - z_alpha * standard_error
    upper = success_rate + z_alpha * standard_error
    pval = norm.cdf(-abs(Z))

    print ("Accuracy: %.3f, n = %s" % (success_rate, len(correct)))
    print("%s%% Confidence interval: [%.3f,%.3f]" % (significance_level*100, lower, upper))

    print("Z score: %.3f" % Z)
    print("p-value: %.5f" % pval)

    print ("Critical region corresponding to z_alpha=[%.3f,%.3f]: [%.3f, %.3f]" % (-z_alpha, z_alpha, baseline-z_alpha*standard_error, baseline+z_alpha*standard_error))
    print ("Can we reject null that %.3f is different from %.3f at %s significance level? %s" % (success_rate, baseline, significance_level*100, "Yes" if Z < -z_alpha or Z > z_alpha else "No"))

In [14]:
features = [political_dictionary_feature]
predictions, targets = evaluate(x_train, x_dev, y_train, y_dev, features)
binomial_test(predictions, targets, baseline, significance_level=.95)

Critical value: 0.025	z_alpha: 1.960
Accuracy: 0.541, n = 257
95.0% Confidence interval: [0.480,0.602]
Z score: 1.127
p-value: 0.12996
Critical region corresponding to z_alpha=[-1.960,1.960]: [0.445, 0.567]
Can we reject null that 0.541 is different from 0.506 at 95.0 significance level? No


In [15]:
features = [unigram_feature]
predictions, targets = evaluate(x_train, x_dev, y_train, y_dev, features)
binomial_test(predictions, targets, baseline, significance_level=.95)

Critical value: 0.025	z_alpha: 1.960
Accuracy: 0.693, n = 257
95.0% Confidence interval: [0.636,0.749]
Z score: 6.489
p-value: 0.00000
Critical region corresponding to z_alpha=[-1.960,1.960]: [0.449, 0.562]
Can we reject null that 0.693 is different from 0.506 at 95.0 significance level? Yes
