# Binary Classifiers

This section aims to implement models that predict whether an answer to a quetsion is within a document or not. 

3 models are implemented: two logistic regression models with GloVe and BPEmb embeddings, respectively, and a neural network which utilizies BERT embeddings.

In [1]:
%run /app/prepare_data.py english

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Adding  english to dict


In [33]:
from tqdm import tqdm
import numpy as np

# Logistic Regression with BPEmb embeddings

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from bpemb import BPEmb

In [35]:
bpemb_model = BPEmb(lang='en', dim=50, vs=25000) # change language here if needed
print(f"Showing an example of the model for the word hello:\n")
print(f"Subword tokens: {bpemb_model.encode('hello')}")
print(f"Values: {bpemb_model.embed('hello')}\n")
print("The model is divided into two subwords, 'he' and 'llo' - each with different values.\nThe value of the subword depends on the context.")

Showing an example of the model for the word hello:

Subword tokens: ['▁hel', 'lo']
Values: [[ 0.101289  0.067421  0.445592 -0.31668   0.210379 -0.102614  0.329681
   0.3211    0.045224  1.001417 -0.173007  0.146657  0.431487  0.265748
   0.367141  0.022575 -0.531671 -0.071386  0.141523 -0.026373  0.077574
   0.329998 -0.250906  0.071113 -0.104565  0.209173 -0.340813  0.079831
  -0.293946 -0.055277  0.375396  0.030002 -0.093638  0.300789 -0.793242
   0.253135  0.018888  0.031131  0.574456  0.155547  0.040618 -0.023185
   0.596075 -0.035391 -0.431783  0.649094 -0.141042  0.618206  0.156273
  -0.257083]
 [-0.104277 -0.03539   0.248758 -0.41444   0.476999  0.258883  0.035058
   0.273445 -0.342508  0.628149 -0.196841 -0.167787  0.058616 -0.096752
   0.11608  -0.638848 -0.09259  -0.081885 -0.453536  0.233586  0.508515
   0.261741 -0.00718   0.31664   0.021489 -0.263001 -0.267236  0.040581
  -0.72942   0.035982  0.260991  0.577699  0.028912  0.001675  0.082996
   0.405256 -0.345392 -0.188424

In [51]:
###  vector representation for each text; we decided to average the vector embeddings of each word
def get_bpemb_features(dataset, bpemb):
    X = [bpemb.embed(" ".join(x)).mean(0) for x in tqdm(dataset[:, 0])]
    y = list(dataset[:, 1])
    return X, y

In [52]:
for lang in languages:
    print(f"Training a logistic regression model for the {lang} language...")
    X_train = [item['doc_words'] for item in train_set_dict[lang]]
    X_val = [item['doc_words'] for item in val_set_dict[lang]]
    y_train = [item['answerable'] for item in train_set_dict[lang]]
    y_val = [item['answerable'] for item in val_set_dict[lang]]
    X_train_bpemb, y_train_bpemb = get_bpemb_features(np.transpose((X_train, y_train)), bpemb_model)
    X_val_bpemb, y_val_bpemb = get_bpemb_features(np.transpose((X_val, y_val)), bpemb_model)
    print(f"The first 15 words of the first text in the training set for {lang}: \n {X_train[0][:15]}...")
    print(f"The vector representation of that text: \n {X_train_bpemb[0][:15]} ...\n")
    classifier = LogisticRegression(penalty='l2', max_iter=1000)
    classifier.fit(X_train_bpemb, y_train_bpemb)
    preds = classifier.predict(X_val_bpemb)
    print(f"Classification report for the {lang} language:")
    print(classification_report(y_val_bpemb, preds))


Training a logistic regression model for the english language...


  result = getattr(asarray(obj), method)(*args, **kwds)
100%|██████████| 7389/7389 [00:04<00:00, 1769.22it/s]
100%|██████████| 990/990 [00:00<00:00, 1697.09it/s]


The first 15 words of the first text in the training set for english: 
 ['When', 'was', 'quantum', 'field', 'theory', 'developed', '?', ' [SEP] ', 'Quantum', 'field', 'theory', 'naturally', 'began', 'with', 'the']...
The vector representation of that text: 
 [-0.12067941 -0.4207224   0.1596494   0.12411743 -0.26608768  0.05689759
  0.12732057 -0.33251977  0.17155759  0.03208057 -0.14067082  0.04167471
  0.25372058  0.28912488  0.05295061] ...

Classification report for the english language:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       495
           1       0.66      0.71      0.68       495

    accuracy                           0.67       990
   macro avg       0.67      0.67      0.67       990
weighted avg       0.67      0.67      0.67       990



In [38]:
def get_unigram_features(X, vectorizer):
  X1 = vectorizer.transform(X[0])
  X2 = vectorizer.transform(X[1])
  X_ = sparse_hstack([X1, X2], format='csr')
  return X_

# Logistic Regression with GloVe embeddings

This embedding is only for the english language. 

In [39]:
import spacy
from scipy.sparse import hstack as sparse_hstack
import gensim.downloader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')
nlp = spacy.load('en_core_web_sm')

In [40]:
hello_vector = glove_vectors["hello"]
print(f"Showing an example of the model for the word hello:\n")
print(f"Embedding for 'hello': {hello_vector}")

print(f"The value still depends on context, but it is not divided into subwords.")

Showing an example of the model for the word hello:

Embedding for 'hello': [ 0.26688    0.39632    0.6169    -0.77451   -0.1039     0.26697
  0.2788     0.30992    0.0054685 -0.085256   0.73602   -0.098432
  0.5479    -0.030305   0.33479    0.14094   -0.0070003  0.32569
  0.22902    0.46557   -0.19531    0.37491   -0.7139    -0.51775
  0.77039    1.0881    -0.66011   -0.16234    0.9119     0.21046
  0.047494   1.0019     1.1133     0.70094   -0.08696    0.47571
  0.1636    -0.44469    0.4469    -0.93817    0.013101   0.085964
 -0.67456    0.49662   -0.037827  -0.11038   -0.28612    0.074606
 -0.31527   -0.093774  -0.57069    0.66865    0.45307   -0.34154
 -0.7166    -0.75273    0.075212   0.57903   -0.1191    -0.11379
 -0.10026    0.71341   -1.1574    -0.74026    0.40452    0.18023
  0.21449    0.37638    0.11239   -0.53639   -0.025092   0.31886
 -0.25013   -0.63283   -0.011843   1.377      0.86013    0.20476
 -0.36815   -0.68874    0.53512   -0.46556    0.27389    0.4118
 -0.854     

In [41]:
def text_to_word_embeddings(texts, glove_vectors):
    """
    Converts a list of texts into sequences of word embeddings.
    Handles sequences of arbitrary lengths without truncation.
    """
    embeddings = []
    for text in texts:
        words = text.split()  # Tokenize text
        word_embeddings = [
            glove_vectors[word] for word in words if word in glove_vectors
        ]
        if not word_embeddings:
            # Default to a single zero vector for empty texts
            word_embeddings = [np.zeros(100)]
        embeddings.append(np.array(word_embeddings))
    return embeddings

In [42]:
for lang in languages:
    print(f"Training a logistic regression model for the {lang} language...")
    
    # Combine question and document text for training and validation
    X_train = [
        f"{q} {d}" for q, d in zip(train_set_dict[lang]['question_text'], train_set_dict[lang]['document_plaintext'])
    ]
    X_val = [
        f"{q} {d}" for q, d in zip(val_set_dict[lang]['question_text'], val_set_dict[lang]['document_plaintext'])
    ]
    
    y_train = [item['answerable'] for item in train_set_dict[lang]]
    y_val = [item['answerable'] for item in val_set_dict[lang]]
    # Convert texts to sequences of concatenated word embeddings
    X_train_embedded = text_to_word_embeddings(X_train, glove_vectors)
    X_val_embedded = text_to_word_embeddings(X_val, glove_vectors)
    
    # Logistic regression requires fixed input size
    # Use average pooling as a simple solution
    X_train_fixed = np.array([np.mean(x.reshape(-1, 100), axis=0) for x in X_train_embedded])
    X_val_fixed = np.array([np.mean(x.reshape(-1, 100), axis=0) for x in X_val_embedded])
    
    print(f"The first text in the training set for {lang}: \n {X_train[0]}")
    print(f"The GloVe representation (averaged): \n {X_train_fixed[0]} \n")
    
    # Train logistic regression classifier
    classifier = LogisticRegression(penalty='l2', max_iter=1000)
    classifier.fit(X_train_fixed, y_train)
    
    # Predict and evaluate
    preds = classifier.predict(X_val_fixed)
    report = classification_report(y_val, preds)
    print(f"Classification report for the {lang} language:\n{report}")


Training a logistic regression model for the english language...
The first text in the training set for english: 
 When was quantum field theory developed? Quantum field theory naturally began with the study of electromagnetic interactions, as the electromagnetic field was the only known classical field as of the 1920s.[8]:1
The GloVe representation (averaged): 
 [-0.06727608  0.22719015  0.21718472  0.13494685  0.15845731  0.03247473
  0.15612006 -0.06552999 -0.6081013   0.15663871  0.08369958 -0.18789771
  0.1966625   0.07496168  0.26428124 -0.26750705  0.2341432   0.2525144
 -0.015551   -0.19929105  0.12247273 -0.2077063   0.35634932  0.01751207
  0.23028867 -0.24267037  0.04861677 -0.27640134  0.13263808  0.17019576
 -0.32764196  0.37578768 -0.09675872  0.19454098 -0.07862025  0.00471331
 -0.06276357  0.6262489  -0.07622276 -0.15160385 -0.48151222 -0.13967013
 -0.04628307 -0.18813136 -0.03559667 -0.00297623  0.32025963 -0.03209976
 -0.16816299 -0.37845576  0.1398484   0.01465227  0

# The BERT Model

In [None]:
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score, accuracy_score

In [None]:
model_name = "bert-base-multilingual-cased"
val_set_dict_, test_set_dict_ = separate_val_set(val_set_dict, languages, test_size=0.1, random_state=42)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Processing language: english
Validation set size for english: 891
Test set size for english: 99


In [None]:
print(f"Showing an example of tokenizing from BERT for the word hello:\n")
model = AutoModel.from_pretrained(model_name)
tokens = tokenizer.tokenize("hello")
print(f"Tokens: {tokens}")
encoded = tokenizer("hello", return_tensors="pt")
print(f"Encoded IDs: {encoded['input_ids']}")
decoded = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
print(f"Token Sequence: {decoded}")

Showing an example of tokenizing from BERT for the word hello:

Tokens: ['hell', '##o']
Encoded IDs: tensor([[  101, 61694, 10133,   102]])
Token Sequence: ['[CLS]', 'hell', '##o', '[SEP]']


In [None]:
id2label = {0: "no_answer", 1: "answer"}
label2id = {"no_answer": 0, "answer": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def bert_reformat(example):
    """
    Match the format of the data to the BERT model.
    """
    example.pop("document_title")
    example["text"] = example.pop("question_text") + example.pop("document_plaintext")
    example['label'] = int(len(example['annotations']['answer_text'][0]) > 0)
    example.pop("annotations")
    return example


def bert_prepare(train_dict, val_dict, test_dict):
    """
    Prepares datasets for BERT by applying reformatting and selecting columns.

    Parameters:
        train_dict (dict): Dictionary containing train datasets by language.
        val_dict (dict): Dictionary containing validation datasets by language.
        test_dict (dict): Dictionary containing test datasets by language.

    Returns:
        dict: Dictionary of DatasetDicts for each language.
    """
    def filter_columns(dataset, required_columns):
        # helper function to select columns
        return dataset.map(lambda x: {key: x[key] for key in required_columns})

    dict_list = {}
    for key in train_dict.keys():
        hugging_dict = DatasetDict()

        # reformatting the validation and test datasets: train is already formatted correctly since we didn't split it
        val_dataset = Dataset.from_dict(val_dict[key])
        test_dataset = Dataset.from_dict(test_dict[key])

        # Reformat and transform the train dataset
        hugging_dict['train'] = train_dict[key].map(bert_reformat)

        # Reformat and transform the validation dataset
        hugging_dict['validation'] = val_dataset.map(bert_reformat)

        # Reformat and transform the test dataset
        hugging_dict['test'] = test_dataset.map(bert_reformat)

        dict_list[key] = hugging_dict

    return dict_list
def tokenizer_bert(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512)
bert_data = bert_prepare(train_set_dict, val_set_dict_, test_set_dict_)
tokenized_bert_data = bert_data['english'].map(tokenizer_bert, batched=True)

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/7389 [00:00<?, ? examples/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_score(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="english",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_bert_data['train'],
    eval_dataset=tokenized_bert_data['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model("english_classifier")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3869,0.327685,0.84624
2,0.2588,0.338619,0.867565


In [None]:
predictions = trainer.predict(tokenized_bert_data["test"])
predicted_labels = predictions.predictions.argmax(-1)
print(f"Predicted labels: {predicted_labels[:100]}")
print(f"True labels: {tokenized_bert_data['test']['label'][:100]}")

Predicted labels: [0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 1
 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1
 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1]
True labels: [1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1]


In [None]:
true_labels = tokenized_bert_data["test"]["label"]
acc = accuracy_score(y_true=predicted_labels, y_pred=true_labels)
f1 = f1_score(true_labels, predicted_labels)
print(f"Accuracy: {acc}")
print(f"F1 score: {f1}")

Accuracy: 0.8181818181818182
F1 score: 0.8043478260869565
