In [10]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from peft import get_peft_model, LoraConfig, TaskType
import faiss
from sentence_transformers import SentenceTransformer
import os

In [11]:
# Load data
data = pd.read_parquet("auto_tagging_data_v2.parquet")
all_tags = sorted(list(set(tag for tags in data['Tags'] for tag in tags)))
number_of_tags = len(all_tags)

In [25]:
data.head(10)

Unnamed: 0,Id,Title,Body,Tags
0,6,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=...",[machine-learning]
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demog...,[forecasting]
2,22,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...,[bayesian]
3,31,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...,"[hypothesis-testing, t-test, p-value, interpre..."
4,36,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n...",[correlation]
5,93,Robust nonparametric estimation of hazard/surv...,<p>We're trying to use a Gaussian process to m...,"[nonparametric, survival]"
6,95,How Large a Difference Can Be Expected Between...,<p>I have been using various GARCH-based model...,[time-series]
7,103,What is your favorite data visualization blog?,<p>What is the best blog on data visualization...,"[data-visualization, references]"
8,113,What are some good frameworks for method selec...,<p>I have been looking into theoretical framew...,[machine-learning]
9,114,What statistical blogs would you recommend?,<p>What statistical research blogs would you r...,[references]


In [22]:
all_tags

['algorithms',
 'anova',
 'arima',
 'autocorrelation',
 'bayesian',
 'binary-data',
 'binomial',
 'bootstrap',
 'cart',
 'categorical-data',
 'chi-squared',
 'classification',
 'clustering',
 'conditional-probability',
 'confidence-interval',
 'correlation',
 'covariance',
 'cox-model',
 'cross-validation',
 'data-mining',
 'data-transformation',
 'data-visualization',
 'dataset',
 'deep-learning',
 'distributions',
 'econometrics',
 'estimation',
 'expected-value',
 'experiment-design',
 'factor-analysis',
 'feature-selection',
 'forecasting',
 'generalized-linear-model',
 'goodness-of-fit',
 'hypothesis-testing',
 'inference',
 'interaction',
 'interpretation',
 'least-squares',
 'linear-model',
 'logistic',
 'machine-learning',
 'mathematical-statistics',
 'matlab',
 'maximum-likelihood',
 'mcmc',
 'mean',
 'missing-data',
 'mixed-model',
 'model',
 'model-selection',
 'modeling',
 'monte-carlo',
 'multilevel-analysis',
 'multiple-comparisons',
 'multiple-regression',
 'multivariate

In [12]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load a pre-trained sentence transformer model for embedding
retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all data bodies for retrieval indexing
corpus_embeddings = retrieval_model.encode(data['Body'].tolist())
dimension = corpus_embeddings.shape[1]

# Initialize FAISS index
index = faiss.IndexFlatL2(dimension)
index.add(np.array(corpus_embeddings))

In [13]:
# Retrieve context function
def retrieve_context(input_text, top_k=3):
    input_embedding = retrieval_model.encode([input_text])
    _, top_k_indices = index.search(np.array(input_embedding), top_k)
    return [data['Body'].iloc[i] for i in top_k_indices[0]]


# Function to augment text with context
def augment_with_context(input_text):
    context = retrieve_context(input_text)
    return input_text + " " + " ".join(context)


def encode_tags(tags):
    encoding = np.zeros(len(all_tags))
    for tag in tags:
        if tag in all_tags:
            index = all_tags.index(tag)
            encoding[index] = 1
    return encoding

In [14]:
labels = data['Tags'].apply(encode_tags)

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=number_of_tags)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.2,
)
lora_model = get_peft_model(model, lora_config)

In [16]:
class AutoTaggingDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = augment_with_context(self.texts[idx])  # Augment input text with context
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        
        return inputs, label

In [17]:
# Initialize DataLoader
dataset = AutoTaggingDataset(data["Body"], labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model.to(device)

optimizer = AdamW(lora_model.parameters(), lr=2e-4, weight_decay=0.001)
epochs = 10
warmup_steps = int(0.1 * len(dataloader))
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

patience = 2
best_val_loss = float('inf')
early_stop_counter = 0

for epoch in range(epochs):
    lora_model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in dataloader:
        inputs, labels = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = lora_model(**inputs)
        logits = outputs.logits

        loss = torch.nn.BCEWithLogitsLoss()(logits, labels)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(lora_model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        probs = torch.sigmoid(logits).detach().cpu().numpy()
        preds = (probs > 0.5).astype(int)
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    avg_train_loss = total_loss / len(dataloader)
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    exact_match_ratio = np.mean(np.all(all_preds == all_labels, axis=1))
    label_accuracy = np.mean(np.equal(all_preds, all_labels).astype(float), axis=0).mean()
    f1 = f1_score(all_labels, all_preds, average='macro')

    if avg_train_loss < best_val_loss:
        best_val_loss = avg_train_loss
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.4f}, Exact Match Ratio: {exact_match_ratio:.4f}, "f"Label-based Accuracy: {label_accuracy:.4f}, F1 Score: {f1:.4f}")



Epoch 1, Loss: 0.0880, Exact Match Ratio: 0.0229, Label-based Accuracy: 0.9771, F1 Score: 0.0289
Epoch 2, Loss: 0.0662, Exact Match Ratio: 0.0827, Label-based Accuracy: 0.9819, F1 Score: 0.1126
Epoch 3, Loss: 0.0611, Exact Match Ratio: 0.1146, Label-based Accuracy: 0.9826, F1 Score: 0.1888
Epoch 4, Loss: 0.0582, Exact Match Ratio: 0.1326, Label-based Accuracy: 0.9831, F1 Score: 0.2427
Epoch 5, Loss: 0.0562, Exact Match Ratio: 0.1420, Label-based Accuracy: 0.9834, F1 Score: 0.2825
Epoch 6, Loss: 0.0549, Exact Match Ratio: 0.1520, Label-based Accuracy: 0.9836, F1 Score: 0.3090
Epoch 7, Loss: 0.0538, Exact Match Ratio: 0.1610, Label-based Accuracy: 0.9839, F1 Score: 0.3291
Epoch 8, Loss: 0.0529, Exact Match Ratio: 0.1654, Label-based Accuracy: 0.9840, F1 Score: 0.3447
Epoch 9, Loss: 0.0522, Exact Match Ratio: 0.1711, Label-based Accuracy: 0.9842, F1 Score: 0.3544
Epoch 10, Loss: 0.0516, Exact Match Ratio: 0.1745, Label-based Accuracy: 0.9843, F1 Score: 0.3609


In [18]:
save_directory = "saved_model"
os.makedirs(save_directory, exist_ok=True)
model_path = os.path.join(save_directory, "lora_model.pth")


def save_model(model, tokenizer, path=model_path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")
    tokenizer.save_pretrained(save_directory)  # Save the tokenizer to the same directory


# Save the model if training finished successfully
if early_stop_counter < patience:
    save_model(lora_model, tokenizer)
else:
    print("Early stopping occurred, model not saved.")


def load_model(path=model_path):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=number_of_tags)
    lora_model = get_peft_model(model, lora_config)

    # Load the saved state dictionary
    lora_model.load_state_dict(torch.load(path))
    print(f"Model loaded from {path}")

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(save_directory)

    return lora_model, tokenizer

Model saved to saved_model\lora_model.pth


In [19]:
def predict_tags(text):
    lora_model.eval()
    augmented_text = augment_with_context(text)  # Augment input text with context

    # Print the augmented text
    print(f"Augmented Text: {augmented_text}\n")

    inputs = tokenizer(augmented_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(
        device)

    with torch.no_grad():
        logits = lora_model(**inputs).logits
        probs = torch.sigmoid(logits).cpu().numpy()

    values_array = np.array(probs[0])
    top_3_indices = np.argsort(values_array)[-3:]
    result = np.zeros_like(values_array)
    result[top_3_indices] = values_array[top_3_indices]

    tags = (result > 0.5).astype(int)

    if not tags.any():
        tags[np.argmax(values_array)] = 1

    return tags


def binary_to_tags(binary_output, tag_list):
    predicted_tags = [tag_list[i] for i in range(len(binary_output)) if binary_output[i] == 1]
    return predicted_tags


# Load model and tokenizer for future predictions
loaded_model, loaded_tokenizer = load_model()
loaded_model.to(device)


def predict_tags_with_loaded_model(text):
    loaded_model.eval()
    augmented_text = augment_with_context(text)

    # Print augmented text
    print(f"Augmented Text: {augmented_text}\n")

    inputs = loaded_tokenizer(augmented_text, padding="max_length", truncation=True, max_length=128,
                              return_tensors="pt").to(device)

    with torch.no_grad():
        logits = loaded_model(**inputs).logits
        probs = torch.sigmoid(logits).cpu().numpy()

    values_array = np.array(probs[0])
    top_3_indices = np.argsort(values_array)[-3:]
    result = np.zeros_like(values_array)
    result[top_3_indices] = values_array[top_3_indices]

    tags = (result > 0.5).astype(int)

    if not tags.any():
        tags[np.argmax(values_array)] = 1

    return tags


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  lora_model.load_state_dict(torch.load(path))


Model loaded from saved_model\lora_model.pth


In [20]:
new_text = "<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests.  It seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the results.  Many computerized tools report test results in terms of p values or t values.</p>\n\n<p>How would you explain the following points to college students taking their first course in statistics:</p>\n\n<ul>\n<li><p>What does a p-value mean in relation to the hypothesis being tested?  Are there cases when one should be looking for a high p-value or a low p-value?</p></li>\n<li><p>What is the relationship between a p-value and a t-value?</p></li>\n</ul>\n"

predicted_tags = predict_tags_with_loaded_model(new_text)
print(binary_to_tags(predicted_tags, all_tags))

Augmented Text: <p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests.  It seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the results.  Many computerized tools report test results in terms of p values or t values.</p>

<p>How would you explain the following points to college students taking their first course in statistics:</p>

<ul>
<li><p>What does a p-value mean in relation to the hypothesis being tested?  Are there cases when one should be looking for a high p-value or a low p-value?</p></li>
<li><p>What is the relationship between a p-value and a t-value?</p></li>
</ul>
 <p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests.  It

In [21]:
new_text = "<p>I have been looking into theoretical frameworks for method selection (note: not model selection) and have found very little systematic, mathematically-motivated work. By 'method selection', I mean a framework for distinguishing the appropriate (or better, optimal) method with respect to a problem, or problem type.</p>\n\n<p>What I have found is substantial, if piecemeal, work on particular methods and their tuning (i.e. prior selection in Bayesian methods), and method selection via bias selection (e.g. <a href=http://portal.acm.org/citation.cfm?id=218546>Inductive Policy: The Pragmatics of Bias Selection</a>). I may be unrealistic at this early stage of machine learning's development, but I was hoping to find something like what <a href=ftp://ftp.sas.com/pub/neural/measurement.html>measurement theory</a> does in prescribing admissible transformations and tests by scale type, only writ large in the arena of learning problems.</p>\n\n<p>Any suggestions?</p>\n"
predicted_tags = predict_tags(new_text)

print(binary_to_tags(predicted_tags, all_tags))

Augmented Text: <p>I have been looking into theoretical frameworks for method selection (note: not model selection) and have found very little systematic, mathematically-motivated work. By 'method selection', I mean a framework for distinguishing the appropriate (or better, optimal) method with respect to a problem, or problem type.</p>

<p>What I have found is substantial, if piecemeal, work on particular methods and their tuning (i.e. prior selection in Bayesian methods), and method selection via bias selection (e.g. <a href=http://portal.acm.org/citation.cfm?id=218546>Inductive Policy: The Pragmatics of Bias Selection</a>). I may be unrealistic at this early stage of machine learning's development, but I was hoping to find something like what <a href=ftp://ftp.sas.com/pub/neural/measurement.html>measurement theory</a> does in prescribing admissible transformations and tests by scale type, only writ large in the arena of learning problems.</p>

<p>Any suggestions?</p>
 <p>I have been