In [1]:
from google.colab import drive
drive.mount('/content/drive')
!cd /content/drive/MyDrive/SOTA_Challenge
#Ensure that you downloaded the sota dataset from https://github.com/jd-coderepos/sota/
!cp -r "/content/drive/MyDrive/SOTA_Challenge/sota-master.zip" "/content/"
!unzip -q "/content/sota-master.zip" -d "/content/sota_data"

Mounted at /content/drive


# **1. Data Preprocessing**

In [4]:
from transformers import BertTokenizer
# Initialize tokenizer since we use it for the data preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
import re
# The following method extracts the most relevant sections and parts from the article
# For training
def extract_latex_content(content):
    # Extract title
    title = re.search(r'\\title\{([\s\S]*?)\}', content)
    title = title.group(1) if title else "Title not found"

    # Extract abstract
    abstract = re.search(r'\\begin\{abstract\}([\s\S]*?)\\end\{abstract\}', content)
    abstract = abstract.group(1).strip() if abstract else "Abstract not found"

    # Extract shortened experimental section
    experimental_short = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\n\n|\\section)', content)
    experimental_short = experimental_short.group(1).strip() if experimental_short else "Experimental Setup section not found"

    # Extract full experimental section
    experimental_long = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\\section)', content)
    experimental_long = experimental_long.group(1).strip() if experimental_long else "Experimental Setup section not found"

    # Extract experimental section without tables, we add these add the end
    no_table_experiment = re.sub(r'\\begin\{(table|table\*|wraptable)\}([\s\S]*?)\\end\{(table|table\*|wraptable)\}', '', experimental_long)

    # Extract results section
    results = re.search(r'\\section\{.*?[Rr]esult.*?\}([\s\S]*?)(\\section)', content)
    results = results.group(1).strip() if results else "No results section found"

    # Define regex for extracting tables
    table_patterns = [
        r"\\begin\{table\}([\s\S]*?)\\end\{table\}",
        r"\\begin\{table\*\}([\s\S]*?)\\end\{table\*\}",
        r"\\begin\{wraptable\}([\s\S]*?)\\end\{wraptable\}",
    ]
    # Extract tables
    tables = []
    for pattern in table_patterns:
        tables_initial = re.findall(pattern, content, re.DOTALL)
        tables.extend(tables_initial)

    table_str =""
    for table in tables:
        table_annotated = "\\begin{table}" + table + "\\end{table}"
        table_str = table_str + table_annotated + "\n"

    # Concatenate all parts
    full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {no_table_experiment}\nResults: {results}\nTables:\n{table_str}"
    clean_text = re.sub(r'\\cite{[^}]*}|\\citet{[^}]*}|\\citep{[^}]*}', '', full_text)

    return clean_text

In [6]:
import re
# The following method extracts the most relevant sections and parts from the article
# For inference
def extract_latex_content_validation(content):
    # Extract title
    title = re.search(r'\\title\{([\s\S]*?)\}', content)
    title = title.group(1) if title else "Title not found"

    # Extract abstract
    abstract = re.search(r'\\begin\{abstract\}([\s\S]*?)\\end\{abstract\}', content)
    abstract = abstract.group(1).strip() if abstract else "Abstract not found"

    # Extract shortened experimental section
    experimental_short = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\n\n|\\section)', content)
    experimental_short = experimental_short.group(1).strip() if experimental_short else "Experimental Setup section not found"

    # Extract full experimental section
    experimental_long = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\\section)', content)
    experimental_long = experimental_long.group(1).strip() if experimental_long else "Experimental Setup section not found"

    # Extract experimental section without tables, we add these add the end
    no_table_experiment = re.sub(r'\\begin\{(table|table\*|wraptable)\}([\s\S]*?)\\end\{(table|table\*|wraptable)\}', '', experimental_long)

    # Extract results section
    results = re.search(r'\\section\{.*?[Rr]esult.*?\}([\s\S]*?)(\\section)', content)
    results = results.group(1).strip() if results else "No results section found"

    # Define regex for extracting tables
    table_patterns = [
        r"\\begin\{table\}([\s\S]*?)\\end\{table\}",
        r"\\begin\{table\*\}([\s\S]*?)\\end\{table\*\}",
        r"\\begin\{wraptable\}([\s\S]*?)\\end\{wraptable\}",
    ]
    # Extract tables
    tables = []
    for pattern in table_patterns:
        tables_initial = re.findall(pattern, content, re.DOTALL)
        tables.extend(tables_initial)

    table_str =""
    for table in tables:
        table_annotated = "\\begin{table}" + table + "\\end{table}"
        table_str = table_str + table_annotated + "\n"

    # Concatenate all parts
    full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {no_table_experiment}\nResults: {results}\nTables:\n{table_str}"

    tokenized_full = tokenizer(full_text)

    # We only keep the first 5'000 tokens of the articles to ensure enough space in the context length
    # For this, we reduce the articles gradually by omitting sections, and lastly crop the article
    if len(tokenized_fulltext['input_ids']) >= 5000:
      full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {experimental_short}\nResults: {results}\nTables:\n{table_str}"
      tokenized_fulltext = tokenizer(full_text)
      print("shortened experiments section")
      print(len(tokenized_fulltext['input_ids']))
    if len(tokenized_fulltext['input_ids']) >= 5000:
      full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: Omitted\nResults: {results}\nTables:\n{table_str}"
      tokenized_fulltext = tokenizer(full_text)
      print("omitted experiments section")
      print(len(tokenized_fulltext['input_ids']))
    if len(tokenized_fulltext['input_ids']) >= 5000:
      full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: Omitted\nResults: Omitted\nTables:\n{table_str}"
      tokenized_fulltext = tokenizer(full_text)
      print("omitted results section")
      print(len(tokenized_fulltext['input_ids']))
    if len(tokenized_fulltext['input_ids']) >= 5000:
      print("limited to tokens[0:5000]")
      print(len(tokenized_fulltext['input_ids']))
      tokenized = tokenized_fulltext['input_ids'][1:5000]
      full_text = tokenizer.decode(tokenized)

    # Remove citations
    clean_text = re.sub(r'\\cite{[^}]*}|\\citet{[^}]*}|\\citep{[^}]*|\\url{[^}]*}', '', full_text)
    return clean_text

In [7]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(dataset_path):
    articles = []
    labels = []

    for article_id_folder in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, article_id_folder)
        tex_file = os.path.join(folder_path, f"{article_id_folder}.tex")
        annotation_file = os.path.join(folder_path, "annotations.json")

        with open(tex_file, 'r', encoding='utf-8',  errors='ignore') as file:
            article_text = file.read()

        text = extract_latex_content(article_text)

        with open(annotation_file, 'r') as file:
            annotations = file.read()
            label = 1 if "unanswerable" not in annotations else 0

        articles.append(text)
        labels.append(label)

    return pd.DataFrame({'text': articles, 'label': labels})

dataset_path = "/content/sota_data/sota-master/dataset/train"
dataset = load_data(dataset_path)


# **2. Prepare and train BERT model for classification**

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer and model initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prepare dataset and dataloader
train_texts = dataset['text'].tolist()
train_labels = dataset['label'].tolist()
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
pip install torch transformers

In [32]:
import numpy as np  # Add this line at the beginning of your script

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, random_split
from torch.optim import Adam
import torch
import torch.nn as nn

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

texts = dataset['text'].tolist()
labels = dataset['label'].tolist()

full_dataset = TextDataset(texts, labels, tokenizer)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training function
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        _, preds = torch.max(outputs.logits, dim=1)
        loss = outputs.loss

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions.double() / n_examples, np.mean(losses)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        len(train_dataset)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Save the model after training
    torch.save(model.state_dict(), 'bert_classification_model.pth')

print("Training completed.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------
Train loss 0.19908801547764665 accuracy 0.9333672431332655
Epoch 2/3
----------
Train loss 0.14474737638171134 accuracy 0.956968463886063
Epoch 3/3
----------
Train loss 0.13323986055473422 accuracy 0.9612410986775177
Training completed.


In [33]:
import torch

# Predict for the own validation set
def predict(model, data_loader, device, return_labels=False):
    model = model.eval()
    predictions = []
    labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.to('cpu').numpy())
            if return_labels:
                labels.extend(d['labels'].to('cpu').numpy())

    return (predictions, labels) if return_labels else predictions

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictions, true_labels = predict(model, val_loader, device, return_labels=True)

In [34]:
from sklearn.metrics import classification_report

print(classification_report(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       884
           1       0.96      0.97      0.96      1574

    accuracy                           0.95      2458
   macro avg       0.95      0.94      0.95      2458
weighted avg       0.95      0.95      0.95      2458



In [None]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(dataset_path):
    test_filenames = []
    test_articles = []

    for article_id_folder in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, article_id_folder)
        tex_file = os.path.join(folder_path, f"{article_id_folder}.tex")
        annotation_file = os.path.join(folder_path, "annotations.json")

        with open(tex_file, 'r', encoding='utf-8',  errors='ignore') as file:
            article_text = file.read()
        text = extract_latex_content_validation(article_text)

        test_filenames.append(tex_file)
        test_articles.append(text)

    return test_filenames, test_articles

testset_path = "/content/sota_data/sota-master/codalab/blind-validation-dataset"
filenames, test_texts, = load_data(testset_path)
test_dataset = pd.DataFrame({'text': test_texts})

Note: The following part writes the labeles {0,1} directly to a CSV file. To calculate the score on classification accuracy, we added the models to the accompanied notebook "Project_SOTA_Task4_Extraction_Notebook.ipynb".

In [38]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
import torch
import pandas as pd

# Load the model and tokenizer
model_path = 'bert_classification_model.pth'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load(model_path))
model.eval()  # Set the model to evaluation mode

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Load test data
test_texts = test_dataset['text'].tolist()
test_labels = [0] * len(test_texts)

# Create DataLoader for test data
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Prediction function
def predict(model, data_loader, device):
    model = model.eval()
    predictions = []
    filename_list = []
    i = 0

    with torch.no_grad():
        for d in data_loader:
            i += 1
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.to('cpu').numpy())
    return predictions

# Run predictions for the provided validation set
test_predictions = predict(model, test_loader, device)

test_df = pd.DataFrame({
    'articles': filenames,
    'text': test_texts
})

# Add predictions to the DataFrame
test_df['predicted_label'] = test_predictions

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
test_df.to_csv('/content/results_bert_model_classification.csv', index=False)

# **2. Logistic Regression**

In [41]:
articles_train, articles_test, labels_train, labels_test = train_test_split(dataset['text'], dataset['label'], test_size=0.2, random_state=42)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=13000, stop_words='english')
X_train = vectorizer.fit_transform(articles_train)
X_test = vectorizer.transform(articles_test)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train, labels_train)

# Predict on the test set
predictions = model.predict(X_test)

# Evaluate the model
print(classification_report(labels_test, predictions))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       908
           1       0.95      0.97      0.96      1550

    accuracy                           0.95      2458
   macro avg       0.95      0.94      0.95      2458
weighted avg       0.95      0.95      0.95      2458



In [None]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(dataset_path):
    test_filenames = []
    test_articles = []

    for article_id_folder in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, article_id_folder)
        tex_file = os.path.join(folder_path, f"{article_id_folder}.tex")
        annotation_file = os.path.join(folder_path, "annotations.json")

        with open(tex_file, 'r', encoding='utf-8',  errors='ignore') as file:
            article_text = file.read()
        text = extract_latex_content_validation(article_text)

        test_filenames.append(tex_file)
        test_articles.append(text)

    return test_filenames, test_articles

testset_path = "/content/sota_data/sota-master/codalab/blind-validation-dataset"
filenames, test_texts, = load_data(testset_path)
test_dataset = pd.DataFrame({'text': test_texts})

Note: The following part writes the labeles {0,1} directly to a CSV file. To calculate the score on classification accuracy, we added the models to the accompanied notebook "Project_SOTA_Task4_Extraction_Notebook.ipynb".

In [66]:
import numpy as np
import csv
import os
import numpy as np
from skimage.transform import resize
from tensorflow import keras

# Define the output CSV file path
output_csv_file = "/content/results_logistic_regression_classification.csv"

X_testset = vectorizer.transform(test_dataset['text'])

def predict_labels(model, x_test):
    predictions = model.predict(x_test)
    return predictions

# Predict labels
predicted_labels = predict_labels(model, X_testset)

# Write predictions to CSV
with open(output_csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['filename', 'label'])
    for filename, label in zip(test_filenames, predicted_labels):
        writer.writerow([filename, label])

print(f"CSV file with test predictions saved at: {output_csv_file}")

CSV file with test predictions saved at: /content/results_logistic_regression_classification.csv
