In [2]:
from google.colab import drive
drive.mount('/content/drive')
!cd /content/drive/MyDrive/SOTA_Challenge
#Ensure that you downloaded the sota dataset from https://github.com/jd-coderepos/sota/
!cp -r "/content/drive/MyDrive/SOTA_Challenge/sota-master.zip" "/content/"
!unzip -q "/content/sota-master.zip" -d "/content/sota_data"

Mounted at /content/drive


# **0. Load model and tokenizer**

In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 8192
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    # Change this to unsloth/llama-3-8b-bnb-4bit for the llama3 8b model
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use token if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# **1. Data Preprocessing**

In [5]:
import json

def safe_json_loads(s):
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        try:
            # Attempt to fix single quote issues and re-parse
            safe_s = s.replace("'", '"')
            json_safe = json.loads(safe_s)
            return json_safe
        except json.JSONDecodeError:
            # If JSON parsing fails again, return 'unanswerable'
            return "unanswerable"

In [6]:
import re
# The following method extracts the most relevant sections and parts from the article
# For training
def extract_and_concatenate_latex_content(content):
    # Extract title
    title = re.search(r'\\title\{([\s\S]*?)\}', content)
    title = title.group(1) if title else "Title not found"

    # Extract abstract
    abstract = re.search(r'\\begin\{abstract\}([\s\S]*?)\\end\{abstract\}', content)
    abstract = abstract.group(1).strip() if abstract else "Abstract not found"

    # Extract shortened experimental section
    experimental_short = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\n\n|\\section)', content)
    experimental_short = experimental_short.group(1).strip() if experimental_short else "Experimental Setup section not found"

    # Extract full experimental section
    experimental_long = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\\section)', content)
    experimental_long = experimental_long.group(1).strip() if experimental_long else "Experimental Setup section not found"

    # Extract experimental section without tables, we add these add the end
    no_table_experiment = re.sub(r'\\begin\{(table|table\*|wraptable)\}([\s\S]*?)\\end\{(table|table\*|wraptable)\}', '', experimental_long)

    # Extract results section
    results = re.search(r'\\section\{.*?[Rr]esult.*?\}([\s\S]*?)(\\section)', content)
    results = results.group(1).strip() if results else "No results section found"

    # Define regex for extracting tables
    table_patterns = [
        r"\\begin\{table\}([\s\S]*?)\\end\{table\}",
        r"\\begin\{table\*\}([\s\S]*?)\\end\{table\*\}",
        r"\\begin\{wraptable\}([\s\S]*?)\\end\{wraptable\}",
    ]
    # Extract tables
    tables = []
    for pattern in table_patterns:
        tables_initial = re.findall(pattern, content, re.DOTALL)
        tables.extend(tables_initial)

    table_str =""
    for table in tables:
        table_annotated = "\\begin{table}" + table + "\\end{table}"
        table_str = table_str + table_annotated + "\n"

    # Concatenate all parts
    full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {no_table_experiment}\nTables:\n{table_str}\nResults: {results}"
    full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {no_table_experiment}\nResults: {results}\nTables:\n{table_str}"

    # Remove citations
    clean_text = re.sub(r'\\cite{[^}]*}|\\citet{[^}]*}|\\citep{[^}]*}', '', full_text)

    return clean_text

In [7]:
import os
import json
import re
from datasets import Dataset
import pandas as pd

def preprocess_data(dataset_path):
    # Lists to store processed data
    data = []
    filenames = []
    articles = []
    labels = []

    instruction = "Your task is to extract all pertinent (Task, Dataset, Metric, Score) tuples from a given AI paper text. Your response is a list of JSONs of the tuples in the format [{'LEADERBOARD': {'Task': 'task', 'Dataset': 'dataset', 'Metric': 'metric', 'Score': 'score'}}]. Make sure all of them are real compounds of pertinent (Task, Dataset, Metric, Score) tuples and do not include duplicates. Be very strict, only report the most pertinent tuples that were explicitly highlighted in the text. If you don't find any tuples in the text or you are not sure, respond with 'unanswerable'."

    num_unanswerble = 0
    # Iterate over LaTeX files and their corresponding JSON annotations
    for article_id_folder in os.listdir(dataset_path):

        folder_path = os.path.join(dataset_path, article_id_folder)
        tex_file = os.path.join(folder_path, f"{article_id_folder}.tex")
        annotation_file = os.path.join(folder_path, "annotations.json")

        # Read latex file
        with open(tex_file, 'r', encoding='utf-8',  errors='ignore') as file:
            latex_content = file.read()

        # Extract text from LaTeX content
        text = extract_and_concatenate_latex_content(latex_content)
        if text == None:
          continue

        # Read JSON annotations
        with open(annotation_file, 'r', encoding='utf-8') as ann_file:
            raw_annotations = ann_file.read()
            # Set label for the classification model
            label = 1 if "unanswerable" not in raw_annotations else 0

        annotations = safe_json_loads(raw_annotations)

        use_text = False
        if annotations != "unanswerable":
            for annotation in annotations:
                # Loop annotations and check if they are found in the accompanied article
                tdms = annotation['LEADERBOARD']
                num_fields = 0
                for key, value in tdms.items():
                    pattern = re.escape(value.lower())
                    match = re.search(pattern, text.lower(), re.S)
                    if match:
                        num_fields += 1
                if num_fields == 4: # If Task, Dataset, Metric and Score are found at least once, then use text for training
                    use_text = True
        if use_text:
            data_entry = {"instruction": instruction, "input": text, "output": str(annotations)}
            # Check token length of the prompt to not exceed context length
            tokenized_entry = tokenizer(f"### Instruction: {data_entry['instruction']}\n### Text: {data_entry['input']}\n### Response: {data_entry['output']}")
            if len(tokenized_entry['input_ids']) <= 7000:
              data.append(data_entry)

        filenames.append(tex_file)
        articles.append(text)
        labels.append(label)

    data_labelled = pd.DataFrame({'text': articles, 'label': labels})

    return data, data_labelled

dataset_path = '/content/sota_data/sota-master/dataset/train'

data, data_labelled = preprocess_data(dataset_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (26341 > 8192). Running this sequence through the model will result in indexing errors


In [8]:
print("Number of training data entries: " + str(len(data)))

Number of training data entries: 875


# **2. Prepare and train logistic regression classification model**

First, we train a Logistic Regression model to classify whether the AI paper reports tuples or not. To use the pretrained BERT model, replace this code with the BERT finetuning code from the accompanied file "Project_SOTA_Task4_Classification_Notebook.ipynb"

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

articles_train, articles_test, labels_train, labels_test = train_test_split(data_labelled['text'], data_labelled['label'], test_size=0.2, random_state=42)
# Create vectors
vectorizer = TfidfVectorizer(max_features=13000, stop_words='english', ngram_range=(1, 2), max_df=0.7, min_df=10)
X_train = vectorizer.fit_transform(articles_train)
X_test = vectorizer.transform(articles_test)

# Build logistic regression model
model_labeling = LogisticRegression(max_iter=1000, solver='saga')
model_labeling.fit(X_train, labels_train)

# Predict on the test set
predictions = model_labeling.predict(X_test)

# Evaluate
print(classification_report(labels_test, predictions))

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       860
           1       0.95      0.96      0.95      1598

    accuracy                           0.94      2458
   macro avg       0.94      0.93      0.93      2458
weighted avg       0.94      0.94      0.94      2458



# **3. Prepare and finetune LLM extraction model**

We now add LoRA adapters

In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    max_seq_length = max_seq_length,
)

Unsloth 2024.6 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


<a name="Data"></a>
### Prompt Preparation


In [10]:
alpaca_prompt = """### Instruction:
{}

### Text:
{}

### Response:
{}"""

In [11]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
BOS_TOKEN = tokenizer.bos_token # Must add EOS_TOKEN

print(EOS_TOKEN)
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add BOS_ and EOS_TOKEN
        text = BOS_TOKEN + alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

# Convert list of dictionaries into a Hugging Face Dataset
dataset = Dataset.from_list(data)
dataset = dataset.map(formatting_prompts_func, batched = True,)

<|end_of_text|>


Map:   0%|          | 0/875 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model


In [24]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        num_train_epochs = 1, # Change this value for longer training
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

In [25]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 899 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 108,920,832


Step,Training Loss
1,1.0103
2,1.1789
3,1.1241
4,1.3138
5,1.3603
6,1.2271
7,1.2086
8,1.3198
9,1.1644
10,1.1456


# **4. Inference**


In [12]:
import os

test_dataset_path = '/content/sota_data/sota-master/codalab/blind-validation-dataset'
output_path = '/content/drive/MyDrive/SOTA_Challenge/RESULTS_codellama-34b_ft'
os.makedirs(output_path, exist_ok=True)

In [13]:
import re
# The following method extracts the most relevant sections and parts from the article
# For inference
def extract_latex_content_validation(content):
    # Extract title
    title = re.search(r'\\title\{([\s\S]*?)\}', content)
    title = title.group(1) if title else "Title not found"

    # Extract abstract
    abstract = re.search(r'\\begin\{abstract\}([\s\S]*?)\\end\{abstract\}', content)
    abstract = abstract.group(1).strip() if abstract else "Abstract not found"

    # Extract shortened experimental section
    experimental_short = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\n\n|\\section)', content)
    experimental_short = experimental_short.group(1).strip() if experimental_short else "Experimental Setup section not found"

    # Extract full experimental section
    experimental_long = re.search(r'\\section\{.*?[Ee]xperiment.*?\}([\s\S]*?)(\\section)', content)
    experimental_long = experimental_long.group(1).strip() if experimental_long else "Experimental Setup section not found"

    # Extract experimental section without tables, we add these add the end
    no_table_experiment = re.sub(r'\\begin\{(table|table\*|wraptable)\}([\s\S]*?)\\end\{(table|table\*|wraptable)\}', '', experimental_long)

    # Extract results section
    results = re.search(r'\\section\{.*?[Rr]esult.*?\}([\s\S]*?)(\\section)', content)
    results = results.group(1).strip() if results else "No results section found"

    # Define regex for extracting tables
    table_patterns = [
        r"\\begin\{table\}([\s\S]*?)\\end\{table\}",
        r"\\begin\{table\*\}([\s\S]*?)\\end\{table\*\}",
        r"\\begin\{wraptable\}([\s\S]*?)\\end\{wraptable\}",
    ]
    # Extract tables
    tables = []
    for pattern in table_patterns:
        tables_initial = re.findall(pattern, content, re.DOTALL)
        tables.extend(tables_initial)

    table_str =""
    for table in tables:
        table_annotated = "\\begin{table}" + table + "\\end{table}"
        table_str = table_str + table_annotated + "\n"

    # Concatenate all parts
    full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {no_table_experiment}\nResults: {results}\nTables:\n{table_str}"
    tokenized_fulltext = tokenizer(full_text)

    # We only keep the first 5'000 tokens of the articles to ensure enough space in the context length
    # For this, we reduce the articles gradually by omitting sections, and lastly crop the article
    if len(tokenized_fulltext['input_ids']) >= 5000:
      full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: {experimental_short}\nResults: {results}\nTables:\n{table_str}"
      tokenized_fulltext = tokenizer(full_text)
      print("shortened experiments section")
      print(len(tokenized_fulltext['input_ids']))
    if len(tokenized_fulltext['input_ids']) >= 5000:
      full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: Omitted\nResults: {results}\nTables:\n{table_str}"
      tokenized_fulltext = tokenizer(full_text)
      print("omitted experiments section")
      print(len(tokenized_fulltext['input_ids']))
    if len(tokenized_fulltext['input_ids']) >= 5000:
      full_text = f"Title: {title}\nAbstract: {abstract}\nExperiments: Omitted\nResults: Omitted\nTables:\n{table_str}"
      tokenized_fulltext = tokenizer(full_text)
      print("omitted results section")
      print(len(tokenized_fulltext['input_ids']))
    if len(tokenized_fulltext['input_ids']) >= 5000:
      print("limited to tokens[0:5000]")
      print(len(tokenized_fulltext['input_ids']))
      tokenized = tokenized_fulltext['input_ids'][1:5000]
      full_text = tokenizer.decode(tokenized)

    # Remove citations
    clean_text = re.sub(r'\\cite{[^}]*}|\\citet{[^}]*}|\\citep{[^}]*}', '', full_text)
    return clean_text

In [15]:
def prompt_two_models(article_id_folder, instruction, text):
  test_articles = []
  test_filenames = []

  test_filenames.append(article_id_folder)
  test_articles.append(text)
  xtest = vectorizer.transform([text])
  prediction = model_labeling.predict(xtest)

  if prediction[0] == 1:
      inputs = tokenizer(
      [
          alpaca_prompt.format(
              instruction, # instruction
              f"{text}", # input
              "", # output - empty for generation
          )
      ], return_tensors = "pt").to("cuda")

      input_ids = inputs["input_ids"]

      print("Article ID: " + article_id_folder)
      print("prompt length: "+str(input_ids.shape[1]))
      # Calculate amount of free tokens
      max_new_tokens = max_seq_length-input_ids.shape[1]

      outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)
      response_token = tokenizer.batch_decode(outputs)
      print(response_token)
      pattern = r"Response:\n(.+?)<"
      match = re.search(pattern, response_token[0])
      if match:
        response = match.group(1)
        print(response)
      else:
        response = "no answer"
        print(response)
  else:
    response = "unanswerable"
    print(folder_path)
    print(response)
  return response


def prompt_llm_only(instruction, text):
  inputs = tokenizer(
  [
      alpaca_prompt.format(
          instruction, # instruction
          f"{text}", # input
          "", # output - leave this blank for generation!
      )
  ], return_tensors = "pt").to("cuda")

  input_ids = inputs["input_ids"]

  print("Article ID: " + article_id_folder)
  print("prompt length: "+str(input_ids.shape[1]))
  # Calculate amount of free tokens
  max_new_tokens = max_seq_length-input_ids.shape[1]

  outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)
  response_token = tokenizer.batch_decode(outputs)
  print(response_token)
  pattern = r"Response:\n(.+?)<"
  match = re.search(pattern, response_token[0])
  if match:
    response = match.group(1)
    print(response)
  else:
    response = "no answer"
    print(response)

  return response

In [None]:
instruction = "Your task is to extract all pertinent (Task, Dataset, Metric, Score) tuples from a given AI paper text. Your response is a list of JSONs of the tuples in the format [{'LEADERBOARD': {'Task': 'task', 'Dataset': 'dataset', 'Metric': 'metric', 'Score': 'score'}}]. Make sure all of them are real compounds of pertinent (Task, Dataset, Metric, Score) tuples and do not include duplicates. Be very strict, only report the most pertinent tuples that were explicitly highlighted in the text. If you don't find any tuples in the text or you are not sure, respond with 'unanswerable'."

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

for article_id_folder in os.listdir(test_dataset_path):
    folder_path = os.path.join(test_dataset_path, article_id_folder)
    tex_file = os.path.join(folder_path, f"{article_id_folder}.tex")

    with open(tex_file, 'r', encoding='utf-8',  errors='ignore') as file:
        latex_content = file.read()
    # Extract text from LaTeX content
    text = extract_latex_content_validation(latex_content)

    response = prompt_llm_only(instruction, text)
    #response = prompt_two_models(article_id_folder, instruction, text)

    output_path_folder = os.path.join(output_path, article_id_folder)
    os.makedirs(output_path_folder, exist_ok=True)

    output_path_anno = os.path.join(output_path_folder, "annotations.json")

    with open(output_path_anno, 'w') as file:
        file.write(response)
