In [9]:
pip install datasets transformers



In [10]:
from datasets import load_dataset

# Load arxiv subset
dataset = load_dataset("scientific_papers", "arxiv")
print(dataset)

README.md:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

scientific_papers.py:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

The repository for scientific_papers contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/scientific_papers.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})


In [11]:
def extract_problem_statement(article_text):
    # Use the first 1-2 sentences as proxy for the research problem
    sentences = article_text.split('. ')
    return '. '.join(sentences[:2]) + '.'

dataset_pairs = []

for sample in dataset["train"].select(range(200)):  # Limit to 200 for now
    abstract = sample['abstract'].strip().replace('\n', ' ')
    problem_statement = extract_problem_statement(sample['article'])
    dataset_pairs.append({
        'input': f"Abstract: {abstract}",
        'output': f"Research Problem: {problem_statement}"
    })

print("Example Pair:\n")
print("Input:\n", dataset_pairs[0]['input'])
print("\nOutput:\n", dataset_pairs[0]['output'])

Example Pair:

Input:
 Abstract: additive models play an important role in semiparametric statistics .   this paper gives learning rates for regularized kernel based methods for additive models .   these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is valid .   additionally , a concrete example is presented to show that a gaussian function depending only on one variable lies in a reproducing kernel hilbert space generated by an additive gaussian kernel , but does not belong to the reproducing kernel hilbert space generated by the multivariate gaussian kernel of the same variance .    *   key words and phrases . * additive model , kernel , quantile regression , semiparametric , rate of convergence , support vector machine .

Output:
 Research Problem: additive

In [12]:
import json

with open("dataset_pairs.json", "w") as f:
    json.dump(dataset_pairs, f)

print("Saved 200 pairs to dataset_pairs.json")

Saved 200 pairs to dataset_pairs.json


In [13]:
!pip install accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [14]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load tokenizer (we'll use a small pretrained GPT2 tokenizer)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have pad token

# Load your dataset
import json

with open("dataset_pairs.json", "r") as f:
    pairs = json.load(f)

# Turn into HF Dataset
hf_dataset = Dataset.from_list(pairs)

# Preprocess
def tokenize(example):
    text = example['input'] + "\n" + example['output']
    return tokenizer(text, truncation=True, padding='max_length', max_length=256)

tokenized_dataset = hf_dataset.map(tokenize)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [23]:
!pip install --upgrade transformers



In [24]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

# Load tiny GPT2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Add pad token

# Training config
training_args = TrainingArguments(
    output_dir="./results",
    # Replace 'evaluation_strategy' with 'do_eval' and set it to True if you want to evaluate
    do_eval=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="no",  # No checkpointing for speed
    logging_steps=10,
    report_to="none"
)

In [28]:
def add_labels(example):
    example["labels"] = example["input_ids"].copy()
    return example

tokenized_dataset = tokenized_dataset.map(add_labels)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [29]:
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.1103
20,3.8116
30,3.7683
40,3.5964
50,3.5966
60,3.4525
70,3.2852
80,3.3067
90,3.2498
100,3.2163


TrainOutput(global_step=225, training_loss=3.2771630265977647, metrics={'train_runtime': 3812.8314, 'train_samples_per_second': 0.236, 'train_steps_per_second': 0.059, 'total_flos': 117581414400000.0, 'train_loss': 3.2771630265977647, 'epoch': 5.0})

In [32]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)

prompt = "Abstract: In this paper, we propose a novel method for improving neural machine translation. \nResearch Problem:"
output = pipe(prompt)[0]['generated_text']

print("Generated:\n", output)


Device set to use cpu


Generated:
 Abstract: In this paper, we propose a novel method for improving neural machine translation. 
Research Problem: we investigate the potential of neural machine translation in the language language in the near future.
the goal of this research is to improve the accuracy of translation algorithms by changing the form factor and to the mechanism of translation to improve the precision.
this study


In [36]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)

prompt = "Abstract: In this paper, we propose a novel method for improving neural machine translation. \nResearch Problem:"
output = pipe(prompt)[0]['generated_text']

print("Generated:\n", output)

Device set to use cpu


Generated:
 Abstract: In this paper, we propose a novel method for improving neural machine translation. 
Research Problem: classification and localization of singleton stoichiometry values through hierarchical - class action detection.
experiments in stoichiometry estimation have often induced large numbers of identical values and can sometimes lead to erroneous classification of certain values at a given pixel level.



In [37]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "Abstract: In this paper, we propose a novel method for improving neural machine translation. \nResearch Problem:"
output = pipe(prompt)[0]['generated_text']
print(output)

Device set to use cpu


Abstract: In this paper, we propose a novel method for improving neural machine translation. 
Research Problem: many techniques have been proposed and presented to improve machine translation in recently launched applications.
we argue that improved machine translation techniques should improve the accuracy


In [39]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "Abstract: In this paper, we explore a new approach for Natural Language Processing \nResearch Problem:"
output = pipe(prompt)[0]['generated_text']
print(output)

Device set to use cpu


Abstract: In this paper, we explore a new approach for Natural Language Processing 
Research Problem: with a real language processing framework to improve quality of existing knowledge .    we describe techniques based on finite - and in this paper, finite


In [34]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 3.4094104766845703, 'eval_runtime': 28.3593, 'eval_samples_per_second': 0.705, 'eval_steps_per_second': 0.176, 'epoch': 5.0}


In [35]:
import math
print("Perplexity:", math.exp(metrics["eval_loss"]))

Perplexity: 30.247407450385964
