In [1]:
# pip install transformers datasets
# pip install pandas
# pip install sentencepiece

In [2]:
import pandas as pd

df = pd.read_csv('/Users/stevensu/Desktop/SciSketch-Summer/Processed Data/training_data.csv')

In [3]:
import pandas as pd
import ast

# Load the dataset

# Function to extract phrases from the JSON object
def extract_phrases(json_str):
    # Parse the JSON string into a Python list of dictionaries
    json_obj = ast.literal_eval(json_str)
    # Extract the 'text' field from each dictionary
    phrases = [item['text'] for item in json_obj]
    return phrases

# Apply the function to the 'processed_image' column to create a new column 'phrases'
df['phrases'] = df['processed_image'].apply(extract_phrases)

# Now, the DataFrame `data` has a new column `phrases` containing lists of phrases
print(df[['Abstract', 'phrases']].head())


                                            Abstract  \
0  Gustatory receptors (GRs) are critical for ins...   
1  Thalamocortical loops have a central role in c...   
2  Nucleotide oligomerization domain (NOD)-like r...   
3  Nitric oxide (NO) is a gasotransmitter require...   
4  Embryogenesis requires substantial coordinatio...   

                                             phrases  
0  [Bombyx mori Gr9, Ligand-binding pocket, polar...  
1  [Stimulus, error, Behavior, pre  lpost|, Corte...  
2  [Priming, 126 KKKK PI4P 1, trans-Golgi network...  
3  [NITRATION, ONOO, NOz-Linolenic acid, PRX, PRX...  
4  [Molecular recording, Lineage Tracing in Trunk...  


In [4]:

# Function to remove text after and including "Keywords:"
def remove_keywords(text):
    return text.split('Keywords:')[0].strip()

# Apply the function to the 'Abstract' column
df['Abstract'] = df['Abstract'].apply(remove_keywords)

# Now, the DataFrame `data` has a cleaned `Abstract` column and a new `phrases` column
print(df[['Abstract', 'phrases']].head())

                                            Abstract  \
0  Gustatory receptors (GRs) are critical for ins...   
1  Thalamocortical loops have a central role in c...   
2  Nucleotide oligomerization domain (NOD)-like r...   
3  Nitric oxide (NO) is a gasotransmitter require...   
4  Embryogenesis requires substantial coordinatio...   

                                             phrases  
0  [Bombyx mori Gr9, Ligand-binding pocket, polar...  
1  [Stimulus, error, Behavior, pre  lpost|, Corte...  
2  [Priming, 126 KKKK PI4P 1, trans-Golgi network...  
3  [NITRATION, ONOO, NOz-Linolenic acid, PRX, PRX...  
4  [Molecular recording, Lineage Tracing in Trunk...  


In [5]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Combine 'Abstract' and 'phrases' into the format suitable for T5
def combine_for_t5(abstract, phrases):
    # Convert phrases array to a string
    phrases_str = '; '.join(phrases)
    return f"abstract: {abstract}", f"phrases: {phrases_str}"

# Apply the function to each row to create input-output pairs
df['input_text'], df['target_text'] = zip(*df.apply(lambda row: combine_for_t5(row['Abstract'], row['phrases']), axis=1))

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])

# Load the model and tokenizer
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(examples):
    model_inputs = tokenizer(examples['input_text'], max_length=512, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], max_length=512, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 31096/31096 [00:15<00:00, 2057.71 examples/s]


In [6]:
pip install accelerate -U

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./finetuned_t5_model')
tokenizer.save_pretrained('./finetuned_t5_model')

  0%|          | 10/23322 [02:01<87:30:55, 13.51s/it]

KeyboardInterrupt: 