# Data Handling

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('udc_dataset_no_duplicate_titles.csv')

In [3]:
df.columns

Index(['uid', 'text', ' udc_1', ' udc_2', ' udc_3', ' udc_4', ' udc_5', 'type',
       'generated_title', 'desc_custom_id', 'generated_description'],
      dtype='object')

In [4]:
df_label_list = df.copy()
df_label_list['labels'] = df_label_list[' udc_1']



In [5]:
df_text_desc_combined = df_label_list.copy()
df_text_desc_combined['text_desc'] = df_text_desc_combined['generated_title'].fillna('') + ' ' + df_text_desc_combined['generated_description'].fillna('')
df_text_desc_combined.head()

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5,type,generated_title,desc_custom_id,generated_description,labels,text_desc
0,0,A Beida,1¢(533.22),,,,,book,Exploring Contemporary Philosophies at Beida,0-request-book-0,"""Exploring Contemporary Philosophies at Beida""...",1¢(533.22),"Exploring Contemporary Philosophies at Beida ""..."
1,0,A Beida,1¢(533.22),,,,,book,Exploring Chinese Culture in Modern Times,0-request-book-1,"""Exploring Chinese Culture in Modern Times"" of...",1¢(533.22),"Exploring Chinese Culture in Modern Times ""Exp..."
2,0,A Beida,1¢(533.22),,,,,book,Whispers of the Old Library,0-request-book-2,"In ""Whispers of the Old Library,"" A Beida weav...",1¢(533.22),"Whispers of the Old Library In ""Whispers of th..."
3,0,A Beida,1¢(533.22),,,,,book,Whispers of the Eastern Lotus,0-request-book-3,"""Whispers of the Eastern Lotus"" is a captivati...",1¢(533.22),"Whispers of the Eastern Lotus ""Whispers of the..."
4,0,A Beida,1¢(533.22),,,,,article,Investigating the Differential Genetic Express...,0-request-sci-0,This article explores the variations in geneti...,1¢(533.22),Investigating the Differential Genetic Express...


In [6]:
df_pruned = df_text_desc_combined[['text_desc', 'labels']].copy()
df_pruned.head()

Unnamed: 0,text_desc,labels
0,"Exploring Contemporary Philosophies at Beida ""...",1¢(533.22)
1,"Exploring Chinese Culture in Modern Times ""Exp...",1¢(533.22)
2,"Whispers of the Old Library In ""Whispers of th...",1¢(533.22)
3,"Whispers of the Eastern Lotus ""Whispers of the...",1¢(533.22)
4,Investigating the Differential Genetic Express...,1¢(533.22)


## Preparing the model

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import torch

In [9]:
class GenDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, inputs, targets, max_length=512):
        self.encodings = tokenizer(
            inputs,
            truncation=True,
            padding=True,
            max_length=max_length)
        self.targets = tokenizer(
            targets,
            truncation=True,
            padding=True,
            max_length=32)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.targets['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

In [10]:
dataset = GenDataset(
    tokenizer=tokenizer,
    inputs=df_pruned['text_desc'].tolist(),
    targets=df_pruned['labels'].tolist(),
    max_length=512
)

## Train/test split

In [12]:
# For multi-label classification, we'll use a simpler approach
# Since StratifiedShuffleSplit doesn't work well with multi-label data
from sklearn.model_selection import train_test_split
import numpy as np

# Get the total number of samples
total_samples = len(dataset)
indices = np.arange(total_samples)

# First split: 80% train+val, 20% test
trainval_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Second split: from the 80%, take 80% for train and 20% for validation
# This gives us 64% train, 16% val, 20% test
train_idx, val_idx = train_test_split(trainval_idx, test_size=0.2, random_state=42)

print(f"Total samples: {total_samples}")
print(f"Train samples: {len(train_idx)}")
print(f"Validation samples: {len(val_idx)}")
print(f"Test samples: {len(test_idx)}")



Total samples: 37373
Train samples: 23918
Validation samples: 5980
Test samples: 7475


In [13]:
# Create subset datasets
from torch.utils.data import Subset

train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)
test_dataset = Subset(dataset, test_idx)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 23918
Validation dataset size: 5980
Test dataset size: 7475


## Training

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

In [21]:
# GPU memory monitoring and optimization functions
def print_gpu_memory():
    if torch.cuda.is_available():
        print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU Memory Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA not available")

def clear_gpu_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("GPU cache cleared")

# Print initial memory usage
print("Initial GPU memory status:")
print_gpu_memory()

Initial GPU memory status:
GPU Memory Allocated: 0.69 GB
GPU Memory Cached: 2.53 GB


In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    predict_with_generate=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir='./logs',
    logging_steps=100
)

In [30]:
# Create the trainer without early stopping first to avoid callback issues
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer
)

# A weird bug where the early stopping callback is not recognized
# #early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
#trainer.add_callback(early_stopping)

In [18]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

In [32]:
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss
6,0.7777,0.714141
7,0.739,0.686634
8,0.7234,0.663166
9,0.7008,0.643227
10,0.6766,0.625965
11,0.6633,0.611457
12,0.6353,0.597258
13,0.6092,0.586195
14,0.6087,0.575529
15,0.5992,0.568066


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=59800, training_loss=0.4869249179291486, metrics={'train_runtime': 13114.2668, 'train_samples_per_second': 36.476, 'train_steps_per_second': 4.56, 'total_flos': 4.767143215890432e+16, 'train_loss': 0.4869249179291486, 'epoch': 20.0})

In [20]:
model.save_pretrained('./seq2seq_unpruned/udc_model_small')
tokenizer.save_pretrained('./seq2seq_unpruned/udc_model_small')

('./seq2seq_unpruned/udc_model_small\\tokenizer_config.json',
 './seq2seq_unpruned/udc_model_small\\special_tokens_map.json',
 './seq2seq_unpruned/udc_model_small\\tokenizer.json')