# BERT Multi-Label Classification - Standard Model

In [2]:
import pandas as pd

In [None]:
# Load the UDC dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('udc_dataset_no_duplicate_titles.csv')

In [None]:
# Gather the five UDC code columns of each row into a single list-valued 'labels' column.
# The column names are spelled with a leading space, exactly as they are looked up here.
udc_columns = [' udc_1', ' udc_2', ' udc_3', ' udc_4', ' udc_5']


def _clean_codes(raw_codes):
    """Drop missing codes, then strip whitespace and remove '.' separators from the rest."""
    # NOTE(review): values go through str(); if pandas inferred a numeric dtype for a
    # column, the textual form of a code may differ from the CSV text -- confirm.
    return [str(code).strip().replace('.', '') for code in raw_codes if pd.notnull(code)]


df_label_list = df.copy()
df_label_list['labels'] = df_label_list[udc_columns].values.tolist()
df_label_list['labels'] = df_label_list['labels'].apply(_clean_codes)
df_label_list.head()

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5,type,generated_title,desc_custom_id,generated_description,labels
0,0,A Beida,1¢(533.22),,,,,book,Exploring Contemporary Philosophies at Beida,0-request-book-0,"""Exploring Contemporary Philosophies at Beida""...",[1¢(53322)]
1,0,A Beida,1¢(533.22),,,,,book,Exploring Chinese Culture in Modern Times,0-request-book-1,"""Exploring Chinese Culture in Modern Times"" of...",[1¢(53322)]
2,0,A Beida,1¢(533.22),,,,,book,Whispers of the Old Library,0-request-book-2,"In ""Whispers of the Old Library,"" A Beida weav...",[1¢(53322)]
3,0,A Beida,1¢(533.22),,,,,book,Whispers of the Eastern Lotus,0-request-book-3,"""Whispers of the Eastern Lotus"" is a captivati...",[1¢(53322)]
4,0,A Beida,1¢(533.22),,,,,article,Investigating the Differential Genetic Express...,0-request-sci-0,This article explores the variations in geneti...,[1¢(53322)]


## Preprocessing

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

In [7]:
# Multi-hot encode the label lists: one row per sample, one column per distinct
# cleaned code (the column order is kept in mlb.classes_).
mlb = MultiLabelBinarizer().fit(df_label_list['labels'])
label_matrix = mlb.transform(df_label_list['labels'])
label_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Build the text fed to the model: "<title> <description>", where a missing
# title or description is treated as an empty string.
title_text = df_label_list['generated_title'].fillna('')
description_text = df_label_list['generated_description'].fillna('')
df_text_desc_combined = df_label_list.assign(text_desc=title_text + ' ' + description_text)
df_text_desc_combined.head()

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5,type,generated_title,desc_custom_id,generated_description,labels,text_desc
0,0,A Beida,1¢(533.22),,,,,book,Exploring Contemporary Philosophies at Beida,0-request-book-0,"""Exploring Contemporary Philosophies at Beida""...",[1¢(53322)],"Exploring Contemporary Philosophies at Beida ""..."
1,0,A Beida,1¢(533.22),,,,,book,Exploring Chinese Culture in Modern Times,0-request-book-1,"""Exploring Chinese Culture in Modern Times"" of...",[1¢(53322)],"Exploring Chinese Culture in Modern Times ""Exp..."
2,0,A Beida,1¢(533.22),,,,,book,Whispers of the Old Library,0-request-book-2,"In ""Whispers of the Old Library,"" A Beida weav...",[1¢(53322)],"Whispers of the Old Library In ""Whispers of th..."
3,0,A Beida,1¢(533.22),,,,,book,Whispers of the Eastern Lotus,0-request-book-3,"""Whispers of the Eastern Lotus"" is a captivati...",[1¢(53322)],"Whispers of the Eastern Lotus ""Whispers of the..."
4,0,A Beida,1¢(533.22),,,,,article,Investigating the Differential Genetic Express...,0-request-sci-0,This article explores the variations in geneti...,[1¢(53322)],Investigating the Differential Genetic Express...


In [None]:
# Keep only what training needs: the combined input text and its label list.
training_columns = ['text_desc', 'labels']
df_pruned = df_text_desc_combined.loc[:, training_columns].copy()
df_pruned.head()

Unnamed: 0,text_desc,labels
0,"Exploring Contemporary Philosophies at Beida ""...",[1¢(53322)]
1,"Exploring Chinese Culture in Modern Times ""Exp...",[1¢(53322)]
2,"Whispers of the Old Library In ""Whispers of th...",[1¢(53322)]
3,"Whispers of the Eastern Lotus ""Whispers of the...",[1¢(53322)]
4,Investigating the Differential Genetic Express...,[1¢(53322)]


## Training

In [10]:
from transformers import AutoTokenizer

# Tokenize the whole corpus once, up front. With truncation=True and padding=True
# the tokenizer truncates over-long inputs and pads every sequence in this single
# call to a common length.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
corpus_texts = df_pruned['text_desc'].tolist()
encodings = tokenizer(corpus_texts, truncation=True, padding=True)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import torch

In [12]:
# Pick the compute device once (CUDA when present, otherwise CPU) and report it.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")

if not has_cuda:
    print("CUDA is not available. Training will use CPU.")
else:
    # Describe GPU 0: name, total memory in GB, and the CUDA version torch reports.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 6GB Laptop GPU
GPU Memory: 6.0 GB
CUDA Version: 12.8


In [None]:
class BookDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized encodings with per-sample label vectors.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name to a
            per-sample indexable sequence of values.
        labels: Sized, indexable collection holding one label vector per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the feature tensors for sample ``idx`` plus a float32 ``labels`` tensor."""
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        # Labels are cast to float32 because BCEWithLogitsLoss needs floating-point targets.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Wrap the tokenized corpus and its multi-hot label matrix in a single dataset.
dataset = BookDataset(encodings=encodings, labels=label_matrix)
sample_count = len(dataset)
print(f"Dataset created with {sample_count} samples")

Dataset created with 37373 samples


In [14]:
from transformers import AutoModelForSequenceClassification
# One output logit per column of the multi-hot label matrix.
num_labels = label_matrix.shape[1]
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
).to(device)  # place the weights on the selected device (GPU if available)

# Report where the weights live by inspecting the first parameter.
first_param = next(model.parameters())
print(f"Model moved to: {first_param.device}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to: cuda:0


In [15]:
from sklearn.model_selection import train_test_split



In [None]:
# TODO: switch to stratified sampling; a plain random split is used because the stratified attempt raised an error that is still unresolved.
from sklearn.model_selection import train_test_split
import numpy as np

# Split sample positions rather than the data itself, so the resulting index
# arrays can later be used to select items of `dataset`.
total_samples = len(dataset)
indices = np.arange(total_samples)

# Hold out 20% of all samples for testing, then 20% of the remainder for
# validation. Both splits use a fixed random_state so they are repeatable.
trainval_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
train_idx, val_idx = train_test_split(trainval_idx, test_size=0.2, random_state=42)

n_train, n_val, n_test = len(train_idx), len(val_idx), len(test_idx)
print(f"Total samples: {total_samples}")
print(f"Train samples: {n_train}")
print(f"Validation samples: {n_val}")
print(f"Test samples: {n_test}")



Total samples: 37373
Train samples: 23918
Validation samples: 5980
Test samples: 7475


In [17]:
# Create subset datasets
from torch.utils.data import Subset

# Build one index-backed view of `dataset` per split.
train_dataset, val_dataset, test_dataset = (
    Subset(dataset, split_idx) for split_idx in (train_idx, val_idx, test_idx)
)

n_train_items, n_val_items, n_test_items = (
    len(train_dataset), len(val_dataset), len(test_dataset)
)
print(f"Train dataset size: {n_train_items}")
print(f"Validation dataset size: {n_val_items}")
print(f"Test dataset size: {n_test_items}")

Train dataset size: 23918
Validation dataset size: 5980
Test dataset size: 7475


In [18]:
from transformers import Trainer, TrainingArguments

In [19]:
# GPU memory monitoring and optimization functions
def print_gpu_memory():
    """Print the CUDA memory PyTorch has allocated and reserved, in GB.

    Prints "CUDA not available" instead when no CUDA device can be used.
    """
    if not torch.cuda.is_available():
        print("CUDA not available")
        return

    bytes_per_gb = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"GPU Memory Allocated: {allocated_gb:.2f} GB")
    print(f"GPU Memory Cached: {reserved_gb:.2f} GB")

def clear_gpu_cache():
    """Empty PyTorch's CUDA cache and confirm it; does nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    print("GPU cache cleared")

# Report GPU memory usage before training starts (the model has already been
# moved to the selected device at this point).
print("Initial GPU memory status:")
print_gpu_memory()

Initial GPU memory status:
GPU Memory Allocated: 0.43 GB
GPU Memory Cached: 0.46 GB


In [20]:
# Evaluation and checkpointing both run once per epoch, which lets
# load_best_model_at_end restore the checkpoint with the best "f1" metric.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    # Per-epoch evaluation/checkpointing and best-model selection
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

In [23]:
def compute_metrics(pred, threshold=0.5):
    """Compute micro-averaged F1, precision and recall for multi-label predictions.

    Args:
        pred: A pair that unpacks into ``(logits, labels)``. Both are presumably
            arrays of shape (n_samples, n_labels), with ``labels`` holding 0/1
            indicators -- confirm against the caller.
        threshold: A label counts as predicted when its sigmoid probability is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        dict: Scores under the keys ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > threshold).astype(int)

    # zero_division=0 reports 0.0 when a score is undefined (e.g. no label is
    # predicted positive). That is the same value the default setting produces,
    # but without an UndefinedMetricWarning on every evaluation pass.
    metric_kwargs = {'average': 'micro', 'zero_division': 0}
    return {
        'f1': f1_score(labels, preds, **metric_kwargs),
        'precision': precision_score(labels, preds, **metric_kwargs),
        'recall': recall_score(labels, preds, **metric_kwargs),
    }

In [24]:
# Wire the model, hyperparameters, train/validation splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [25]:
# Run fine-tuning with the settings in training_args (5 epochs, with evaluation
# and a checkpoint at the end of every epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0019,0.001837,0.0,0.0,0.0
2,0.0017,0.001744,0.0,0.0,0.0
3,0.0017,0.001755,0.0,0.0,0.0
4,0.0017,0.001758,0.0,0.0,0.0
5,0.0017,0.00176,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=14950, training_loss=0.004362873588517358, metrics={'train_runtime': 12755.7858, 'train_samples_per_second': 9.375, 'train_steps_per_second': 1.172, 'total_flos': 2.805414237002328e+16, 'train_loss': 0.004362873588517358, 'epoch': 5.0})

## Saving

In [26]:
# Save the fine-tuned model and its tokenizer into the same directory.
save_dir = './classification_unpruned/udc_model_1'

# Record which output index corresponds to which UDC code in the model config.
# Otherwise the mapping exists only in the in-memory `mlb` object and predictions
# from the reloaded model could not be decoded back into codes.
model.config.id2label = {idx: str(code) for idx, code in enumerate(mlb.classes_)}
model.config.label2id = {code: idx for idx, code in model.config.id2label.items()}

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./classification_unpruned/udc_model_1\\tokenizer_config.json',
 './classification_unpruned/udc_model_1\\special_tokens_map.json',
 './classification_unpruned/udc_model_1\\vocab.txt',
 './classification_unpruned/udc_model_1\\added_tokens.json',
 './classification_unpruned/udc_model_1\\tokenizer.json')