In [None]:
# ! pip install -q transformers datasets torch scikit-learn

In [1]:
! pip install -q 'accelerate>=0.26.0'

In [19]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Build Training Set

df_sampled contains training data which is retrieved using build_training_set method from Utils.py

In [20]:
from Utils import build_training_set
from sklearn.model_selection import train_test_split

corpus_location = './Documents'  # Path to your document folder
num_cases = 3000  # Number of training instances
chunk_size = 512  # Experiment with different chunk sizes
use_title = False  # Include document title if needed
respect_sentence_boundaries = True  # Try both True and False

# Build training set
df_sampled = build_training_set(num_cases, chunk_size, use_title, respect_sentence_boundaries)
df_sampled.head()

Unnamed: 0,Category,Chunk
2493,Philosophy,"Produced by Keith G. RichardsonChrist, Christi..."
33,Social Sciences,The CRIMINOLOGY SERIES.1. The Female Offender....
3683,Philosophy,"Produced by Heiko Evermann, Sandra Eder and th..."
4129,Social Sciences,"Produced by Chris Curnow, Paul Clark, and the ..."
3193,Philosophy,"Produced by Juliet Sutherland, Julia Neufeld a..."


# Tokenization
Split data into train and validation set. Tokenize.

In [21]:
# Convert category names to numeric labels
category_to_int = {cat: idx for idx, cat in enumerate(df_sampled['Category'].unique())}
df_sampled['labels'] = df_sampled['Category'].map(category_to_int)

# Split dataset into training and validation
train_data, val_data = train_test_split(df_sampled, test_size=0.2)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['Chunk'], padding='max_length', truncation=True, max_length=512)

# Convert to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

print(val_dataset)

Dataset({
    features: ['Category', 'Chunk', 'labels', '__index_level_0__'],
    num_rows: 600
})


# Train the model 

I tried different combinations of Chunk size and sentence alignment. And stored the best model to hugging face.

In [22]:
# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Define model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(category_to_int))

# Training arguments
training_args = TrainingArguments(
    output_dir='./distilbert_text_classification_multiclass_512',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    hub_model_id='distilbert_text_classification_multiclass_512'
)

# Compute metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return {'accuracy': accuracy_score(labels, predictions)}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained('./distilbert_text_classification_multiclass_512')
tokenizer.save_pretrained('./distilbert_text_classification_multiclass_512')

print("Model training complete and saved!")

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6258,0.58341,0.763333
2,0.5246,0.62865,0.778333
3,0.2514,0.70069,0.785


Model training complete and saved!


# What I tried:
1. Chunk size 512, sentence alignment = True (Best accuracy was obtained with these hyperparameters.)
2. Chunk size 256, sentence alignment = True
3. Chunk size 512, sentence alignment = False
4. Chunk size 256, sentence alignment = False

To push the best model to hugging face, I renamed path to specific model name. Same for the testing.

# Test

Testing the model before saving it. I got best results for Chunk_size 512 and sentence alignment was set to True. Gave a small chunk from a random document to the model, and it recognized the category correctly.

In [23]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load the trained model and tokenizer
model_path = './distilbert_text_classification_multiclass_512'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Move model to the correct device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Define category mapping (same as during training)
category_to_int = {cat: idx for idx, cat in enumerate(df_sampled['Category'].unique())}
int_to_category = {v: k for k, v in category_to_int.items()}

# Function to predict category
def predict_category(text):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move input to correct device

    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()  # Get predicted class index
    
    return int_to_category[predicted_class]

# Test with a sample text
# sample_text = ""
sample_text = "A Few Words About These United States Population Statistics.All figures listed below for years before 1992 are US CensusBureau figures as per the source files.  Where there were anassortment of figures for a specific year, we averaged them.1992 was an estimate.  Years after 1992 are our estimates ona predicted growth rate of 1%, as the average growth rate of"
predicted_label = predict_category(sample_text)
print(f"Predicted Category: {predicted_label}")


Predicted Category: Social Sciences


# Save best model to Hugging face

Saving model to hugging face. Here is the link to my model. 
https://huggingface.co/gaurinm30/distilbert_text_classification_multiclass_512

In [24]:
import json

# Load JSON file
with open('secrets.json', 'r') as file:
    secrets = json.load(file)

HF_TOKEN = secrets["HF_TOKEN"]

from huggingface_hub import login
login(token=HF_TOKEN)

In [26]:
from huggingface_hub import upload_file

# Upload model to Hugging Face Hub
trainer.push_to_hub()
# model.push_to_hub("distilbert_text_classification_multiclass_512")
# tokenizer.push_to_hub("distilbert_text_classification_multiclass_512")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/gaurinm30/distilbert_text_classification_multiclass_512/commit/2b0078370edb32a8108765a5691330931db8bd10', commit_message='End of training', commit_description='', oid='2b0078370edb32a8108765a5691330931db8bd10', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gaurinm30/distilbert_text_classification_multiclass_512', endpoint='https://huggingface.co', repo_type='model', repo_id='gaurinm30/distilbert_text_classification_multiclass_512'), pr_revision=None, pr_num=None)