In [None]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
# Load the labelled sentences. The rest of this cell reads the "text" and
# "label" columns of the CSV.
df = pd.read_csv("Dataset.csv")

# Fail early, with a readable message, if the file does not have the shape the
# rest of the notebook relies on.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset.csv is missing required column(s): {sorted(missing_columns)}"
    )
if df[["text", "label"]].isna().any().any():
    raise ValueError(
        "Dataset.csv contains rows with a missing 'text' or 'label' value"
    )

# Integer ids are assigned in order of first appearance in the file, so the
# mapping follows the row order of the CSV.
label_names = df['label'].unique().tolist()
label_map = {label: i for i, label in enumerate(label_names)}
print("Label mapping:", label_map)

# Replace the string labels with their integer ids.
df['label'] = df['label'].map(label_map)

dataset = Dataset.from_pandas(df)

# 80/20 train/test split; the fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# NOTE(review): model_name is not referenced by any visible cell — confirm
# whether it is still needed.
model_name = "./"
# Preview a few rows rather than rendering the whole frame.
df.head()

Label mapping: {'exam': 0, 'admin': 1, 'academic': 2, 'event': 3}


Unnamed: 0,text,label
0,The exam results will be published online,0
1,The semester deadline has been extended,1
2,exam guidelines have been updated,1
3,The academic deadline has been extended,1
4,Staff exam meeting will be held in the main hall,1
...,...,...
615,New college course materials are uploaded online,2
616,Extra sports classes are scheduled for this week,2
617,Extra semester classes are scheduled for this ...,2
618,Final sports exam instructions have been updated,0


In [None]:
# Single source of truth for the pretrained checkpoint used by both the
# tokenizer and the model.
CHECKPOINT = "distilbert-base-uncased"

# Inverse of label_map (label name -> id) so the model config carries
# human-readable class names.
id2label = {idx: name for name, idx in label_map.items()}

tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(
    CHECKPOINT,
    # Derive the head size from the data instead of hard-coding it, so the
    # classifier always matches the number of distinct labels in the CSV.
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
    """Tokenize the "text" field of a batch of examples.

    Args:
        batch: Mapping with a "text" entry holding the raw sentences
            (a batch of rows, because ``map`` is called with ``batched=True``).

    Returns:
        The tokenizer output for the batch, truncated but NOT padded.
    """
    # No padding here: padding="max_length" would stretch every short sentence
    # to the tokenizer's full model length. The Trainer is given the tokenizer,
    # so its collator can pad each batch to the longest sequence in that batch.
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)
# The raw strings are no longer needed once they are tokenized.
dataset = dataset.remove_columns(["text"])
# The target column is renamed to "labels" for the model.
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch")


Map:   0%|          | 0/496 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration.
# With 496 training examples, batch size 8 and 5 epochs, a single-device run
# takes 62 * 5 = 310 optimizer steps. The previous logging_steps=500 was larger
# than that, so no training loss was ever logged.
# NOTE(review): save_steps=500 also exceeds the step count, so presumably no
# intermediate checkpoint is written — confirm whether that is intended.
training_args = TrainingArguments(
    output_dir="./results",
    save_steps=500,
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# NOTE(review): recent transformers releases may warn that `tokenizer` is
# deprecated in favour of `processing_class`; it is kept here so the cell still
# runs on older releases — confirm against the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer
)

train_output = trainer.train()

# The held-out split was passed to the Trainer but never scored; evaluate it
# once after training so the run reports more than the training loss.
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

train_output

  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=310, training_loss=0.22628208283455142, metrics={'train_runtime': 124.029, 'train_samples_per_second': 19.995, 'train_steps_per_second': 2.499, 'total_flos': 328530866012160.0, 'train_loss': 0.22628208283455142, 'epoch': 5.0})

In [None]:
import torch

# Classify one example sentence with the fine-tuned model.
text = "New admin course materials are uploaded online"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Run on the GPU when one is available; inputs and model must share a device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {key: value.to(device) for key, value in inputs.items()}
model.to(device)

# Switch to inference mode so dropout is disabled and the prediction is
# deterministic, and skip autograd bookkeeping for the forward pass.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()

# Invert the label-name -> id mapping built when the data was loaded, instead
# of overwriting label_map with a hand-typed copy that can drift from the data.
id_to_label = {idx: name for name, idx in label_map.items()}
print("Predicted:", id_to_label[predicted_class])

Predicted: academic
