In [4]:
%pip install transformers datasets torch scikit-learn python-docx


Note: you may need to restart the kernel to use updated packages.


In [10]:
import ast

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from docx import Document
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [13]:
def process_docx_to_dataframe(file_path):
    """Read dictionary-style records from a .docx file into a DataFrame.

    Every non-empty paragraph is expected to hold one Python dict literal
    with the keys 'text' and 'severity_level', for example
    ``{'text': 'Hey, how are you?', 'severity_level': 0}``. Paragraphs that
    cannot be parsed as a literal, or that lack either key, are reported on
    stdout and skipped.

    Args:
        file_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        A pandas DataFrame with the columns 'text' and 'label' (taken from
        the record's 'severity_level'), one row per accepted paragraph. The
        frame is empty when no paragraph could be parsed.
    """
    document = Document(file_path)
    texts, severity_levels = [], []

    for paragraph in document.paragraphs:
        line = paragraph.text.strip()
        if not line:  # Skip empty lines
            continue
        try:
            # SECURITY: the paragraph text comes from a file, so it must not be
            # executed. ast.literal_eval only accepts Python literals; the
            # previous eval() would have run any expression found in the
            # document.
            data = ast.literal_eval(line)
            # Read both fields before appending anything, so a record that is
            # missing one key cannot leave the two lists with different
            # lengths (which would make the DataFrame constructor fail).
            text, severity_level = data['text'], data['severity_level']
        except (ValueError, SyntaxError, TypeError, KeyError,
                MemoryError, RecursionError) as e:
            print(f"Skipping line due to error: {e}")
            continue  # Skip invalid lines
        texts.append(text)
        severity_levels.append(severity_level)

    # Create a DataFrame
    return pd.DataFrame({'text': texts, 'label': severity_levels})

# Location of the source corpus.
# NOTE(review): this is an absolute path on one machine; point it at your own
# copy of the .docx file before running the notebook.
file_path = r"/Users/kunal/Downloads/VIT Downloads/Other Downloads/innosafe/CLEANcorpus4.docx"

# Parse the document into a DataFrame with 'text' and 'label' columns.
df = process_docx_to_dataframe(file_path)

# Preview the first rows as a quick sanity check on the parse.
print(df.head())

# Wrap the full (unsplit) DataFrame in a Hugging Face Dataset and print its
# summary (feature names and row count).
dataset = Dataset.from_pandas(df)
print(dataset)


                                      text  label
0                        Hey, how are you?      0
1                       That's a sexy car.      0
2  I can't stop thinking about last night.      5
3        Want to exchange explicit photos?      9
4         You look pretty in that picture.      2
Dataset({
    features: ['text', 'label'],
    num_rows: 687
})


In [14]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# same rows land in each split on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Convert each split to a Hugging Face Dataset (from plain Python lists), then
# bundle them. This rebinds `dataset`, which previously held the unsplit
# Dataset, to a DatasetDict with 'train' and 'test' entries.
train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
test_dataset = Dataset.from_dict({'text': test_texts.tolist(), 'label': test_labels.tolist()})
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [15]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize datasets
def tokenize_function(example):
    """Tokenize the 'text' field of an example (or a batch of examples).

    Forwards ``example['text']`` to the module-level ``tokenizer`` with
    truncation enabled and ``padding='max_length'``, and returns the
    tokenizer's result unchanged.
    """
    # NOTE(review): padding='max_length' presumably pads every sequence to the
    # tokenizer's maximum length; for short inputs, dynamic padding in the data
    # collator may be cheaper. Confirm before changing.
    tokenizer_options = {'padding': 'max_length', 'truncation': True}
    return tokenizer(example['text'], **tokenizer_options)

# Apply tokenize_function to every split of `dataset`; batched=True requests
# that map() pass batches of examples to the function instead of single rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare for PyTorch: rename the target column to "labels" (presumably the
# name the model's forward pass expects; confirm) and have the datasets
# return torch tensors.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/549 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

In [16]:
# Pretrained DistilBERT with a sequence-classification head sized for 11
# classes. The head weights are newly initialised (see the warning printed
# below), so the model must be fine-tuned before use.
# NOTE(review): 11 presumably corresponds to severity levels 0-10; confirm
# against the labels actually present in the corpus.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=11)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Hyperparameters and bookkeeping settings for the fine-tuning run.
# Evaluation and checkpointing use the same "epoch" cadence, so every
# evaluation has a matching checkpoint that load_best_model_at_end can reload.
# NOTE(review): no metric_for_best_model is set, so "best" is whatever the
# Trainer uses by default (presumably evaluation loss; confirm).
training_args = TrainingArguments(
    output_dir="./results",          # Directory where checkpoints are written
    eval_strategy="epoch",    # Run evaluation at the end of every epoch
    learning_rate=2e-5,             # Optimizer learning rate
    per_device_train_batch_size=8,  # Training batch size per device
    per_device_eval_batch_size=16,  # Evaluation batch size per device
    num_train_epochs=5,             # Number of passes over the training set
    weight_decay=0.01,              # Weight-decay coefficient
    logging_dir="./logs",           # Directory where training logs are written
    logging_steps=10,               # Log the training loss every 10 steps
    save_strategy="epoch",          # Save a checkpoint at the end of every epoch
    load_best_model_at_end=True     # Reload the best checkpoint when training finishes
)

In [18]:
# Wire the model, the training configuration and the tokenized splits together.
# The 'test' split doubles as the per-epoch evaluation set.
# No compute_metrics callback is passed, so the training log and evaluate()
# report loss (plus runtime statistics) only, with no accuracy or F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer
)

In [19]:
# Run fine-tuning with the settings in training_args. As the last expression of
# the cell, the returned TrainOutput summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.3312,2.27668
2,2.033,2.070662
3,1.8698,1.977864
4,1.6998,1.921424
5,1.5789,1.90238


TrainOutput(global_step=345, training_loss=1.953773562113444, metrics={'train_runtime': 1475.8578, 'train_samples_per_second': 1.86, 'train_steps_per_second': 0.234, 'total_flos': 363681371612160.0, 'train_loss': 1.953773562113444, 'epoch': 5.0})

In [20]:
# Persist the trainer's current model under ./results so it can be reloaded
# later without retraining.
trainer.save_model("./results/distilbert-severity-classifier")

In [21]:
# Evaluate on the eval_dataset given to the Trainer (the 'test' split) and
# print the resulting metrics dictionary.
results = trainer.evaluate()
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 1.9023796319961548, 'eval_runtime': 14.9464, 'eval_samples_per_second': 9.233, 'eval_steps_per_second': 0.602, 'epoch': 5.0}
