# Imports

In [1]:
# imports
import pandas as pd
import numpy as np
# import matplotlib as plt
import random as rn
import os
os.environ['PYTHONHASHSEED'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(37)
rn.seed(1254)

# Load data, train, test, validation splits

In [None]:
# path_to_data = "./data_for_model/Sentences_420.csv"

In [2]:
# EDA
# Load data and create the combined input column 
path_to_data = "./data_for_model/Sentences_420.csv"
df = pd.read_csv(path_to_data, index_col='S.No.')

# Programmatically create the input string required by the model
# Format: USER_CONTEXT: "value"; URL_ROOT: "value"
df['InputText'] = df.apply(
    lambda row: f"USER_CONTEXT: \"{row['USER_CONTEXT']}\"; URL_ROOT: \"{row['URL_ROOT']}\"",
    axis=1
)

print("Data loaded successfully. Shape:", df.shape)
print("First 5 rows with the new 'InputText' column:")
display(df.head())
display(df.describe())


Data loaded successfully. Shape: (420, 4)
First 5 rows with the new 'InputText' column:


Unnamed: 0_level_0,USER_CONTEXT,URL_ROOT,Label,InputText
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,developer,stackoverflow,1,"USER_CONTEXT: ""developer""; URL_ROOT: ""stackove..."
2,marketing,stackoverflow,1,"USER_CONTEXT: ""marketing""; URL_ROOT: ""stackove..."
3,human_resource,stackoverflow,1,"USER_CONTEXT: ""human_resource""; URL_ROOT: ""sta..."
4,sales,stackoverflow,1,"USER_CONTEXT: ""sales""; URL_ROOT: ""stackoverflow"""
5,developer,github,1,"USER_CONTEXT: ""developer""; URL_ROOT: ""github"""


Unnamed: 0,Label
count,420.0
mean,0.047619
std,0.920526
min,-1.0
25%,-1.0
50%,0.0
75%,1.0
max,1.0


In [3]:
# Make test, train, cv splits using percentages
from datasets import Dataset

# The old code used a 'Sentence' column. We select only the necessary columns for the new task.
ds = Dataset.from_pandas(df[['InputText', 'Label']])

# Split: 80% for training, 20% for temp (test + validation)
ds_train_temp_dict = ds.train_test_split(test_size=0.2, seed=42)
ds_train = ds_train_temp_dict['train']

# Split the 20% temp set into half for validation (10%) and test (10%)
ds_test_cv_dict = ds_train_temp_dict['test'].train_test_split(test_size=0.5, seed=42)
ds_cv = ds_test_cv_dict['train']
ds_test = ds_test_cv_dict['test']

print("Dataset splits created:")
display(ds_train)
display(ds_cv)
display(ds_test)


  from .autonotebook import tqdm as notebook_tqdm


Dataset splits created:


Dataset({
    features: ['InputText', 'Label', 'S.No.'],
    num_rows: 336
})

Dataset({
    features: ['InputText', 'Label', 'S.No.'],
    num_rows: 42
})

Dataset({
    features: ['InputText', 'Label', 'S.No.'],
    num_rows: 42
})

# Fine tune LLM

In [4]:
# Get Tokenizer and Tokenize the 'InputText' column
from transformers import AutoTokenizer

model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)

def tokenize_dataset(ds, tokenizer):
    '''
    Tokenizes the 'InputText' column and casts the 'labels' column to float.
    '''
    # Tokenize the input text
    tokenized_ds = ds.map(lambda row: tokenizer(row['InputText']), batched=False)
    
    # Rename 'Label' to 'labels' as required by the Trainer
    if 'Label' in tokenized_ds.column_names:
        tokenized_ds = tokenized_ds.rename_columns({'Label' : 'labels'})
        
    # --- THIS IS THE FIX ---
    # Cast the 'labels' column to a float type to match model output
    tokenized_ds = tokenized_ds.map(lambda example: {'labels': float(example['labels'])})
    # ----------------------
    
    return tokenized_ds

# Apply the corrected tokenization function to the train and validation splits
tokenized_ds_train = tokenize_dataset(ds_train, tokz)
tokenized_ds_cv = tokenize_dataset(ds_cv, tokz)

print("Training dataset after tokenization and casting labels to float:")
display(tokenized_ds_train)
print("Note the data type of the 'labels' column is now float:")
print(tokenized_ds_train.features)


Map: 100%|██████████| 336/336 [00:00<00:00, 4852.01 examples/s]
Map: 100%|██████████| 336/336 [00:00<00:00, 9809.12 examples/s]
Map: 100%|██████████| 42/42 [00:00<00:00, 2592.35 examples/s]
Map: 100%|██████████| 42/42 [00:00<?, ? examples/s]

Training dataset after tokenization and casting labels to float:





Dataset({
    features: ['InputText', 'labels', 'S.No.', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 336
})

Note the data type of the 'labels' column is now float:
{'InputText': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'S.No.': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [5]:
# NEW CELL: Define a custom data collator for regression
from transformers import DataCollatorWithPadding
import torch

class DataCollatorForRegression(DataCollatorWithPadding):
    def __init__(self, tokenizer):
        super().__init__(tokenizer=tokenizer)

    def __call__(self, features):
        # This is the default behavior of DataCollatorWithPadding
        batch = super().__call__(features)
        
        # --- THIS IS THE FIX ---
        # The default collator may leave labels as Long. We explicitly cast them to Float.
        # This ensures the data type is correct right before it hits the model's loss function.
        batch["labels"] = batch["labels"].to(torch.float)
        # ----------------------
        
        return batch

In [17]:
# Get the model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

my_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# Instantiate our custom data collator
data_collator = DataCollatorForRegression(tokenizer=tokz)

# Define Training Arguments (with fp16=False for CPU)
# bs = 5
# epochs = 4
bs = 100
epochs = 1
lr = 8e-5

args = TrainingArguments(
    'outputs', 
    learning_rate=lr, 
    warmup_ratio=0.1, 
    lr_scheduler_type='cosine',
    evaluation_strategy="epoch", 
    per_device_train_batch_size=bs, 
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, 
    weight_decay=0.01, 
    report_to='none'
)

# --- THIS IS THE FIX ---
# Pass the custom data_collator to the Trainer
trainer = Trainer(
    my_model, 
    args, 
    train_dataset=tokenized_ds_train, 
    eval_dataset=tokenized_ds_cv,
    tokenizer=tokz,
    data_collator=data_collator  # <-- Use our custom collator here
)
# ----------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [11]:
# Train (Here, fine tune) the model
trainer.train()

  6%|▋         | 17/272 [02:02<30:30,  7.18s/it]
100%|██████████| 4/4 [00:44<00:00,  9.31s/it]
                                             
[A                                            

100%|██████████| 4/4 [00:48<00:00,  9.31s/it]
[A
                                             
100%|██████████| 4/4 [00:48<00:00, 12.25s/it]  

{'eval_loss': 0.7904660105705261, 'eval_runtime': 0.7498, 'eval_samples_per_second': 56.014, 'eval_steps_per_second': 1.334, 'epoch': 1.0}
{'train_runtime': 48.9708, 'train_samples_per_second': 6.861, 'train_steps_per_second': 0.082, 'train_loss': 0.8590915203094482, 'epoch': 1.0}





TrainOutput(global_step=4, training_loss=0.8590915203094482, metrics={'train_runtime': 48.9708, 'train_samples_per_second': 6.861, 'train_steps_per_second': 0.082, 'total_flos': 1964278084080.0, 'train_loss': 0.8590915203094482, 'epoch': 1.0})

In [13]:
# Report loss for your model using the test set
# Use the corrected tokenization function, passing 'tokz'
tokenized_ds_test = tokenize_dataset(ds_test, tokz)

preds = trainer.predict(tokenized_ds_test).predictions.astype(float)

# Using MAE to calculate loss
def get_mae(preds, real):
    '''
    preds, real: array
    '''
    mae = np.mean(np.abs(preds - real))
    return mae

real = np.array(tokenized_ds_test['labels'])
print(f"MAE on Test Set: {get_mae(preds, real)}")


Map: 100%|██████████| 42/42 [00:00<00:00, 173.77 examples/s]
Map: 100%|██████████| 42/42 [00:00<00:00, 1346.03 examples/s]
100%|██████████| 1/1 [00:00<?, ?it/s]

MAE on Test Set: 0.9516604943644433





In [None]:
# MAE of my model: 0.9 (Based on test set)

In [15]:
import torch

# Ensure the model is in evaluation mode
my_model.eval()

# 1. Define a few new examples to test the model's logic
test_examples = [
    # --- Test Case 1: Context-dependent site (YouTube) ---
    'USER_CONTEXT: "developer"; URL_ROOT: "youtube"',      # Expected: -1 (Distractive)
    'USER_CONTEXT: "marketing"; URL_ROOT: "youtube"',      # Expected: 1 (Productive)
    
    # --- Test Case 2: Context-dependent site (LinkedIn) ---
    'USER_CONTEXT: "sales"; URL_ROOT: "linkedin"',         # Expected: 1 (Productive)
    'USER_CONTEXT: "developer"; URL_ROOT: "linkedin"',     # Expected: -1 (Distractive)
    
    # --- Test Case 3: "Always Distractive" site ---
    'USER_CONTEXT: "human_resource"; URL_ROOT: "netflix"', # Expected: -1 (Distractive)
    
    # --- Test Case 4: "Always Neutral" site ---
    'USER_CONTEXT: "developer"; URL_ROOT: "google"',       # Expected: 0 (Neutral)
    
    # --- Test Case 5: "Always Productive" site ---
    'USER_CONTEXT: "sales"; URL_ROOT: "github"',           # Expected: 1 (Productive)
]

# 2. Tokenize the examples
# We send the text to the tokenizer and get PyTorch tensors back
inputs = tokz(test_examples, padding=True, truncation=True, return_tensors="pt")

# Move tensors to the same device as the model (CPU in your case)
device = my_model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

# 3. Make predictions
# We use torch.no_grad() to disable gradient calculations, which saves memory and speeds up inference
with torch.no_grad():
    outputs = my_model(**inputs)

# The model outputs raw logits. For our regression task, this is a single value per input.
# We flatten the output and move it back to the CPU to work with numpy/pandas
predictions = outputs.logits.flatten().cpu().numpy()


# 4. Display the results
print("="*40)
print("   Prediction Results on New Examples")
print("="*40)

for i, example in enumerate(test_examples):
    raw_score = predictions[i]
    
    # Round the raw score to the nearest integer to get the final label
    final_label = round(raw_score)
    
    print(f"\nInput:           '{example}'")
    print(f"Predicted Score: {raw_score:.4f}")
    print(f"Final Label:     {final_label}  (-1: Distractive, 0: Neutral, 1: Productive)")
    print("-" * 40)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


   Prediction Results on New Examples

Input:           'USER_CONTEXT: "developer"; URL_ROOT: "youtube"'
Predicted Score: 0.1749
Final Label:     0  (-1: Distractive, 0: Neutral, 1: Productive)
----------------------------------------

Input:           'USER_CONTEXT: "marketing"; URL_ROOT: "youtube"'
Predicted Score: 0.1754
Final Label:     0  (-1: Distractive, 0: Neutral, 1: Productive)
----------------------------------------

Input:           'USER_CONTEXT: "sales"; URL_ROOT: "linkedin"'
Predicted Score: 0.1747
Final Label:     0  (-1: Distractive, 0: Neutral, 1: Productive)
----------------------------------------

Input:           'USER_CONTEXT: "developer"; URL_ROOT: "linkedin"'
Predicted Score: 0.1746
Final Label:     0  (-1: Distractive, 0: Neutral, 1: Productive)
----------------------------------------

Input:           'USER_CONTEXT: "human_resource"; URL_ROOT: "netflix"'
Predicted Score: 0.1720
Final Label:     0  (-1: Distractive, 0: Neutral, 1: Productive)
---------------

# Check if your GPU is available

In [16]:
import torch
torch.cuda.is_available()

False