In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import requests
import pandas as pd
from io import StringIO
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

from datasets import load_dataset

#load train data
import pandas as pd
cols = ['id', 'text', 'label', 'intensity']
path = "https://raw.githubusercontent.com/vinayakumarr/WASSA-2017/refs/heads/master/wassa/data/training/"
anger_train = pd.read_csv(StringIO(requests.get(path + 'anger-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)
fear_train = pd.read_csv(StringIO(requests.get(path + 'fear-ratings-0to1.train').text), header=None, sep='\t', names=cols, index_col=0)
sad_train = pd.read_csv(StringIO(requests.get(path + 'sadness-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)
joy_train = pd.read_csv(StringIO(requests.get(path + 'joy-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)

dataset = pd.concat([anger_train, fear_train, sad_train, joy_train], axis=0)

# Reset index for the combined DataFrame (optional)
dataset.reset_index(inplace=True)

from datasets import Dataset
import pandas as pd
dataset = Dataset.from_pandas(dataset)


# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Inspect the dataset
print(dataset)

def is_valid_intensity(example):
    if example['intensity'] is not None:
        #print(example['intensity'])
        try: 
            k = float(example['intensity'])
            return True
        except:
        
            return False
    else:
        return False

# Filter the dataset
dataset = dataset.filter(is_valid_intensity)
print(dataset)
# Split the shuffled dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and test datasets
train_data = train_test_split['train']
val_data = train_test_split['test']

# Inspect the datasets
print("Train Dataset:", train_data)
print("Test Dataset:", val_data)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 2470
})


Filter: 100%|██████████| 2470/2470 [00:00<00:00, 42191.92 examples/s]

Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 2466
})
Train Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 1972
})
Test Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 494
})





In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig, AutoModelForSequenceClassification
#from roberta import RobertaForSequenceClassification
# from modeling import CLMSequenceClassification


#model_name = "openai-community/gpt2-medium"
model_name = "HuggingFaceTB/SmolLM2-135M"
#config.num_labels=2
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from transformers.activations import ACT2FN
import random



model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to('cuda')
model.config.pad_token_id = tokenizer.eos_token_id
import RoCoFT

RoCoFT.PEFT(model, method='row', rank=3) 

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import DatasetDict


def generate_prompt(data_point):
    """
    Generates a prompt for evaluating the humor intensity of an edited headline.
    Args:
        data_point (dict): A dictionary containing 'original', 'edit', and 'meanGrade'.
    Returns:
        str: The formatted prompt as a string.
    """
    return f"""# Input: {data_point['text']} # Label: {data_point['label']} # Output: The intensity is"""  # noqa: E501


# Assuming `dataset` is your DatasetDict
def add_label_column(example):

    example['labels'] = float(example['intensity'])
  
    example['input'] = generate_prompt(example)

    
    return example

# Map the function over train and validation datasets
train_data = train_data.map(add_label_column)
val_data = val_data.map(add_label_column)

# Remove unnecessary columns

# Inspect the updated datasets
print("Train Dataset:", train_data)
print("Validation Dataset:", val_data)

Map: 100%|██████████| 1972/1972 [00:00<00:00, 10853.28 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 8876.00 examples/s]

Train Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 1972
})
Validation Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 494
})





In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding


tokenizer.padding_side = 'left'


# col_to_delete = ['idx']
col_to_delete = ['label', 'intensity','id', 'text']  # Update as per your dataset


mask_token = tokenizer.mask_token
def preprocessing_function(examples):
   
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_train_data = train_data.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
tokenized_val_data = val_data.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
tokenized_train_data.set_format("torch")
tokenized_val_data.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 1972/1972 [00:00<00:00, 22284.76 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 21095.14 examples/s]


In [5]:
tokenizer.decode(tokenized_train_data['input_ids'][10])

'# Input: @TehShockwave turn that grumpy frown upside-down\\n\\nYou did something next to impossible today # Label: sadness # Output: The intensity is'

In [6]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # If predictions are logits or have extra dimensions, squeeze
    if predictions.ndim > 1:
        predictions = predictions.squeeze()

    mae = mean_absolute_error(labels, predictions)
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, predictions)
    
    # Define an "accuracy" for regression:
    # Example: within some threshold tolerance
    tolerance = 0.1  # you can change this
    acc = np.mean(np.abs(predictions - labels) < tolerance)

    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)

    return {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "Accuracy": acc,
        "R2": r2,
        "Pearson": pearson_corr,
        "Spearman's Rank": spearman_corr
    }


In [9]:
from transformers import TrainingArguments, Trainer

import time
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='dir',
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps= 1,
    num_train_epochs=10,
    weight_decay=0.20,
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=10000000,
    logging_steps=100,
   
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,

    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [10]:
trainer.train()

Step,Training Loss,Validation Loss,Mae,Mse,Rmse,Accuracy,R2,Pearson,Spearman's rank
100,0.2689,0.070669,0.215945,0.070669,0.265837,0.271255,-0.984989,0.199836,0.198185
200,0.0572,0.075093,0.229594,0.075093,0.274032,0.226721,-1.109252,0.288488,0.288871
300,0.0406,0.059492,0.203394,0.059492,0.243911,0.273279,-0.671048,0.386607,0.388792
400,0.04,0.030909,0.143327,0.030909,0.175809,0.417004,0.131817,0.399279,0.373741
500,0.0375,0.045106,0.170517,0.045106,0.212382,0.354251,-0.266958,0.492493,0.477385
600,0.0362,0.027479,0.132382,0.027479,0.165769,0.451417,0.228149,0.535571,0.507853
700,0.0312,0.04679,0.180173,0.04679,0.216309,0.307692,-0.314246,0.560398,0.554342
800,0.0354,0.021108,0.116782,0.021108,0.145287,0.489879,0.4071,0.644532,0.637611
900,0.0193,0.020517,0.115064,0.020517,0.143239,0.5,0.4237,0.654396,0.644597
1000,0.0198,0.021826,0.1205,0.021826,0.147737,0.461538,0.38694,0.671096,0.649337


TrainOutput(global_step=2470, training_loss=0.029466515719166652, metrics={'train_runtime': 378.7954, 'train_samples_per_second': 52.06, 'train_steps_per_second': 6.521, 'total_flos': 3155610147840.0, 'train_loss': 0.029466515719166652, 'epoch': 10.0})