In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import numpy as np
import requests
import pandas as pd
from io import StringIO
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

from datasets import load_dataset

#load train data
import pandas as pd
cols = ['id', 'text', 'label', 'intensity']
path = "https://raw.githubusercontent.com/vinayakumarr/WASSA-2017/refs/heads/master/wassa/data/training/"
anger_train = pd.read_csv(StringIO(requests.get(path + 'anger-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)
fear_train = pd.read_csv(StringIO(requests.get(path + 'fear-ratings-0to1.train').text), header=None, sep='\t', names=cols, index_col=0)
sad_train = pd.read_csv(StringIO(requests.get(path + 'sadness-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)
joy_train = pd.read_csv(StringIO(requests.get(path + 'joy-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)

dataset = pd.concat([anger_train, fear_train, sad_train, joy_train], axis=0)

# Reset index for the combined DataFrame (optional)
dataset.reset_index(inplace=True)

from datasets import Dataset
import pandas as pd
dataset = Dataset.from_pandas(dataset)


# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Inspect the dataset
print(dataset)

def is_valid_intensity(example):
    if example['intensity'] is not None:
        #print(example['intensity'])
        try: 
            k = float(example['intensity'])
            return True
        except:
        
            return False
    else:
        return False

# Filter the dataset
dataset = dataset.filter(is_valid_intensity)
print(dataset)
# Split the shuffled dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and test datasets
train_data = train_test_split['train']
val_data = train_test_split['test']

# Inspect the datasets
print("Train Dataset:", train_data)
print("Test Dataset:", val_data)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 2470
})


Filter: 100%|██████████| 2470/2470 [00:00<00:00, 57545.58 examples/s]

Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 2466
})
Train Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 1972
})
Test Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 494
})





In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


model_name = "microsoft/deberta-v3-base"

#config.num_labels=2
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'



In [5]:
from datasets import DatasetDict

mask_token = tokenizer.mask_token

def generate_prompt(data_point):
    """
    Generates a prompt for evaluating the humor intensity of an edited headline.
    Args:
        data_point (dict): A dictionary containing 'original', 'edit', and 'meanGrade'.
    Returns:
        str: The formatted prompt as a string.
    """
    return f"""# Input: {data_point['text']} # Label: {data_point['label']} # Output: The intensity is"""  # noqa: E501


# Assuming `dataset` is your DatasetDict
def add_label_column(example):

    example['labels'] = float(example['intensity'])
  
    example['input'] = generate_prompt(example)

    
    return example

# Map the function over train and validation datasets
train_data = train_data.map(add_label_column)
val_data = val_data.map(add_label_column)

# Remove unnecessary columns

# Inspect the updated datasets
print("Train Dataset:", train_data)
print("Validation Dataset:", val_data)

Map: 100%|██████████| 1972/1972 [00:00<00:00, 9564.29 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 10729.06 examples/s]

Train Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 1972
})
Validation Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 494
})





In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding


tokenizer.padding_side = 'left'


# col_to_delete = ['idx']
col_to_delete = ['label', 'intensity','id', 'text']  # Update as per your dataset


mask_token = tokenizer.mask_token
def preprocessing_function(examples):
   
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_train_data = train_data.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
tokenized_val_data = val_data.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
tokenized_train_data.set_format("torch")
tokenized_val_data.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 1972/1972 [00:00<00:00, 15356.62 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 16740.21 examples/s]


In [7]:
tokenizer.decode(tokenized_train_data['input_ids'][10])


'[CLS] # Input: @TehShockwave turn that grumpy frown upside-down\\n\\nYou did something next to impossible today # Label: sadness # Output: The intensity is[SEP]'

In [8]:
val_data

Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 494
})

In [9]:
all_lengths = [len(ids) for ids in tokenized_train_data['input_ids']]
mx = max(all_lengths)
mx


74

In [10]:
count = sum(len(ids) > 512 for ids in tokenized_train_data['input_ids'])
print(count)


0


In [14]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers.activations import ACT2FN
import random
# from modeling import MLMSequenceClassification

config = AutoConfig.from_pretrained(model_name)
config.num_labels=1
config.mask_token_id = tokenizer.mask_token_id

model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [18]:
import RoCoFT

RoCoFT.PEFT(model, method='column', rank=3) 
#targets=['key', 'value', 'dense', 'query'])

In [19]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # If predictions are logits or have extra dimensions, squeeze
    if predictions.ndim > 1:
        predictions = predictions.squeeze()

    mae = mean_absolute_error(labels, predictions)
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, predictions)
    
    # Define an "accuracy" for regression:
    # Example: within some threshold tolerance
    tolerance = 0.1  # you can change this
    acc = np.mean(np.abs(predictions - labels) < tolerance)

    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)

    return {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "Accuracy": acc,
        "R2": r2,
        "Pearson": pearson_corr,
        "Spearman's Rank": spearman_corr
    }


In [20]:
from transformers import TrainingArguments, Trainer

import time
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='dir',
    learning_rate=6e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.20,
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=10000000,
    logging_steps=100,
   
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,

    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

Step,Training Loss,Validation Loss,Mae,Mse,Rmse,Accuracy,R2,Pearson,Spearman's rank
100,0.1114,0.039056,0.160943,0.039056,0.197625,0.378543,-0.097011,0.144227,0.152214
200,0.0446,0.032829,0.148751,0.032829,0.181187,0.388664,0.077897,0.307421,0.310348
300,0.0381,0.030514,0.142554,0.030514,0.174683,0.408907,0.14291,0.387514,0.385067
400,0.0352,0.039344,0.16004,0.039344,0.198353,0.368421,-0.105105,0.434031,0.458547
500,0.0315,0.023204,0.122167,0.023204,0.152327,0.47166,0.348247,0.614991,0.605004
600,0.0259,0.021592,0.116626,0.021592,0.146944,0.497976,0.393502,0.674384,0.66992
700,0.0218,0.019436,0.112293,0.019436,0.139412,0.522267,0.45408,0.720442,0.711352
800,0.0219,0.021723,0.118982,0.021723,0.147389,0.495951,0.389824,0.746327,0.738918
900,0.0187,0.020423,0.115105,0.020423,0.142907,0.516194,0.426363,0.766486,0.757401
1000,0.0175,0.021739,0.119425,0.021739,0.147441,0.497976,0.389394,0.7672,0.763054


TrainOutput(global_step=1240, training_loss=0.03288155774916372, metrics={'train_runtime': 207.2104, 'train_samples_per_second': 95.169, 'train_steps_per_second': 5.984, 'total_flos': 2490025509312.0, 'train_loss': 0.03288155774916372, 'epoch': 10.0})

ValueError: Column to remove ['corpus', 'complexity', 'token', 'sentence'] not in the dataset. Current columns in the dataset: ['id', 'text', 'label', 'intensity', 'labels', 'input']

In [None]:
Y_tensor.shape

torch.Size([7232])

In [None]:
Z_layer_outputs[0][-1].mean(dim=1).shape

torch.Size([8, 768])