In [4]:
# !pip install datasets
# !pip install --upgrade pyarrow

## Data Preparation

In [16]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np

# Names of the seven score columns that serve as the model's targets.
label_columns = ['Cinematography', 'Direction', 'Story', 'Characters', 'Production Design', 'Unique Concept', 'Emotions']

# Read the training reviews, discard the stray 'Unnamed: 0' column when it is
# present, and cast every label column to float.
df = (
    pd.read_csv('../data/train_reviews.csv', encoding='latin')
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .astype({column: float for column in label_columns})
)

# Wrap the cleaned frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)


### Load Model

In [6]:
# Release any unoccupied cached GPU memory before loading the model.
torch.cuda.empty_cache()

# Use the first GPU when one is available; otherwise fall back to the CPU so
# this cell does not raise on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model; the classification head gets one output per label column.
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-small', num_labels=len(label_columns)).to(device)
# The targets are continuous scores, so configure the model for regression.
model.config.problem_type = "regression"


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# Record the tokenizer's vocabulary size; check_vocab compares token IDs against it.
# NOTE(review): nothing here compares this with the model's embedding size — confirm whether that check is also wanted.
vocab_size = len(tokenizer)

# Tokenize your data
def tokenize_function(example):
    """Tokenize the ``review`` field, truncating and padding each sequence to 512 tokens.

    Uses the module-level ``tokenizer`` and returns whatever it returns.
    """
    reviews = example['review']
    encoding_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(reviews, **encoding_options)

# Check if token IDs are within vocabulary size
def check_vocab(example):
    """Print a warning when any token ID in ``example['input_ids']`` is >= ``vocab_size``.

    The example is returned unchanged so the function can be used with ``Dataset.map``.
    """
    input_ids = example['input_ids']
    for token_id in input_ids:
        if token_id >= vocab_size:
            # Report once per example, then stop scanning.
            print(f"Found token ID out of bounds: {input_ids}")
            break
    return example



In [8]:

# Convert labels to float32 for PyTorch
def format_labels(example):
    """Collect the label columns of ``example`` into a float32 tensor stored under ``'labels'``.

    Values are taken in the order given by the module-level ``label_columns``.
    """
    values = []
    for column in label_columns:
        values.append(example[column])
    example['labels'] = torch.tensor(values, dtype=torch.float32)
    return example


In [9]:
# Tokenize the reviews in batches, then run the per-example vocabulary check
# and label formatting.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .map(check_vocab)
    .map(format_labels)
)


Map: 100%|██████████| 48577/48577 [00:30<00:00, 1596.81 examples/s]
Map: 100%|██████████| 48577/48577 [00:08<00:00, 5636.22 examples/s]
Map: 100%|██████████| 48577/48577 [00:03<00:00, 13565.37 examples/s]


# TRAIN

In [17]:
from transformers import TrainingArguments, Trainer
import torch

# Define your training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Required argument; checkpoint saving is disabled below
    evaluation_strategy="no",  # No evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="no",  # Do not save checkpoints during training
    logging_strategy="no",  # Disable logging
    use_cpu=False,  # Do not force CPU; training runs on the GPU when one is available
)

# Define a custom compute_metrics function for regression
def compute_metrics(p):
    """Return ``{"mse": ...}``: the mean squared difference between ``p.predictions`` and ``p.label_ids``.

    The mean is taken over every element and converted to a plain Python number.
    """
    residuals = p.predictions - p.label_ids
    return {"mse": (residuals ** 2).mean().item()}

# Custom Trainer class to compute loss for regression
class CustomTrainer(Trainer):
    """Trainer that scores the model's logits against the labels with mean squared error."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; it must contain a ``"labels"`` entry, which is
                removed before the remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass this
                keyword do not fail with a TypeError; it is not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Read num_labels from self.model rather than the `model` argument.
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

# Build the trainer around the model, tokenizer and training arguments.
trainer = CustomTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Same data as training; swap in a held-out set if one exists
)

# Run fine-tuning; use_cpu=False in training_args means the CPU is not forced.
trainer.train()


100%|██████████| 121450/121450 [15:21:26<00:00,  2.20it/s]  

{'train_runtime': 55286.1561, 'train_samples_per_second': 8.786, 'train_steps_per_second': 2.197, 'train_loss': 0.028372473561650884, 'epoch': 10.0}





TrainOutput(global_step=121450, training_loss=0.028372473561650884, metrics={'train_runtime': 55286.1561, 'train_samples_per_second': 8.786, 'train_steps_per_second': 2.197, 'total_flos': 6.435671823307776e+16, 'train_loss': 0.028372473561650884, 'epoch': 10.0})

In [18]:
import gc 
# Run a garbage-collection pass so unreachable Python objects are freed.
gc.collect()

# Then ask PyTorch to release its unoccupied cached GPU memory.
torch.cuda.empty_cache()

In [19]:
# Save the model
# The model and the tokenizer are written to the same directory, which the
# test cells later reload with from_pretrained('./deberta-v3-fine-tuned').
model.save_pretrained('./deberta-v3-fine-tuned')
tokenizer.save_pretrained('./deberta-v3-fine-tuned')

('./deberta-v3-fine-tuned/tokenizer_config.json',
 './deberta-v3-fine-tuned/special_tokens_map.json',
 './deberta-v3-fine-tuned/spm.model',
 './deberta-v3-fine-tuned/added_tokens.json',
 './deberta-v3-fine-tuned/tokenizer.json')

# TEST

In [20]:
import pandas as pd
from datasets import Dataset
import csv


# Load test data from CSV
test_data_path = '../data/test_reviews.csv'  # Update with your test data path

# Score columns; they are the trailing fields of every CSV row.
label_columns = ['Cinematography', 'Direction', 'Story', 'Characters', 'Production Design', 'Unique Concept', 'Emotions']
num_label_fields = len(label_columns)

# Read the CSV row by row. The trailing fields are the label scores; any fields
# before them are re-joined with commas to rebuild a review that contained
# unquoted commas. The file is opened with newline='' as the csv module
# requires, so newlines inside quoted fields are parsed correctly.
rows = []
with open(test_data_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    headers = next(reader)
    for row in reader:
        if not row:
            # Blank lines yield an empty list and would break the DataFrame shape.
            continue
        review = ','.join(row[:-num_label_fields])
        scores = row[-num_label_fields:]
        rows.append([review] + scores)

# Create DataFrame
test_df = pd.DataFrame(rows, columns=headers)

# Drop unnecessary column if it exists (on the test frame, not the training frame)
if 'Unnamed: 0' in test_df.columns:
    test_df = test_df.drop(columns=['Unnamed: 0'])

# csv.reader yields strings, so cast the test labels to float
test_df[label_columns] = test_df[label_columns].astype(float)

# Convert DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['review', 'Cinematography', 'Direction', 'Story', 'Characters', 'Production Design', 'Unique Concept', 'Emotions'],
    num_rows: 793
})

In [21]:
# Reload the fine-tuned tokenizer and model from the directory saved earlier
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch

# Both are read from './deberta-v3-fine-tuned', the directory the save cell wrote to.
tokenizer = AutoTokenizer.from_pretrained('./deberta-v3-fine-tuned')
model = AutoModelForSequenceClassification.from_pretrained('./deberta-v3-fine-tuned')
# Same problem type as was set on the model before training.
model.config.problem_type = "regression"


In [22]:
import pandas as pd
from datasets import Dataset
import csv
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report
import numpy as np

# Load test data from CSV
test_data_path = '../data/test_reviews.csv'  # Update with your test data path

# Score columns; they are the trailing fields of every CSV row.
label_columns = ['Cinematography', 'Direction', 'Story', 'Characters', 'Production Design', 'Unique Concept', 'Emotions']
num_label_fields = len(label_columns)

# Read the CSV row by row. The trailing fields are the label scores; any fields
# before them are re-joined with commas to rebuild a review that contained
# unquoted commas. The file is opened with newline='' as the csv module
# requires, so newlines inside quoted fields are parsed correctly.
rows = []
with open(test_data_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    headers = next(reader)
    for row in reader:
        if not row:
            # Blank lines yield an empty list and would break the DataFrame shape.
            continue
        review = ','.join(row[:-num_label_fields])
        scores = row[-num_label_fields:]
        rows.append([review] + scores)

# Create DataFrame
test_df = pd.DataFrame(rows, columns=headers)

# Drop unnecessary column if it exists
if 'Unnamed: 0' in test_df.columns:
    test_df = test_df.drop(columns=['Unnamed: 0'])

# csv.reader yields strings, so cast the labels to float
test_df[label_columns] = test_df[label_columns].astype(float)

# Convert DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)

# Load the fine-tuned tokenizer and model from './deberta-v3-fine-tuned'
tokenizer = AutoTokenizer.from_pretrained('./deberta-v3-fine-tuned')
model = AutoModelForSequenceClassification.from_pretrained('./deberta-v3-fine-tuned')
model.config.problem_type = "regression"

# Record the tokenizer's vocabulary size; check_vocab compares token IDs against it
vocab_size = len(tokenizer)

# Tokenize your data
def tokenize_function(example):
    """Tokenize the ``review`` field, truncating and padding each sequence to 512 tokens.

    Uses the module-level ``tokenizer`` and returns whatever it returns.
    """
    reviews = example['review']
    encoding_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(reviews, **encoding_options)

# Check if token IDs are within vocabulary size
def check_vocab(example):
    """Print a warning when any token ID in ``example['input_ids']`` is >= ``vocab_size``.

    The example is returned unchanged so the function can be used with ``Dataset.map``.
    """
    input_ids = example['input_ids']
    for token_id in input_ids:
        if token_id >= vocab_size:
            # Report once per example, then stop scanning.
            print(f"Found token ID out of bounds: {input_ids}")
            break
    return example

# Convert labels to float32 for PyTorch
def format_labels(example):
    """Collect the label columns of ``example`` into a float32 tensor stored under ``'labels'``.

    Each value is passed through ``float()`` first; columns are read in the
    order given by the module-level ``label_columns``.
    """
    values = []
    for column in label_columns:
        values.append(float(example[column]))
    example['labels'] = torch.tensor(values, dtype=torch.float32)
    return example

# Tokenize the test reviews in batches, then run the per-example vocabulary
# check and label formatting.
tokenized_test_dataset = (
    test_dataset
    .map(tokenize_function, batched=True)
    .map(check_vocab)
    .map(format_labels)
)

# Define a custom compute_metrics function for regression
def compute_metrics(p):
    """Return ``{"mse": ...}`` computed by ``mean_squared_error`` from ``p.label_ids`` and ``p.predictions``."""
    y_true, y_pred = p.label_ids, p.predictions
    return {"mse": mean_squared_error(y_true, y_pred)}

# Custom Trainer class to compute loss for regression
class CustomTrainer(Trainer):
    """Trainer that scores the model's logits against the labels with mean squared error."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; it must contain a ``"labels"`` entry, which is
                removed before the remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass this
                keyword do not fail with a TypeError; it is not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Read num_labels from self.model rather than the `model` argument.
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

# Initialize Trainer (without training)
# This Trainer is only used for evaluate()/predict() in this cell, so the
# training-only settings (learning rate, epochs, weight decay, train batch
# size) have no effect on the results.
training_args = TrainingArguments(
    output_dir='./results',  # Required argument; checkpoint saving is disabled below
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="no",  # Do not save checkpoints
    logging_strategy="no",  # Disable logging
    use_cpu=False,  # Do not force CPU; inference runs on the GPU when one is available
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=None,  # No training dataset
    eval_dataset=tokenized_test_dataset,  # Use the tokenized test dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the Trainer's evaluation loop over the test set; the returned metrics are printed last.
test_results = trainer.evaluate(tokenized_test_dataset)

# Second pass over the same data to get the raw predictions alongside the labels.
predictions = trainer.predict(tokenized_test_dataset)
preds, labels = predictions.predictions, predictions.label_ids

# Overall mean squared error across all outputs
mse = mean_squared_error(labels, preds)
print(f"Mean Squared Error: {mse}")

# Round scores to the nearest integer so each output can be treated as a class.
# NOTE(review): a prediction that rounds to a value absent from the labels would
# show up as an extra class in the reports below — confirm whether clipping is wanted.
preds_class, labels_class = (np.round(scores).astype(int) for scores in (preds, labels))

# One classification report and confusion matrix per label column
for i, label in enumerate(label_columns):
    y_true, y_pred = labels_class[:, i], preds_class[:, i]
    print(f"\nClassification Report for {label}:")
    print(classification_report(y_true, y_pred))
    print(f"Confusion Matrix for {label}:")
    print(confusion_matrix(y_true, y_pred))

# Finally, show the metrics returned by trainer.evaluate
print("Test Results:", test_results)


Map: 100%|██████████| 793/793 [00:00<00:00, 8235.69 examples/s]
Map: 100%|██████████| 793/793 [00:00<00:00, 5703.64 examples/s]
Map: 100%|██████████| 793/793 [00:00<00:00, 10220.27 examples/s]
100%|██████████| 199/199 [00:23<00:00,  8.64it/s]
100%|██████████| 199/199 [00:23<00:00,  8.58it/s]

Mean Squared Error: 0.08594679832458496

Classification Report for Cinematography:
              precision    recall  f1-score   support

          -1       0.99      1.00      0.99        68
           0       1.00      0.91      0.95       415
           1       0.89      1.00      0.94       310

    accuracy                           0.95       793
   macro avg       0.96      0.97      0.96       793
weighted avg       0.96      0.95      0.95       793

Confusion Matrix for Cinematography:
[[ 68   0   0]
 [  1 377  37]
 [  0   0 310]]

Classification Report for Direction:
              precision    recall  f1-score   support

          -1       0.81      0.99      0.89       137
           0       0.99      0.92      0.95       408
           1       0.99      1.00      0.99       248

    accuracy                           0.95       793
   macro avg       0.93      0.97      0.94       793
weighted avg       0.96      0.95      0.95       793

Confusion Matrix for Direction:
[[


