In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, median_absolute_error, accuracy_score, confusion_matrix
from transformers import BertTokenizer
from sklearn.preprocessing import StandardScaler

In [None]:

# Read the issue data, discard rows estimated at 13 story points, then
# remove every row that still has a missing value in any column.
dataset = (
    pd.read_csv("./zzzzold/zzzz1.csv")
    .loc[lambda frame: frame["storypoint"] != 13]
    .dropna()
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(dataset, random_state=42, test_size=0.2)

# Pretrained uncased BERT vocabulary used to turn text into token ids
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad the input data
def tokenize_data(data, max_length=512, bert_tokenizer=None):
    """Tokenize the concatenated title and description of every row.

    The text of each row is ``title + ' ' + description``. All texts are
    encoded in a single batched tokenizer call with special tokens added,
    truncated to ``max_length`` and padded to exactly ``max_length`` tokens.

    Args:
        data: DataFrame providing string columns 'title' and 'description'.
        max_length: Fixed sequence length requested for every encoded row.
        bert_tokenizer: Callable tokenizer to use. When omitted, the
            notebook-level ``tokenizer`` is used, as before.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors with one row per
        row of ``data``. For an empty ``data`` both tensors have shape
        ``(0, max_length)``.
    """
    if bert_tokenizer is None:
        bert_tokenizer = tokenizer

    # Plain string concatenation (not str()) so that a non-string cell still
    # fails loudly instead of being tokenized as the text "nan".
    texts = [
        title + ' ' + description
        for title, description in zip(data['title'], data['description'])
    ]

    # Nothing to encode: return correctly shaped empty tensors instead of
    # handing an empty batch to the tokenizer.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    # One batched call replaces the per-row encode_plus loop (encode_plus is
    # deprecated in favour of calling the tokenizer directly).
    encoded = bert_tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Get tokenized training and testing data
train_input_ids, train_attention_masks = tokenize_data(train_data)
test_input_ids, test_attention_masks = tokenize_data(test_data)

# Each allowed story-point value becomes a contiguous class index (0..4)
storypoint_mapping = {
    1: 0,
    2: 1,
    3: 2,
    5: 3,
    8: 4
}


def _encode_storypoints(frame, mapping):
    """Return a copy of ``frame`` whose 'storypoint' column holds class indices.

    Args:
        frame: DataFrame with a 'storypoint' column.
        mapping: Dict from raw story-point value to class index.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.

    Raises:
        ValueError: If any 'storypoint' value has no entry in ``mapping``.
            Without this check such values turn into NaN labels and only fail
            later, inside model fitting. It also catches some accidental
            re-runs of this cell on already-encoded labels (e.g. a 0 or 4).
    """
    encoded = frame['storypoint'].map(mapping)
    unmapped = frame.loc[encoded.isna(), 'storypoint'].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Unmapped storypoint values {unmapped}; "
            f"expected only {sorted(mapping)}. "
            "Reload the data if this cell has already been run."
        )
    # assign() builds a new frame, so no column is written into a slice of the
    # original dataset (which can trigger pandas' SettingWithCopyWarning).
    return frame.assign(storypoint=encoded.astype('int64'))


train_data = _encode_storypoints(train_data, storypoint_mapping)
test_data = _encode_storypoints(test_data, storypoint_mapping)

# Convert the encoded story points to numpy arrays for scikit-learn
train_labels = train_data['storypoint'].to_numpy()
test_labels = test_data['storypoint'].to_numpy()

In [None]:
# Learn per-column scaling statistics on the training split only, then apply
# the same transformation to both splits.
# NOTE(review): the features being scaled are the token-id tensors produced by
# tokenize_data; the attention masks are not used in the visible cells.
# Confirm that raw token ids are the intended model input.
scaler = StandardScaler().fit(train_input_ids)
train_input_ids = scaler.transform(train_input_ids)
test_input_ids = scaler.transform(test_input_ids)

# Fit an RBF-kernel SVM on the scaled training features and predict the test split
svm_model = SVC(C=5000, gamma=1, kernel='rbf').fit(train_input_ids, train_labels)
test_predictions = svm_model.predict(test_input_ids)

In [None]:
# Calculate evaluation metrics.
# MAE / MdAE are computed on the values held in test_labels and
# test_predictions as given (encoded class indices, not raw story points).
class_labels = list(range(5))

mae = mean_absolute_error(test_labels, test_predictions)
mdae = median_absolute_error(test_labels, test_predictions)
accuracy = accuracy_score(test_labels, test_predictions)
# Pin the label set so the matrix is always 5x5. Without `labels`, a class that
# is absent from both the true labels and the predictions is left out of the
# matrix and the per-class loop below would index out of bounds.
conf_matrix = confusion_matrix(test_labels, test_predictions, labels=class_labels)

# Display the results
print(f'Test MAE: {mae}')
print(f'Test MdAE: {mdae}')
print(f'Test Overall Accuracy: {accuracy}')

# Calculate class-wise accuracy: the fraction of samples whose true class is i
# that were predicted as i (row i of the confusion matrix).
class_wise_accuracy = {}
for i in class_labels:
    correct_predictions = conf_matrix[i, i]
    total_samples = conf_matrix[i, :].sum()
    # A class with no true samples has no defined accuracy; report NaN
    # explicitly instead of triggering a 0/0 RuntimeWarning.
    if total_samples:
        class_wise_accuracy[i] = correct_predictions / total_samples
    else:
        class_wise_accuracy[i] = float('nan')

print('Test Class-wise Accuracy:')
for key, value in class_wise_accuracy.items():
    print(f'Class {key}: {value}')

print(f'Confusion Matrix:\n{conf_matrix}')