In [1]:
import optuna
import torch
import pandas as pd
import numpy as np

In [2]:
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score



In [3]:
# Both splits are stored as JSON Lines (one record per line), so they are
# read with the same options; the train split is read first.
train_df, test_df = (
    pd.read_json(path, lines=True) for path in ('train.jsonl', 'test.jsonl')
)

In [4]:
# Integer class id assigned to each lower-cased annotation string.
value_map = dict(positive=2, negative=0, neutral=1, unrelated=3)

# Normalise the annotation casing, then encode it as a class id, for both
# splits. The 'annotation' column is overwritten in place on each frame.
for frame in (train_df, test_df):
    frame['annotation'] = frame['annotation'].str.lower().map(value_map)

train_df.head(5)

Unnamed: 0,text,target_text,annotation
0,Do not stay here. They will rob you upon moveo...,They will rob you blind and send you the bill.,3
1,Love the place. Im disabled so Im always dropp...,Im disabled so Im always dropping batteries an...,3
2,Very friendly staff. Verg accommidating. Cha...,Changed our room to handicappedat the last min.,1
3,Truly one stop shopping. The could use some mo...,"The could use some more handicapped carts, and...",0
4,It was a very clean and accessible establishment,It was a very clean and accessible establishment,3


In [5]:
# Class names listed in ascending class-id order (0..3), used as
# target_names when reporting per-class metrics.
labels = ['negative', 'neutral', 'positive', 'unrelated']

# Model input is the 'target_text' column; the target is the encoded annotation.
X_train, y_train = train_df['target_text'], train_df['annotation']
X_test, y_test = test_df['target_text'], test_df['annotation']

### Model training and evaluation

In [6]:
# Pick the compute device for this session
def get_device():
    """Return the torch device to run on.

    Preference order is CUDA, then the MPS backend (only when this torch
    build exposes ``torch.backends.mps`` and reports it available), and
    finally the CPU.
    """
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device_name = "cuda"
    elif mps_backend is not None and mps_backend.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    return torch.device(device_name)

# Resolve the compute device once for the whole notebook and report it.
device = get_device()
print(f"Using device: {device}")

# Tokenizer loaded from the 'bert-base-uncased' checkpoint; kept as a
# module-level name because later cells tokenize text through it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Items are returned as CPU tensors. Device placement is deliberately left
    to the consumer (the Hugging Face ``Trainer`` moves each batch to its own
    device); returning accelerator tensors from ``__getitem__`` conflicts
    with pinned-memory and worker-based data loading.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences
        # labels: per-example integer class ids, aligned with encodings
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)


def data_processing(df, text_tokenizer=None):
    """Tokenize a labelled frame and wrap it as a torch dataset.

    Args:
        df: DataFrame with a 'target_text' column (the text to encode) and an
            'annotation' column (integer class ids).
        text_tokenizer: Optional callable invoked as
            ``text_tokenizer(list_of_str, truncation=True, padding=True)`` and
            returning a mapping of field name -> per-example sequences.
            Defaults to the module-level ``tokenizer``.

    Returns:
        A ``TextDataset`` whose items are dicts of CPU tensors: one entry per
        encoding field plus a ``'labels'`` entry of dtype ``torch.long``.

    Raises:
        ValueError: if 'annotation' contains missing values (for example an
            annotation string that had no entry in the label mapping).
    """
    if df['annotation'].isna().any():
        raise ValueError("'annotation' contains missing values; every row needs a class id")

    tokenize = tokenizer if text_tokenizer is None else text_tokenizer
    sequences = df['target_text'].tolist()
    labels = df['annotation'].tolist()
    # padding=True pads every sequence to the longest one in this frame.
    encodings = tokenize(sequences, truncation=True, padding=True)
    return TextDataset(encodings, labels)

Using device: mps




In [7]:
def compute_metrics(pred):
    """Score a prediction object for the Trainer.

    ``pred`` must expose ``label_ids`` (true class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with 'accuracy' plus support-weighted 'f1', 'precision'
    and 'recall'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # The fourth element returned by sklearn (support) is not used.
    weighted_precision, weighted_recall, weighted_f1 = precision_recall_fscore_support(
        y_true, y_hat, average='weighted'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

In [8]:
# num_labels must equal the number of classification labels (four here).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4).to(device)

# Best result seen so far across Optuna trials: the model object and its
# mean F1. Both are updated from inside training_objective().
global_best_model, global_best_f1 = None, 0

def training_objective(trial, save_dir="best_model"):
    """Optuna objective: mean 5-fold cross-validated F1 of a fine-tuned BERT.

    For the hyperparameters sampled from ``trial`` (number of epochs and
    train batch size) a *fresh* pretrained model is fine-tuned on each
    training fold and scored on the held-out fold. Re-using one model object
    across folds would let it train on rows that later serve as validation
    data (and carry weights over between trials), which inflates the score
    and makes trials incomparable.

    Args:
        trial: The ``optuna`` trial supplying the hyperparameters.
        save_dir: Directory the model is written to whenever this trial beats
            the best mean F1 seen so far. Pass ``None`` to skip saving.

    Returns:
        The mean of ``eval_f1`` over the folds; also stored on the trial as
        the user attribute ``'mean_f1'``.

    Side effects:
        Updates the module-level ``global_best_f1`` / ``global_best_model``
        when this trial is the best so far. The retained (and saved) model is
        the one trained on the last fold of that trial.
    """
    global global_best_model
    global global_best_f1

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=trial.suggest_int("num_train_epochs", 1, 5),
        per_device_train_batch_size=trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64]),
        per_device_eval_batch_size=64,
        # NOTE(review): 500 warmup steps may exceed the total number of
        # optimizer steps for small folds / few epochs, in which case the
        # learning rate never reaches its peak. Confirm against the dataset
        # size and consider a warmup ratio instead.
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    kf = KFold(n_splits=5)
    fold_f1_scores = []
    trainer = None

    for train_index, val_index in kf.split(train_df):
        train_fold = data_processing(train_df.iloc[train_index])
        val_fold = data_processing(train_df.iloc[val_index])

        # Start every fold from the pretrained checkpoint so no weights leak
        # from previous folds or trials. The Trainer places the model on its
        # own device, so no explicit .to(...) is needed here.
        fold_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

        trainer = Trainer(
            model=fold_model,
            args=training_args,
            train_dataset=train_fold,
            eval_dataset=val_fold,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()

        # Evaluate the model on the held-out fold
        eval_result = trainer.evaluate()
        fold_f1_scores.append(eval_result['eval_f1'])

    mean_f1 = sum(fold_f1_scores) / len(fold_f1_scores)
    if mean_f1 > global_best_f1:
        global_best_f1 = mean_f1
        # Keep the trained model itself. save_model() returns None, so its
        # result must not be assigned to global_best_model.
        global_best_model = trainer.model
        if save_dir is not None:
            trainer.save_model(save_dir)
    trial.set_user_attr('mean_f1', mean_f1)
    return mean_f1

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create a study object and optimize the objective function.
# The study maximises the value returned by training_objective (the mean
# cross-validated F1) over 15 trials.
study = optuna.create_study(direction='maximize')
study.optimize(training_objective, n_trials=15)

### Report model performance

In [9]:
# Read the saved best model.
# Expects a checkpoint directory "./best_model" to already exist on disk
# (e.g. one written with trainer.save_model("best_model") during the
# hyperparameter search); loading fails otherwise.
best_model = BertForSequenceClassification.from_pretrained("./best_model")

In [10]:
# Tokenize both splits with the same preprocessing used during training.
train_dataset, test_dataset = data_processing(train_df), data_processing(test_df)

# Wrap the reloaded model in a Trainer with default arguments; it is used
# for prediction only.
best_trainer = Trainer(model=best_model)

# Training split: predicted class = argmax over the per-class scores.
train_predictions = best_trainer.predict(train_dataset)
train_preds = train_predictions.predictions.argmax(axis=1)
acc_train = accuracy_score(y_train, train_preds)
print(f"train accuracy: {acc_train*100:.2f}%")

# Test split: same procedure, followed by the per-class report.
test_predictions = best_trainer.predict(test_dataset)
test_preds = test_predictions.predictions.argmax(axis=1)
acc_test = accuracy_score(y_test, test_preds)
print(f"test accuracy: {acc_test*100:.2f}%")

print(classification_report(y_test, test_preds, target_names=labels))

  0%|          | 0/284 [00:00<?, ?it/s]

train accuracy: 100.00%


  0%|          | 0/71 [00:00<?, ?it/s]

test accuracy: 80.99%
              precision    recall  f1-score   support

    negative       0.68      0.81      0.74       127
     neutral       0.44      0.35      0.39        23
    positive       0.88      0.78      0.82       129
   unrelated       0.88      0.86      0.87       289

    accuracy                           0.81       568
   macro avg       0.72      0.70      0.71       568
weighted avg       0.81      0.81      0.81       568



In [13]:
# Confusion matrix on the test split. Rows correspond to the true class and
# columns to the predicted class, in ascending class-id order
# (0=negative, 1=neutral, 2=positive, 3=unrelated).
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, test_preds))

[[103   5   2  17]
 [ 11   8   0   4]
 [ 15   0 100  14]
 [ 23   5  12 249]]


### Model application

In [None]:
class InferenceDataset(torch.utils.data.Dataset):
    """Label-free dataset over tokenizer encodings, for prediction only.

    ``encodings`` is any mapping of field name -> per-example sequences that
    contains an ``'input_ids'`` entry (a tokenizer's batch output or a plain
    dict). Each item is a dict with one tensor per field.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        # Key access works for plain dicts as well as tokenizer outputs,
        # whereas attribute access (.input_ids) only works for the latter.
        return len(self.encodings['input_ids'])

In [None]:
import os
from process_text import *

# Class id -> label name; the inverse of the encoding applied to the
# 'annotation' column before training.
mapping = {0: 'negative', 1: 'neutral', 2: 'positive', 3: 'unrelated'} 

def classify_text(df):
    """Classify the text based on the trained bert model.

    Args:
        df: DataFrame with a 'text' column holding the raw comments.

    Returns:
        A new DataFrame (the input is not modified) restricted to the rows
        whose processed text is neither missing nor empty, with two added
        columns: 'targeted_text' (the output of ``process_comment``) and
        'predicted_labels' (the predicted label name). If no row survives
        the filtering, an empty frame with both columns is returned.
    """
    # assign() builds a new frame, so the caller's chunk is left untouched.
    kept = df.assign(targeted_text=df['text'].apply(process_comment))
    kept = kept.dropna(subset=['targeted_text'])
    # Explicit copy: the column assignment below must not write to a slice.
    kept = kept[kept['targeted_text'] != ''].copy()

    if kept.empty:
        # Nothing to classify; skip the tokenizer and model entirely.
        kept['predicted_labels'] = pd.Series(dtype='object')
        return kept

    # Classify the processed text that was just computed and filtered on.
    sequences = kept['targeted_text'].astype(str).tolist()
    tokenized_sequences = tokenizer(sequences, truncation=True, padding=True)
    predictions = best_trainer.predict(InferenceDataset(tokenized_sequences))
    predicted_ids = np.argmax(predictions.predictions, axis=1)
    # Re-index by the surviving rows so ids line up after filtering, then
    # translate class ids to label names.
    kept['predicted_labels'] = pd.Series(predicted_ids, index=kept.index).map(mapping)
    return kept

In [None]:
# Apply the trained model to classify the comment for each file.
# Every CSV in `read_folder` is processed in chunks of 1000 rows and written
# to `save_folder` as <name>_classification.csv.

save_folder = 'accessible-review-classification'
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(save_folder, exist_ok=True)

read_folder = 'accessible-review'
read_files = sorted(file for file in os.listdir(read_folder) if file.endswith('.csv'))
print(read_files)

for read_file in read_files:
    # Strip only the trailing extension; splitting on '.csv' would truncate
    # names that contain '.csv' somewhere in the middle.
    save_file = read_file[:-len('.csv')] + "_classification.csv"
    read_filepath = os.path.join(read_folder, read_file)
    print(f'----- process {read_filepath}')
    save_filepath = os.path.join(save_folder, save_file)

    chunks = []
    # The context manager closes the file even if classification fails.
    with pd.read_csv(read_filepath, chunksize=1000) as reader:
        for chunk in reader:
            chunks.append(classify_text(chunk))

    if not chunks:
        # pd.concat raises on an empty list; a file without rows has
        # nothing to classify, so no output is written for it.
        continue

    df = pd.concat(chunks)
    df.to_csv(save_filepath, index=False)