In [1]:
import optuna
import torch
import pandas as pd
import numpy as np

In [2]:
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score

In [3]:
df = pd.read_csv('samples.csv')

df['label'] = df['label'].replace({
    'negative': 0, 
    'neutral': 1, 
    'positive': 2, 
    'unrelated': 3,})

print(len(df))
df.head(5)

2000


Unnamed: 0,text,label
0,Lots of parking. Lots nice hotels in the area ...,2
1,The one thing I want to caution is bring wood ...,3
2,"Very little and very expensive parking, and th...",0
3,This property ruined my birthday weekend with ...,0
4,We returned to the parking area and sat on som...,3


In [4]:
labels = ['negative', 'neutral', 'positive', 'unrelated']

train_df = df.iloc[:1600]
test_df = df.iloc[1600:]

### Model training and evaluation

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def data_processing(df):
    """Process the data for training and testing based on bert tokenizer"""
    sequences = df['text'].tolist()
    labels = df['label'].tolist()
    encodings = tokenizer(sequences, truncation=True, padding=True)

    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['label'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    return TextDataset(encodings, labels)

In [6]:
def compute_metrics(pred):
    """Compute the metrics for the model"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
# adjust the number based on classification labels
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5) 

global_best_model = None
global_best_f1 = 0

def training_objective(trial):
    """Define the training process using the bert model"""
    global global_best_model
    global global_best_f1
    
    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',          
        num_train_epochs=trial.suggest_int("num_train_epochs",1,5), 
        per_device_train_batch_size=trial.suggest_categorical("per_device_train_batch_size", [16,32,64]),  
        per_device_eval_batch_size=64,  
        warmup_steps=500,                
        weight_decay=0.01,               
        #logging_dir='./logs',  
    )
    
    total_f1 = 0
    kf = KFold(n_splits=5)

    for fold, (train_index, val_index) in enumerate(kf.split(train_df)):
        train_fold = data_processing(train_df.iloc[train_index])
        val_fold = data_processing(train_df.iloc[val_index])

        trainer = Trainer(
            model=model, 
            args=training_args,
            train_dataset=train_fold,
            eval_dataset=val_fold,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()

        # Evaluate the model
        eval_result = trainer.evaluate()
        total_f1 += eval_result['eval_f1']

    mean_f1 = total_f1 / kf.get_n_splits()
    if mean_f1 > global_best_f1:
        global_best_f1 = mean_f1
        global_best_model = model
        # Uncomment the following line to save the best model
        #global_best_model = trainer.save_model("best_model") 
    trial.set_user_attr('mean_f1', mean_f1)
    return mean_f1

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(training_objective, n_trials=15)

### Report model performance

In [8]:
# Read the saved best model
best_model = BertForSequenceClassification.from_pretrained("./best_model")

In [9]:
train_dataset = data_processing(df[:1600])
test_dataset = data_processing(df[1600:])

# Define a new trainer with the best model
best_trainer = Trainer(model=best_model)

# Get predictions on training set
train_predictions = best_trainer.predict(train_dataset)
train_preds = np.argmax(train_predictions.predictions, axis=1)
acc_train = accuracy_score(train_dataset.labels, train_preds)

print(f"train accuracy: {acc_train*100:.2f}%")

# Get predictions on testing set
test_predictions = best_trainer.predict(test_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)
acc_test = accuracy_score(test_dataset.labels, test_preds)
print(f"test accuracy: {acc_test*100:.2f}%")

print(classification_report(test_dataset.labels, test_preds, target_names=labels))

  0%|          | 0/200 [00:00<?, ?it/s]

train accuracy: 99.12%


  0%|          | 0/50 [00:00<?, ?it/s]

test accuracy: 90.00%
              precision    recall  f1-score   support

    negative       0.89      0.96      0.92       165
     neutral       0.71      0.53      0.61        19
    positive       0.93      0.93      0.93       190
   unrelated       0.88      0.54      0.67        26

    accuracy                           0.90       400
   macro avg       0.85      0.74      0.78       400
weighted avg       0.90      0.90      0.89       400



### Model application

In [None]:
class InferenceDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
import os
from process_text import *

mapping = {0: 'negative', 1: 'neutral', 2: 'positive', 3: 'unrelated'} 

def classify_text(df):
    """Classify the text based on the trained bert model"""
    df['parking_text'] = df['text'].apply(process_comment)
    df = df.dropna(subset=['parking_text'])
    df = df[df['parking_text'] != '']
    sequences = df['parking_text'].astype(str).tolist()
    tokenized_sequences = tokenizer(sequences, truncation=True, padding=True)
    new_dataset = InferenceDataset(tokenized_sequences)
    predictions = best_trainer.predict(new_dataset)
    predicted_labels = np.argmax(predictions.predictions, axis=1)
    df['predicted_labels'] = predicted_labels
    df['predicted_labels'] = df['predicted_labels'].replace(mapping)
    return df

In [None]:
# Apply the trained model to classify the comment for each file

save_folder = 'parking-pos-review-classification'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    
read_folder = 'parking-pos-review'
read_files = [file for file in os.listdir(read_folder) if file.endswith('.csv')]
read_files = sorted(read_files)
print(read_files)
    
for read_file in read_files:
    save_file = read_file.split('.csv')[0] + "_classification.csv"
    read_filepath = os.path.join(read_folder, read_file)
    print(f'----- process {read_filepath}')
    save_filepath = os.path.join(save_folder, save_file)
    reader = pd.read_csv(read_filepath, chunksize=1000)
    chunks = []
    for chunk in reader:
        chunk = classify_text(chunk)
        chunks.append(chunk)
    df = pd.concat(chunks)
    df.to_csv(save_filepath, index=False)