In [36]:
import os
import torch
import glob
import numpy as np
import xml.etree.ElementTree as ET
from transformers import BertTokenizer, BertForSequenceClassification 
from torch.utils.data import DataLoader, TensorDataset
import re

In [3]:
# Load the tokenizer and the classifier from the same pretrained checkpoint.
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# num_labels=3 -> one logit per sentiment label. The classification head is
# newly initialised (see the warning in this cell's output), so the model has
# to be fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
# Load and preprocess the dataset
def parse_xml_dat_file(file_path):
    """Parse an aspect-sentiment file laid out as three lines per record.

    Each record is read from three consecutive lines:
      1. the sentence, from which the ``$T$`` placeholder is removed;
      2. the aspect term;
      3. the sentiment label.

    NOTE(review): the output printed at the end of this notebook shows whole
    ``[B-ASP]...[E-ASP] ... $LABEL$ ...`` lines as "aspects", which suggests
    the actual data files may use a one-record-per-line layout instead.
    Confirm the file format before relying on this parser.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A tuple ``(texts, aspects, labels)`` of three equally long lists of
        whitespace-stripped strings. Labels are returned as raw strings, not
        converted to integers. Trailing lines that do not form a complete
        three-line record are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    texts = []
    aspects = []
    labels = []

    # Stop at the last complete record: `len(lines) - 2` guarantees that
    # lines[i + 2] exists, so a dangling trailing line (or two) cannot raise
    # IndexError or produce a half-filled record.
    for i in range(0, len(lines) - 2, 3):
        text = lines[i].replace("$T$", "").strip()  # Remove "$T$" and whitespace
        aspect = lines[i + 1].strip()
        # The label is the THIRD line of the record (previously read from
        # lines[i + 1], which made every label a copy of the aspect).
        sentiment = lines[i + 2].strip()

        texts.append(text)
        aspects.append(aspect)
        labels.append(sentiment)

    return texts, aspects, labels
    

In [51]:
# Automatically find .xml.dat files in the Downloads directory
downloads_dir = os.path.expanduser("~/Downloads")
file_pattern = "*.xml.dat"
# glob gives no ordering guarantee (it depends on the file system); sort the
# matches so that any "first match" lookup over file_paths is deterministic.
file_paths = sorted(glob.glob(os.path.join(downloads_dir, file_pattern)))

In [52]:
# Match on the file name only: the directory part of the path (for example a
# user name) could itself contain "train" or "test" and match every file.
train_file_path = next(
    (path for path in file_paths if "train" in os.path.basename(path).lower()),
    None,
)
test_file_path = next(
    (path for path in file_paths if "test" in os.path.basename(path).lower()),
    None,
)
# Fail with a descriptive error instead of a bare StopIteration.
if train_file_path is None or test_file_path is None:
    raise FileNotFoundError(
        "Expected both a 'train' and a 'test' *.xml.dat file among: "
        f"{file_paths}"
    )

In [53]:
# Match on the file name only so that a directory named e.g. "inference"
# does not make every file in it qualify.
inference_file_path = next(
    (path for path in file_paths if "inference" in os.path.basename(path).lower()),
    None,
)
# Fail with a descriptive error instead of a bare StopIteration.
if inference_file_path is None:
    raise FileNotFoundError(
        f"Expected an 'inference' *.xml.dat file among: {file_paths}"
    )

In [54]:
# Read the training split, then the test split; each call returns the
# (texts, aspects, labels) lists for that file.
(train_texts, train_aspects, train_labels) = parse_xml_dat_file(file_path=train_file_path)
(test_texts, test_aspects, test_labels) = parse_xml_dat_file(file_path=test_file_path)

In [55]:
# Sentiment predictions, written here as a hard-coded list: no model
# inference call is visible in this part of the notebook.
# The printing cell renders 0 as "Negative", 1 as "Neutral" and any other
# value as "Positive".
# NOTE(review): the original note said these were "replaced with my actual
# predictions" -- confirm whether the values came from a real model run.
# TODO: compute this list from the fine-tuned model instead of hard-coding it.
predicted_sentiments = [2, 0, 2]

In [62]:
# Parse the inference file; the texts and aspects are kept, the third
# returned list (labels) is discarded into `_`.
(inference_texts, inference_aspects, _) = parse_xml_dat_file(file_path=inference_file_path)

In [63]:
# Print one line per (aspect, predicted sentiment) pair. zip() stops at the
# shorter of the two lists, so unmatched entries are not reported.
for aspect, sentiment in zip(inference_aspects, predicted_sentiments):
    if sentiment == 0:
        sentiment_str = "Negative"
    elif sentiment == 1:
        sentiment_str = "Neutral"
    else:
        # Every value other than 0 or 1 is rendered as "Positive".
        sentiment_str = "Positive"
    print(f"Aspect: {aspect}, Sentiment: {sentiment_str}")

Aspect: [B-ASP]Ortam[E-ASP] cok guzel. $LABEL$ Positive, Sentiment: Positive
Aspect: [B-ASP]Sushi ustasi[E-ASP] cok becerikli ve şakaci : $LABEL$ Positive, Sentiment: Negative
Aspect: [B-ASP]Fiyatları[E-ASP] normalin biraz üstünde ama yinede caddedeki yorgunluğu atmanız için ferah ferah oturabileceğiniz mekan. $LABEL$ Negative, Sentiment: Positive
