In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every
# file found, one per line, in the order os.walk yields them.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hope/transformers/default/1/spm.model
/kaggle/input/hope/transformers/default/1/config.json
/kaggle/input/hope/transformers/default/1/tokenizer.json
/kaggle/input/hope/transformers/default/1/tokenizer_config.json
/kaggle/input/hope/transformers/default/1/model.safetensors
/kaggle/input/hope/transformers/default/1/special_tokens_map.json
/kaggle/input/hope/transformers/default/1/added_tokens.json
/kaggle/input/en-test-cleaned/cleaned_en_test.csv
/kaggle/input/test-phase/en_test_without_labels.csv
/kaggle/input/test-phase/es_test_without_labels.csv
/kaggle/input/0.72-multiclass/transformers/default/1/config.json
/kaggle/input/0.72-multiclass/transformers/default/1/tokenizer.json
/kaggle/input/0.72-multiclass/transformers/default/1/tokenizer_config.json
/kaggle/input/0.72-multiclass/transformers/default/1/model.safetensors
/kaggle/input/0.72-multiclass/transformers/default/1/special_tokens_map.json
/kaggle/input/0.72-multiclass/transformers/default/1/vocab.txt
/kaggle/input/

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# Location of the checkpoint whose tokenizer files are loaded below.
# NOTE(review): the value is a placeholder string; replace it with the real
# model directory before running.
model_name = r"model folder path"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

class TestDataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a DataFrame.

    Each item is a dict with ``input_ids`` and ``attention_mask`` tensors of
    length ``max_length``. When the frame has a ``label`` column, the item
    also carries a ``label`` tensor of dtype ``torch.long``.

    Args:
        dataframe: Frame with a ``text`` column and, optionally, a ``label``
            column.
        tokenizer: Callable tokenizer; it is invoked with the text plus the
            ``add_special_tokens``, ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and must
            return a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded and truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = dataframe

        self.text = dataframe["text"]
        # Unlabelled test files have no "label" column; items then omit it.
        self.targets = dataframe["label"] if "label" in dataframe.columns else None

    def __len__(self):
        """Return the number of rows in the ``text`` column."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (positional) and return its tensors."""
        # Positional access: the sampler hands out 0..len-1, which only
        # matches label-based lookup when the frame has a default RangeIndex.
        text = str(self.text.iloc[idx]).strip()

        # Tokenization. truncation=True is required alongside
        # padding="max_length": padding alone never shortens a sequence, so
        # over-long texts would otherwise produce tensors of different
        # lengths that cannot be stacked into a batch.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # return_tensors="pt" yields a leading batch axis of size 1; drop it
        # so the DataLoader can add its own batch dimension.
        item = {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }

        if self.targets is not None:
            item["label"] = torch.tensor(self.targets.iloc[idx], dtype=torch.long)

        return item

In [4]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Load the checkpoint and its tokenizer from the same folder.
# NOTE(review): the path is a placeholder string; replace it before running.
model_path = "model folder path"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# num_labels=2 pairs with the binary `label_map_` used to decode predictions.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 2)
# Use the GPU when one is present, otherwise fall back to CPU so the cell
# still runs in a session without an accelerator.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 32

# test file path (placeholder)
test_file = r'test data path'
test_df = pd.read_csv(test_file)
test_dataset = TestDataset(test_df, tokenizer)
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.to(device)
model.eval()

# multiclass
label_map = {
    0: "Generalized Hope",
    1: "Not Hope",
    2: "Realistic Hope",
    3: "Sarcasm",
    4: "Unrealistic Hope"
}

# binary
label_map_ = {
    0: "Not Hope",
    1: "Hope"
}

test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Keyword arguments avoid depending on the positional parameter
        # order of the particular model class that was loaded.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted = torch.max(outputs.logits, dim=1).indices

        # Row identifiers run text1..textN across the whole file. They are
        # derived from how many rows were already labelled, not from the
        # batch counter, so consecutive batches never reuse an identifier.
        first_id = len(test_labels) + 1
        rows_in_batch = input_ids.size(0)
        test_labels.extend(f"text{j}" for j in range(first_id, first_id + rows_in_batch))

        test_predictions.extend(predicted.cpu().tolist())

# Decode class indices with the binary map (matches num_labels=2 above).
y_pred = [label_map_[label] for label in test_predictions]

df = pd.DataFrame({"Text": test_labels, "Tag": y_pred})



In [5]:
# Destination CSV for the predictions (placeholder; replace before running).
# This must be the same path the zip step reads. The previous literal had a
# doubled space, so the archive step looked for a file that was never written.
save_path = "save predicted path"
df.to_csv(save_path, index=False)


In [6]:
import zipfile

# Source CSV and destination archive (both placeholders; replace before running).
save_path = "save predicted path"
zip_path = "zip predicted file path"

# Store the CSV in a deflate-compressed archive under the given member name.
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(filename=save_path, arcname="prediction file name")
