In [11]:
import json
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np

In [12]:
# -------------------------- Config --------------------------
# Everything a reader might want to tune lives here; later cells only read these names.
MODEL_NAME = "bert-base-uncased"   # checkpoint name passed to from_pretrained()
MAX_LENGTH = 256                   # max_length handed to the tokenizer
BATCH_SIZE = 32                    # per-device batch size for training and evaluation
EPOCHS = 10                        # num_train_epochs
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42                          # shared by set_seed, the train/val split and TrainingArguments

# Seed once, up front, so the run is repeatable.
set_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# 1. Load data

In [13]:

JSON_FILE_PATH = "train.json"   # name of your JSON file

# Read the whole JSON file into memory in one go.
with open(JSON_FILE_PATH, 'r', encoding='utf-8') as train_file:
    data_list = json.load(train_file)

# Build the DataFrame and switch to the "text"/"label" column names that the
# tokenization and training cells below refer to.
df = pd.DataFrame(data_list).rename(columns={"reviews": "text", "sentiments": "label"})

# Quick sanity report: row count and class balance.
print(f"load {len(df):,} ")
print("Label distribution:")
print(df["label"].value_counts())

load 7,401 
Label distribution:
label
1    6319
0    1082
Name: count, dtype: int64


In [14]:
# Hold out 10% of the rows for validation; stratify on the label so both splits
# keep the same class ratio as the full (imbalanced) data set.
train_df, val_df = train_test_split(
    df, test_size=0.1, random_state=SEED, stratify=df["label"]
)

# After the shuffled split the pandas index is no longer a plain RangeIndex, so
# by default from_pandas would carry it along as an extra "__index_level_0__"
# column. preserve_index=False keeps the datasets limited to the real columns.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)

# 2. Tokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess(examples):
    """Tokenize a batch of examples for the model.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw review strings.

    Returns:
        The tokenizer output for ``examples["text"]``, with every sequence
        truncated and padded to ``MAX_LENGTH`` tokens.
    """
    encode_options = dict(
        truncation=True,
        padding="max_length",   # fixed-length padding rather than per-batch padding
        max_length=MAX_LENGTH,
    )
    return tokenizer(examples["text"], **encode_options)

# Tokenize both splits in batches; the raw "text" column is dropped once encoded.
tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_val = val_dataset.map(preprocess, batched=True, remove_columns=["text"])

# Expose only the columns the model consumes, returned as torch tensors.
for tokenized_split in (tokenized_train, tokenized_val):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████| 6660/6660 [00:01<00:00, 5128.55 examples/s]
Map: 100%|██████████| 741/741 [00:00<00:00, 5392.81 examples/s]


# 3. Model

In [16]:
# Pretrained encoder with a two-way sequence-classification head (labels 0 and 1).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 4. Metrics

In [17]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute per-class and weighted classification metrics for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row
            is the argmax of its logits along axis 1.

    Returns:
        dict: precision/recall/F1 for class 0 and class 1 (``*_0`` / ``*_1``),
        their weighted averages (``*_weighted``) and ``accuracy``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    metric_names = ("precision", "recall", "f1")

    # Per-class scores: each returned array is ordered as labels=[0, 1].
    per_class = precision_recall_fscore_support(
        labels, preds, average=None, labels=[0, 1], zero_division=0
    )
    # Weighted average across the classes.
    weighted = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )

    results = {}
    for class_id in (0, 1):
        # zip() stops after the three named metrics, so the trailing support
        # entry of the result tuple is ignored.
        for metric, scores in zip(metric_names, per_class):
            results[f"{metric}_{class_id}"] = scores[class_id]
    for metric, score in zip(metric_names, weighted):
        results[f"{metric}_weighted"] = score
    results["accuracy"] = accuracy_score(labels, preds)
    return results

# 5. Trainer

In [18]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the best weighted F1 once training finishes.
training_args = TrainingArguments(
    output_dir="./bert-6483task-finetuned",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",                   # must match the evaluation strategy for load_best_model_at_end
    # Without a limit every epoch leaves a full checkpoint in output_dir
    # (EPOCHS of them). Keep only the most recent ones; the best checkpoint is
    # still retained because load_best_model_at_end is enabled.
    save_total_limit=2,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",     # key produced by compute_metrics
    greater_is_better=True,
    logging_steps=100,
    fp16=torch.cuda.is_available(),          # mixed precision
    dataloader_num_workers=4,
    report_to="none",                        # set to "wandb" if you use it
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # NOTE(review): newer transformers releases prefer `processing_class=` over
    # `tokenizer=` -- confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


# 6. Train

In [19]:
import os
import warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"                  # combat parallelism warning
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"            # combat symlink warning (if any)

# Silence warnings only while training runs. A bare warnings.filterwarnings("ignore")
# would stay active for the rest of the kernel session and also hide warnings
# raised by the saving / inference cells further down.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")                            # quietest: ignore everything during training
    # If you want to keep some key warnings, replace the blanket filter above
    # with the following two targeted ones:
    # warnings.filterwarnings("ignore", message=".*tokenizers.*parallelism.*")
    # warnings.filterwarnings("ignore", message=".*gather along dimension 0.*")
    train_output = trainer.train()

# Last expression of the cell, so the notebook still displays the TrainOutput summary.
train_output


Epoch,Training Loss,Validation Loss,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1,Precision Weighted,Recall Weighted,F1 Weighted,Accuracy
1,No log,0.177033,0.796117,0.759259,0.777251,0.959248,0.966825,0.963021,0.935471,0.936572,0.935945,0.936572
2,0.215200,0.160278,0.732283,0.861111,0.791489,0.97557,0.946288,0.960706,0.940111,0.933873,0.936043,0.933873
3,0.215200,0.16163,0.842593,0.842593,0.842593,0.973144,0.973144,0.973144,0.954116,0.954116,0.954116,0.954116
4,0.072600,0.171255,0.80531,0.842593,0.823529,0.97293,0.965245,0.969072,0.948499,0.947368,0.947859,0.947368
5,0.072600,0.1781,0.849057,0.833333,0.841121,0.971654,0.974724,0.973186,0.953785,0.954116,0.953938,0.954116
6,0.032800,0.218306,0.852941,0.805556,0.828571,0.967136,0.976303,0.971698,0.950492,0.951417,0.950838,0.951417
7,0.032800,0.25158,0.87,0.805556,0.836538,0.967239,0.979463,0.973312,0.953066,0.954116,0.953378,0.954116
8,0.016100,0.28003,0.872549,0.824074,0.847619,0.970266,0.979463,0.974843,0.956024,0.956815,0.9563,0.956815
9,0.016100,0.297337,0.87,0.805556,0.836538,0.967239,0.979463,0.973312,0.953066,0.954116,0.953378,0.954116
10,0.015400,0.306632,0.872549,0.824074,0.847619,0.970266,0.979463,0.974843,0.956024,0.956815,0.9563,0.956815


TrainOutput(global_step=530, training_loss=0.06721083358773645, metrics={'train_runtime': 768.1923, 'train_samples_per_second': 86.697, 'train_steps_per_second': 0.69, 'total_flos': 8761598143488000.0, 'train_loss': 0.06721083358773645, 'epoch': 10.0})

In [20]:
# -------------------------- Save final model --------------------------
# Write the model and the tokenizer into the same directory; the inference
# section below reloads both from this path.
final_dir = "./bert-6483task-final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

print("Training complete! Model saved to ./bert-6483task-final")


Training complete! Model saved to ./bert-6483task-final


# 7. Load model

In [21]:

import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# 1. Load the trained model and tokenizer
MODEL_PATH = "./bert-6483task-final"      # Directory where the model was saved
TEST_JSON_PATH = "test.json"          # ←←← change this to your test file path

print("Loading model, please wait...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Move the model to the GPU when available and switch it to evaluation mode.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device).eval()
print(f"Model successfully loaded to {device}")



Loading model, please wait...
Model successfully loaded to cuda


# 8. Predict on unlabeled test set and generate submission.csv 

In [22]:
# read test set (standard JSON array format)
with open(TEST_JSON_PATH, 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

test_df = pd.DataFrame(test_data)
print(f"Test set contains {len(test_df):,} samples")

# Ensure the model is on the correct device and in evaluation mode
model.to(device).eval()

# Batch prediction (much faster)
def predict_batch(texts):
    """Predict a class index for every text in ``texts``.

    The texts are tokenized together (truncated to ``MAX_LENGTH`` and padded as
    one batch), moved to ``device`` and run through ``model`` without gradient
    tracking.

    Args:
        texts: List of raw review strings.

    Returns:
        NumPy array holding the argmax class index for each input text.
    """
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH
    )
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        class_probs = torch.softmax(model(**on_device).logits, dim=-1)
        predicted = class_probs.argmax(dim=-1)
    return predicted.cpu().numpy()

# Predict in batches to avoid running out of GPU memory
batch_size = 64
predictions = []

for i in tqdm(range(0, len(test_df), batch_size), desc="Predicting"):
    batch_texts = test_df["reviews"].iloc[i:i+batch_size].tolist()
    batch_preds = predict_batch(batch_texts)
    predictions.extend(batch_preds)

# add predictions to DataFrame
test_df["sentiments"] = predictions

# save submission.csv

# format1: original with reviews + sentiments
# submission = test_df[["reviews", "sentiments"]]
# submission.to_csv("submission.csv", index=False, encoding="utf-8")
# print("generated submission.csv(original with predictions)")
# print(submission.head(10))

# format2: predictions only, as a single "sentiments" column.
# index=False is required for that: by default to_csv also writes the row index
# as an extra, unnamed first column, so the file would not contain "only predictions".
# NOTE(review): if the submission format needs a row-id column, add it
# explicitly as a named column instead of relying on the implicit index.
test_df["sentiments"].to_csv("submission.csv", header=["sentiments"], index=False, encoding="utf-8")
print("generated submission.csv(only predictions)")



Test set contains 1,851 samples


Predicting: 100%|██████████| 29/29 [00:24<00:00,  1.18it/s]

generated submission.csv(only predictions)





In [23]:
# # -------------------------- 8. Quick inference --------------------------
# def predict(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH)
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     model.to(device)
#     with torch.no_grad():
#         logits = model(**inputs).logits
#         prob = torch.nn.functional.softmax(logits, dim=-1)
#     return {"negative": prob[0][0].item(), "positive": prob[0][1].item()}

# # Test
# print(predict("This product is really great!"))
# print(predict("I regret buying this, total waste of money."))