In [1]:
# Step1: upload files
from google.colab import files

uploaded = files.upload()

Saving test.jsonl to test.jsonl
Saving train.jsonl to train.jsonl
Saving val.jsonl to val.jsonl


# Task1

## TF-IDF

In [2]:
# Step 2: Load the .jsonl data file into a DataFrame
import pandas as pd
import json

# Read the .jsonl file into a DataFrame
def load_jsonl(filename):
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Load the data
train_df = load_jsonl("train.jsonl")
val_df = load_jsonl("val.jsonl")
test_df = load_jsonl("test.jsonl")

In [3]:
# Step 3: Data Preprocessing (Concatenate text & Encode labels)
# Concatenate text fields as model input
train_df['combined_text'] = train_df['postText'].apply(lambda x: ' '.join(x)) + ' ' + \
                            train_df['targetTitle'] + ' ' + \
                            train_df['targetParagraphs'].apply(lambda x: ' '.join(x))

# Use only the first label as the classification target
train_df['label'] = train_df['tags'].apply(lambda x: x[0])

In [4]:
# Step 4: TF-IDF + Random Forest Classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['label'])

# TF-IDF Feature Extraction
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
X = tfidf.fit_transform(train_df['combined_text'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Model Training
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Validation Set Prediction and Evaluation
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

       multi       0.75      0.05      0.10       112
     passage       0.44      0.58      0.50       255
      phrase       0.53      0.57      0.55       273

    accuracy                           0.48       640
   macro avg       0.57      0.40      0.38       640
weighted avg       0.53      0.48      0.45       640



In [5]:
# Step 5 (Optional): Predict on the test set & save the submission file

# Construct test set text
test_df['combined_text'] = test_df['postText'].apply(lambda x: ' '.join(x)) + ' ' + \
                           test_df['targetTitle'] + ' ' + \
                           test_df['targetParagraphs'].apply(lambda x: ' '.join(x))

X_test = tfidf.transform(test_df['combined_text'])
y_test_pred = clf.predict(X_test)
spoiler_type_pred = label_encoder.inverse_transform(y_test_pred)

# Save the output as a CSV file
output_df = pd.DataFrame({
    'id': test_df.index,
    'spoilerType': spoiler_type_pred
})

output_df.to_csv("task1_predictions.csv", index=False)
print("Save the prediction results as task1_predictions.csv")

Save the prediction results as task1_predictions.csv


## BERT

In [6]:
# Step 1: Install dependencies
!pip install -q transformers datasets accelerate
%pip install -q evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
# Step 2: Import libraries & read the JSONL data
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder

def load_jsonl(filename):
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Load the dataset
train_df = load_jsonl("train.jsonl")
val_df = load_jsonl("val.jsonl")
test_df = load_jsonl("test.jsonl")

# Combine text fields as input for BERT
def combine_text(df):
    return df['postText'].apply(lambda x: ' '.join(x)) + ' ' + df['targetTitle'] + ' ' + df['targetParagraphs'].apply(lambda x: ' '.join(x))

train_df["text"] = combine_text(train_df)
val_df["text"] = combine_text(val_df)
test_df["text"] = combine_text(test_df)

# Encode labels
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["tags"].apply(lambda x: x[0]))
val_df["label"] = label_encoder.transform(val_df["tags"].apply(lambda x: x[0]))

In [8]:
# Step 3：Prepare HuggingFace Dataset & Tokenizer
from datasets import Dataset
from transformers import AutoTokenizer
import numpy as np

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df[["text", "label"]])
val_dataset = Dataset.from_pandas(val_df[["text", "label"]])

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encoding function
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

# Execute encoding
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set format - explicitly handle potential numpy copy issues
def set_torch_format_with_copy_handling(dataset):
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    # Check if numpy version is causing issues and try to handle
    try:
        # This line might cause the ValueError, so we wrap it
        dataset[0]
    except ValueError as e:
        if "Unable to avoid copy while creating an array as requested" in str(e):
            print("NumPy version conflict detected. Attempting to set format allowing copies.")
            # Re-set format, potentially allowing copies by not using copy=False implicitly
            dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'], format_kwargs={'numpy_copy': True})
        else:
            raise e
    return dataset

train_dataset = set_torch_format_with_copy_handling(train_dataset)
val_dataset = set_torch_format_with_copy_handling(val_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [9]:
# Step 4: Define the model & training parameters
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import os
os.environ["WANDB_DISABLED"] = "true"

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy"
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [10]:
# Step 5: Define evaluation metrics and train the model

import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="macro")["f1"]
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8883,0.812944,0.6325,0.611378
2,0.682,0.777746,0.68,0.652796
3,0.3951,0.931364,0.6875,0.679121


TrainOutput(global_step=1200, training_loss=0.6940957268079122, metrics={'train_runtime': 215.3469, 'train_samples_per_second': 44.579, 'train_steps_per_second': 5.572, 'total_flos': 2525888810188800.0, 'train_loss': 0.6940957268079122, 'epoch': 3.0})

In [11]:
# Step 6: Predict on the test set and export CSV

# Tokenize the test dataset
test_dataset = Dataset.from_pandas(test_df[["text"]])
test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make predictions
preds = trainer.predict(test_dataset)
test_preds = np.argmax(preds.predictions, axis=-1)
test_labels = label_encoder.inverse_transform(test_preds)

# Save prediction results
submission = pd.DataFrame({"id": test_df.index, "spoilerType": test_labels})
submission.to_csv("task1_bert_submission.csv", index=False)
print("Prediction results have been saved as task1_bert_submission.csv")

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Prediction results have been saved as task1_bert_submission.csv


# Task2

In [12]:
import pandas as pd
import json

def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

train = pd.DataFrame(load_jsonl("train.jsonl"))
val = pd.DataFrame(load_jsonl("val.jsonl"))
test = pd.DataFrame(load_jsonl("test.jsonl"))

# Construct input text
def combine(row):
    return f"post: {' '.join(row['postText'])} title: {row['targetTitle']} paragraphs: {' '.join(row['targetParagraphs'])}"

train["input_text"] = train.apply(combine, axis=1)
val["input_text"] = val.apply(combine, axis=1)
test["input_text"] = test.apply(combine, axis=1)

# Construct target text
train["target_text"] = train["spoiler"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
val["target_text"] = val["spoiler"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

In [13]:
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=max_input_length, truncation=True, padding="max_length")
    labels = tokenizer(examples["target_text"], max_length=max_target_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = Dataset.from_pandas(train[["input_text", "target_text"]])
val_dataset = Dataset.from_pandas(val[["input_text", "target_text"]])
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [14]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"  # close wandb

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./t5_task2",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=3e-4,
    weight_decay=0.01,
    save_total_limit=1,
    fp16=True  # If you have a GPU available

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.4197,0.211923
2,0.1614,0.205878


Epoch,Training Loss,Validation Loss
1,0.4197,0.211923
2,0.1614,0.205878
3,0.1612,0.206708


TrainOutput(global_step=2400, training_loss=0.2252885087331136, metrics={'train_runtime': 214.2448, 'train_samples_per_second': 44.809, 'train_steps_per_second': 11.202, 'total_flos': 1299281294131200.0, 'train_loss': 0.2252885087331136, 'epoch': 3.0})

In [15]:
# Encode the test dataset
test_inputs = tokenizer(list(test["input_text"]), return_tensors="pt", padding=True, truncation=True, max_length=512)

# Generate in batches
device = "cuda" if model.device.type == "cuda" else "cpu"
model.to(device)
input_ids = test_inputs["input_ids"].to(device)
attention_mask = test_inputs["attention_mask"].to(device)

outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Save as CSV
submission = pd.DataFrame({
    "id": test.index,
    "spoiler": decoded_preds
})
submission.to_csv("task2_t5_submission.csv", index=False)
print("Saved as task2_t5_submission.csv")

Saved as task2_t5_submission.csv
