# Pipeline 3: Text Classification with plain transformers
- ref: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
# import package
import numpy as np
import pandas as pd
import torch
import evaluate
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.model_selection import train_test_split

## Dataset

In [None]:
# Read the tab-separated train and test splits.
csv_options = dict(sep='\t', encoding='utf-8')
train_df = pd.read_csv('dataset/train.csv', **csv_options)
test_df = pd.read_csv('dataset/test.csv', **csv_options)

# Show the size and the first rows of each split.
print(f"Training data shape: {train_df.shape}")
print(train_df.head())
print(f"Testing data shape: {test_df.shape}")
print(test_df.head())   # the test split has no labels

In [None]:
# EDA

# Count missing values per column.
print(train_df.isnull().sum())
# List the distinct label values present in the raw data.
print(train_df['label'].unique())
# Show the rows whose label is the literal string 'label'
# (presumably a header line repeated inside the file -- TODO confirm).
print(train_df[train_df['label'] == 'label'])

# Drop those rows. The explicit .copy() makes train_df an independent frame,
# so the column assignment below does not write into a filtered slice of the
# original frame (which would raise pandas' SettingWithCopyWarning).
train_df = train_df[train_df['label'] != 'label'].copy()

# Store the labels as integers.
train_df['label'] = train_df['label'].astype(int)

In [None]:
# Hold out 20% of train_df as a validation split (seed fixed at 42).
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

# Report the size of both splits, then preview the first rows of each.
print(train_df.shape, val_df.shape, sep='\n')
print(train_df.head(), val_df.head(), sep='\n')

## Tokenizer
At this stage, we tokenize the text data into input IDs and attention masks that are later fed into the model.

We choose the `distilbert-base-uncased` model on Hugging Face for this task.

In [None]:
# Fetch the tokenizer that belongs to the DistilBERT checkpoint.
checkpoint_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
# tokenize the text
# Truncate every text to at most 512 tokens, but do not pad here: padding a
# whole split up front stretches every row to the longest text in that split.
# The DataCollatorWithPadding handed to the Trainer pads each batch instead.
tokenize_options = {"max_length": 512, "truncation": True}
train_encodings = tokenizer(train_df['text'].tolist(), **tokenize_options)
val_encodings = tokenizer(val_df['text'].tolist(), **tokenize_options)
test_encodings = tokenizer(test_df['text'].tolist(), **tokenize_options)
print(train_encodings)
print(val_encodings)
print(test_encodings)

In [None]:
# The tokenizer output is keyed by field name; show which fields it holds
# (the input IDs and the attention masks).
encoding_fields = train_encodings.keys()
print(encoding_fields)

In [None]:
# Print the token-ID sequence stored under 'input_ids' for every training text.
# These are the tokenizer's output IDs, not embedding vectors.
for token_ids in train_encodings['input_ids']:
    print(token_ids)

# The attention masks (train_encodings['attention_mask']) are not printed here.

# Finetune the model

In [None]:
# customize dataset
class KDDDataset(Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-sample sequence of values; read as ``encodings[key][idx]``.
        labels: One label per sample, or ``None`` for unlabelled data such
            as a test split. When given, every item also carries a
            ``labels`` tensor.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict mapping field name to tensor."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # Only labelled datasets expose a 'labels' entry.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, fall back to the length of the first encoding
        # field (0 when there are no fields at all).
        return len(next(iter(self.encodings.values()), ()))

# Pull the label column of each split out as a plain Python list.
train_labels, val_labels = (
    frame['label'].tolist() for frame in (train_df, val_df)
)

# Pair each split's encodings with its labels.
train_dataset, val_dataset = (
    KDDDataset(train_encodings, train_labels),
    KDDDataset(val_encodings, val_labels),
)

In [None]:
# Collator built around the tokenizer; handed to the Trainer below as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the "accuracy" metric; compute_metrics below calls accuracy.compute(...).
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with an arg-max along axis 1 (presumably one row of
            per-class scores per sample); ``labels`` are the references.

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions
        against the reference labels.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [None]:
# Class id -> human-readable name, plus the inverse lookup derived from it.
id2label = {0: "Real", 1: "Fake"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Load the DistilBERT checkpoint for sequence classification with two labels,
# attaching the id <-> name label mappings to it.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

In [None]:
num_train_epochs = 20

# Optimiser settings for the fine-tuning run.
optim_kwargs = dict(
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
)
# Evaluate and checkpoint once per epoch, and load the best model at the end.
checkpoint_kwargs = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(
    output_dir="result_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    **optim_kwargs,
    **checkpoint_kwargs,
)

# Wire the model, data, collator and metric function into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Evaluate the trainer's current model on the validation set passed as eval_dataset.
trainer.evaluate()

In [None]:
# predict
# Wrap the fine-tuned model in an inference pipeline. truncation=True is set
# explicitly so that max_length=512 is enforced on long texts rather than
# relying on the tokenizer's implicit fallback.
classifier = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
)

test_texts = test_df['text'].tolist()
test_predictions = classifier(test_texts)

print(test_predictions)

# Preview at most the first 10 predictions. Slicing keeps this safe when the
# test set has fewer than 10 rows (indexing with range(10) would raise).
for text, prediction in zip(test_texts[:10], test_predictions[:10]):
    print(f"Text: {text}")
    print(f"Prediction: {prediction['label']}")
    print(f"Confidence: {prediction['score']}")
    print()