# Text Classification with plain transformers
- ref: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [20]:
# import package
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.model_selection import train_test_split
import torch
import evaluate

In [21]:
# load data
train_df = pd.read_csv('dataset/train.csv', sep='\t', encoding='utf-8')
test_df = pd.read_csv('dataset/test.csv', sep='\t', encoding='utf-8')
print(f"Training data shape: {train_df.shape}")
print(train_df.head())
print(f"Testing data shape: {test_df.shape}")
print(test_df.head())   # no labels

Training data shape: (4987, 2)
                                                text label
0  Get the latest from TODAY Sign up for our news...     1
1  2d  Conan On The Funeral Trump Will Be Invited...     1
2  It’s safe to say that Instagram Stories has fa...     0
3  Much like a certain Amazon goddess with a lass...     0
4  At a time when the perfect outfit is just one ...     0
Testing data shape: (1247, 2)
   id                                               text
0   2  The 2017 Teen Choice Awards ceremony was held ...
1   3  The concert, part of “The Joshua Tree Tour,” w...
2   4  Selena Gomez refuses to talk to her mother abo...
3   5  This is worse than a lump of coal in your stoc...
4   6  Luann De Lesseps is going to rehab after her a...


In [22]:
# EDA

# check NaN values
print(train_df.isnull().sum())
# print unique labels
print(train_df['label'].unique())
# find the row that label == 'label'
print(train_df[train_df['label'] == 'label'])

# remove the row that label == 'label'
train_df = train_df[train_df['label'] != 'label']

# save labels as int type
train_df['label'] = train_df['label'].astype(int)

text     0
label    0
dtype: int64
['1' '0' 'label']
         text  label
1615  content  label


In [23]:
# train validation split

train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)

print(train_df.head())
print(val_df.head())

(3988, 2)
(998, 2)
                                                   text  label
2415  Singer Aaron Carter, who has previously been f...      0
3159  Nineteen years ago, a gay man and his straight...      0
3009  The mother! of all relationships is over.  Jen...      1
3612  The Republican war on women continues unabated...      1
4518  As Taylor Swift calls out the haters on her ne...      0
                                                   text  label
1489  George Timothy Clooney (born May 6, 1961) is a...      1
2755  Do you feel it in your fingers? Do you feel it...      0
465   Advertisement  The royal family gathered this ...      0
2489  Roger Ailes, Former Fox News CEO, Dies At 77  ...      0
676   American serial child sexual abuser and physic...      0


In [24]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [25]:
# tokenize the text
train_encodings = tokenizer(train_df.iloc[:10]['text'].tolist(), max_length=512, truncation=True, padding=True)
val_encodings = tokenizer(val_df.iloc[:10]['text'].tolist(), max_length=512, truncation=True, padding=True)
test_encodings = tokenizer(test_df.iloc[:10]['text'].tolist(), max_length=512, truncation=True, padding=True)
print(train_encodings)
print(val_encodings)
print(test_encodings)

{'input_ids': [[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045,

In [26]:
for embedding in train_encodings['input_ids']:
    print(embedding)

[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045, 2318, 2000, 24

In [27]:
# customize dataset
class KDDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_labels = train_df.iloc[:10]['label'].tolist()
val_labels = val_df.iloc[:10]['label'].tolist()

train_dataset = KDDDataset(train_encodings, train_labels)
val_dataset = KDDDataset(val_encodings, val_labels)

In [28]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [29]:
accuracy = evaluate.load("accuracy")

In [30]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [31]:
id2label = {0: "Real", 1: "Fake"}
label2id = {"Real": 0, "Fake": 1}

In [32]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
training_args = TrainingArguments(
    output_dir="result_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
                                             
 50%|█████     | 1/2 [00:02<00:01,  1.89s/it]

{'eval_loss': 0.674322247505188, 'eval_accuracy': 0.7, 'eval_runtime': 0.4248, 'eval_samples_per_second': 23.539, 'eval_steps_per_second': 2.354, 'epoch': 1.0}


                                             
100%|██████████| 2/2 [00:05<00:00,  2.60s/it]

{'eval_loss': 0.670178234577179, 'eval_accuracy': 0.6, 'eval_runtime': 0.4534, 'eval_samples_per_second': 22.054, 'eval_steps_per_second': 2.205, 'epoch': 2.0}


100%|██████████| 2/2 [00:07<00:00,  3.58s/it]

{'train_runtime': 7.1777, 'train_samples_per_second': 2.786, 'train_steps_per_second': 0.279, 'train_loss': 0.6856998205184937, 'epoch': 2.0}





TrainOutput(global_step=2, training_loss=0.6856998205184937, metrics={'train_runtime': 7.1777, 'train_samples_per_second': 2.786, 'train_steps_per_second': 0.279, 'train_loss': 0.6856998205184937, 'epoch': 2.0})

In [34]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00, 150.97it/s]


{'eval_loss': 0.670178234577179,
 'eval_accuracy': 0.6,
 'eval_runtime': 0.4408,
 'eval_samples_per_second': 22.687,
 'eval_steps_per_second': 2.269,
 'epoch': 2.0}

In [36]:
# predict
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, max_length=512)

test_predictions = classifier(test_df.iloc[0:10]['text'].tolist())

print(test_predictions)

for i in range(10):
    print(f"Text: {test_df.iloc[i]['text']}")
    print(f"Prediction: {test_predictions[i]['label']}")
    print(f"Confidence: {test_predictions[i]['score']}")
    print()

[{'label': 'Real', 'score': 0.5050602555274963}, {'label': 'Real', 'score': 0.5435073971748352}, {'label': 'Real', 'score': 0.5242502689361572}, {'label': 'Real', 'score': 0.537466824054718}, {'label': 'Real', 'score': 0.5254349112510681}, {'label': 'Real', 'score': 0.5186701416969299}, {'label': 'Real', 'score': 0.5186699628829956}, {'label': 'Real', 'score': 0.5214144587516785}, {'label': 'Real', 'score': 0.5191699862480164}, {'label': 'Real', 'score': 0.5207558274269104}]
Text: The 2017 Teen Choice Awards ceremony was held on August 13, 2017.[1] The awards celebrated the year's achievements in music, film, television, sports, fashion, comedy, and the Internet, and were voted on by viewers living in the USA, aged 13 and over through various social media sites.[2] A three hour musical festival called "Teen Fest" and hosted by Jake Paul was streamed exclusively on YouTube with some of the event appearing during the Teen Choice broadcast.[3] Maroon 5 received the inaugural Decade Award.