In [1]:
!pip install transformers datasets evaluate scikit-learn




In [2]:
import pandas as pd

train_df = pd.read_csv('/content/multimodal_train.tsv', sep='\t')
val_df   = pd.read_csv('/content/multimodal_validate.tsv', sep='\t')
test_df  = pd.read_csv('/content/multimodal_test_public.tsv', sep='\t')

print(train_df.columns)  # check available columns


Index(['author', 'clean_title', 'created_utc', 'domain', 'hasImage', 'id',
       'image_url', 'linked_submission_id', 'num_comments', 'score',
       'subreddit', 'title', 'upvote_ratio', '2_way_label', '3_way_label',
       '6_way_label'],
      dtype='object')


In [3]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(564000, 16)
(59342, 16)
(59319, 16)


In [12]:
# Select the first 10,000 rows of each dataframe
train_df_small = train_df.iloc[:10000]
val_df_small = val_df.iloc[:2000]
test_df_small = test_df.iloc[:2000]

# Verify the new shapes
print(train_df_small.shape)
print(val_df_small.shape)
print(test_df_small.shape)

(10000, 16)
(2000, 16)
(2000, 16)


In [13]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df_small[['clean_title', '2_way_label']])
val_ds   = Dataset.from_pandas(val_df_small[['clean_title', '2_way_label']])
test_ds  = Dataset.from_pandas(test_df_small[['clean_title', '2_way_label']])


In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["clean_title"], truncation=True, padding="max_length", max_length=64)

train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds = train_ds.rename_column("2_way_label", "labels")
val_ds   = val_ds.rename_column("2_way_label", "labels")
test_ds  = test_ds.rename_column("2_way_label", "labels")

train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
!pip install --upgrade transformers




In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# Load BERT
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels),
        "f1": f1.compute(predictions=preds, references=labels)
    }

# Training Args
training_args = TrainingArguments(
    output_dir="./results",


    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()



  trainer = Trainer(


Step,Training Loss
500,0.4489
1000,0.2912
1500,0.1807


TrainOutput(global_step=1875, training_loss=0.26996063435872397, metrics={'train_runtime': 391.8235, 'train_samples_per_second': 76.565, 'train_steps_per_second': 4.785, 'total_flos': 986666457600000.0, 'train_loss': 0.26996063435872397, 'epoch': 3.0})

In [18]:
print(trainer.evaluate(test_ds))

model.save_pretrained("./text_model")
tokenizer.save_pretrained("./text_model")


Trainer is attempting to log a value of "{'accuracy': 0.865}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.8341523341523341}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.54542076587677, 'eval_accuracy': {'accuracy': 0.865}, 'eval_f1': {'f1': 0.8341523341523341}, 'eval_runtime': 6.8131, 'eval_samples_per_second': 293.552, 'eval_steps_per_second': 18.347, 'epoch': 3.0}


('./text_model/tokenizer_config.json',
 './text_model/special_tokens_map.json',
 './text_model/vocab.txt',
 './text_model/added_tokens.json',
 './text_model/tokenizer.json')

In [19]:
# # Inspect a batch from the training dataloader
# for batch in trainer.get_train_dataloader():
#     print(batch['labels'].dtype)
#     print(batch['input_ids'].dtype)
#     print(batch['attention_mask'].dtype)
#     break # Just inspect one batch

In [None]:
print("Hello")