In [1]:
import pandas as pd
import transformers
import torch
from sklearn.metrics import f1_score

In [2]:
raw_df = pd.read_csv('labeled_data_corpus.csv')
train_df = raw_df.loc[raw_df['subset'] == 'train']
test_df = raw_df.loc[raw_df['subset'] == 'test']

In [3]:
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, frame: pd.DataFrame, tokenizer):
      self.inputs = []
      self.labels = []
      for index, row in frame.iterrows():
          inputs = tokenizer(row['msg'], return_tensors="pt", max_length=200, truncation=True, padding='max_length')
          input_ids = inputs['input_ids'][0]
          self.inputs.append(input_ids)
          self.labels.append(row['label'])
  def __len__(self):
      return len(self.labels)
  def __getitem__(self, index):
      return {"input_ids" : self.inputs[index], 'labels': torch.tensor(self.labels[index])}

In [4]:
tokenizer = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_dataset = TextDataset(train_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [5]:
class Model(torch.nn.Module):
  def __init__(self, criterion):
    super().__init__()
    self.bert = transformers.DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    self.criterion = criterion

  def forward(self, input_ids, labels):
    out = self.bert(input_ids).logits
    if labels is not None:
      loss = self.criterion(out, labels)
      return loss, out.max(dim=1).indices
    else:
      return (out.max(dim=1).indices, )

In [6]:
def compute_metrics (evalPrediction):
    predictions = evalPrediction.predictions
    label_ids = evalPrediction.label_ids
    print(predictions)
    print(label_ids)
    return {
        "f1": f1_score(label_ids, predictions)
    }

In [7]:
args = transformers.TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 16,
    learning_rate = 0.001,
    num_train_epochs=1,
    fp16 = False

)

criterion = torch.nn.CrossEntropyLoss()
model = Model(criterion)

trainer = transformers.Trainer(
    model=model,
    args =args ,
    train_dataset =train_dataset,
    eval_dataset  = test_dataset ,
    compute_metrics = compute_metrics
)
trainer.train()

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

  0%|          | 0/328 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 1309
  Batch size = 16


  0%|          | 0/82 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




[0 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]
{'eval_loss': 0.501939058303833, 'eval_f1': 0.0, 'eval_runtime': 222.7818, 'eval_samples_per_second': 5.876, 'eval_steps_per_second': 0.368, 'epoch': 1.0}
{'train_runtime': 3086.88, 'train_samples_per_second': 1.695, 'train_steps_per_second': 0.106, 'train_loss': 0.5198788991788539, 'epoch': 1.0}


TrainOutput(global_step=328, training_loss=0.5198788991788539, metrics={'train_runtime': 3086.88, 'train_samples_per_second': 1.695, 'train_steps_per_second': 0.106, 'train_loss': 0.5198788991788539, 'epoch': 1.0})