In [1]:
! pip install transformers datasets accelerate evaluate



In [2]:
import torch

class Config:
  DATASET_ID = "emad12/stock_tweets_sentiment"
  MODEL_CKPT = "bert-base-uncased"
  SRC_COLUMN = "tweet"
  TGT_COLUMN = "sentiment"
  TEST_SIZE = 0.2 # 2%
  SEED = 0
  MAX_LENGTH = 32
  EVAL_METRIC = "accuracy"
  MODEL_OUT_DIR = "distilbert-stock-tweet-sentiment-analysis"
  LR = 2E-5
  BATCH_SIZE = 16
  WEIGHT_DECAY = 0.01
  EVAL_STRATEGY = "epoch"
  SAVE_STRATEGY = "epoch"
  LOGGING_STRATEGY = "epoch"
  PUSH_TO_HUB = True
  NUM_TRAIN_EPOCHS = 3

  # CUDA : Compute Unified Device Architecture
  # If the PC has a nvidia graphic card, we can use cuda
  # which is way faster than cpu

  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  ID2LABEL = {
      0 : "NEUTRAL",
      1 : "POSITIVE",
      2 : "NEGATIVE"
  }

  LABEL2ID = {
      "NEUTRAL" : 0,
      "POSITIVE" : 1,
      "NEGATIVE" : 2
  }



config = Config()

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset,load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [15]:
class TextClassificationDataset:

  def __init__(self):
    self.dataset_id = config.DATASET_ID
    self.model_ckpt = config.MODEL_CKPT
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
    self.src_column = config.SRC_COLUMN
    self.tgt_column = config.TGT_COLUMN
    self.seed = config.SEED
    self.test_size = config.TEST_SIZE
    self.max_length = config.MAX_LENGTH

  def create_data(self):
    self.data = load_dataset(self.dataset_id,split="train")
    self.df = self.data.to_pandas()
    self.df = self.df[[self.src_column,self.tgt_column]]

    # Target column contains these 3 numbers -1,0,1
    # Usually we try and avoid negative values to feed in the model, hence we change that
    self.df[self.tgt_column] = self.df[self.tgt_column].apply(lambda x: 2 if x==-1 else x)

    # Although the model that we are using here is 'uncased' meaning all are lowercase
    # But writing the below code for general case
    # We need to lower the case of the dataset in order to reduce the variations and
    # make the model more consistent
    self.df[self.src_column] = self.df[self.src_column].apply(lambda x: x.lower())

    # Sampling the data for now, we can change later
    self.df = self.df.sample(40000)

    self.train_df, self.test_df = train_test_split(
        self.df,
        test_size = self.test_size,
        shuffle = True,
        random_state=self.seed,
        stratify=self.df[self.tgt_column ]
    )

    # Converting the pandas dataframe to huggingface dataframe

    self.train_data = Dataset.from_pandas(self.train_df)
    self.test_data = Dataset.from_pandas(self.test_df)

    return self.train_data,self.test_data

  def tokenize_function(self,example):

    # The below line will add a "input_ids" and "attention_mask" column
    model_input = self.tokenizer(example[self.src_column],padding=True,truncation=True,max_length=self.max_length)
    labels = torch.tensor(example[self.tgt_column], dtype=torch.int)

    # Adding a "labels" column
    model_input["labels"] = labels

    return model_input

  def preprocess_function(self,data):
    # The data column names are src_column and tgt_column
    # Those will be removed and will only be left with
    # "input_ids", "attention_mask" and "labels" as columns in the dataset

    model_input = data.map(self.tokenize_function,batched=True,remove_columns=data.column_names)
    return model_input

  def generate_dataset(self):
    train_data, test_data = self.create_data()
    train_tokenized_data = self.preprocess_function(train_data)
    test_tokenized_data = self.preprocess_function(test_data)

    return train_tokenized_data, test_tokenized_data

In [11]:
import evaluate
import numpy as np

class TextClassificationTrainer:
  def __init__(self,train_data,test_data):
    self.train_data = train_data
    self.test_data = test_data
    self.model_ckpt = config.MODEL_CKPT

    self.id2label = config.ID2LABEL
    self.label2id = config.LABEL2ID
    self.num_labels = len(self.id2label)

    self.device = config.DEVICE
    self.eval_metric = config.EVAL_METRIC

    self.model_out_dir = config.MODEL_OUT_DIR
    self.lr = config.LR
    self.batch_size = config.BATCH_SIZE
    self.weight_decay = config.WEIGHT_DECAY

    self.eval_strategy = config.EVAL_STRATEGY
    self.save_strategy = config.SAVE_STRATEGY
    self.logging_strategy = config.LOGGING_STRATEGY
    self.push_to_hub = config.PUSH_TO_HUB
    self.num_train_epochs = config.NUM_TRAIN_EPOCHS

    self.model = AutoModelForSequenceClassification.from_pretrained(
        self.model_ckpt,
        id2label=self.id2label,
        label2id=self.label2id,
        num_labels=self.num_labels
      ).to(self.device)

    self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
    self.eval_metric_computer = evaluate.load(self.eval_metric)
    self.data_collator = DataCollatorWithPadding(self.tokenizer)

  def compute_metrics(self,eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)

    return self.accuracy.compute(predictions=predictions,references=labels)

  def set_training_args(self):
    return TrainingArguments(
        output_dir = self.model_out_dir,
        learning_rate = self.lr,
        per_device_train_batch_size = self.batch_size,
        per_device_eval_batch_size = self.batch_size,
        weight_decay = self.weight_decay,
        save_strategy = self.save_strategy,
        logging_strategy = self.logging_strategy,
        push_to_hub = self.push_to_hub,
        num_train_epochs = self.num_train_epochs
    )

  def model_trainer(self):
    return Trainer(
        model = self.model,
        args = self.set_training_args(),
        data_collator = self.data_collator,
        train_dataset = self.train_data,
        eval_dataset = self.test_data,
        compute_metrics = self.compute_metrics
    )

  def train_and_save_and_push_to_hub(self):
    trainer = self.model_trainer()
    trainer.train()
    trainer.push_to_hub()

In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
if __name__=="__main__":
  textclassificationdataset = TextClassificationDataset()
  train_data, test_data = textclassificationdataset.generate_dataset()

  textclassificationtrainer = TextClassificationTrainer(train_data,test_data)
  textclassificationtrainer.train_and_save_and_push_to_hub()
  
  print("Finished training and pushed to hub.")

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
2000,0.6362
4000,0.4227
6000,0.3002


Finished training and pushed to hub.


In [19]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis",model = config.MODEL_OUT_DIR,tokenizer="distilbert-base-uncased")
text = "have a great weekend everyone will be back to full schedule next week spy aapl baba"

classifier(text)

[{'label': 'POSITIVE', 'score': 0.9967382550239563}]