## Import libraries

In [82]:
!pip install transformers==4.9.2
!pip install nlp==0.4.0
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset, Dataset
import torch
import numpy as np
import pandas as pd
from google.colab import drive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Loading model and dataset




In [83]:
drive.mount('/content/drive')
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/NLP/Merged_Preprocessed_Data_Updated (1).csv', split='train')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using custom data configuration default


### Split the dataset into training and test dataset

In [84]:
my_data = []
data_count = [0,0,0]
for data in dataset:
  if sum(data_count) == 30:
    break
  if data_count[data["label"]] == 10:
    continue
  my_data.append(data)
  data_count[data["label"]] += 1
df = pd.DataFrame(my_data)
dataset = dataset.train_test_split(test_size=0.001)
train_set = Dataset.from_pandas(df)
test_set = dataset['test']

### Load heBERT pretrain-model

In [85]:
model = BertForSequenceClassification.from_pretrained('avichr/heBERT_sentiment_analysis')
tokenizer = BertTokenizerFast.from_pretrained('avichr/heBERT_sentiment_analysis')

loading configuration file https://huggingface.co/avichr/heBERT_sentiment_analysis/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/1966f39378e2eb1d4405f2bf37672fb6881269a12be943555fe0914382cb4876.7ac37fd18f373ab2080ec1f082bc64226d0a5b08371461a52ddb09714ee7709d
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "neutral",
    "1": "positive",
    "2": "negative"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 2,
    "neutral": 0,
    "positive": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "total_flos": 6997313242916978688,
  "transformers_

## Preprocess Data

In [86]:
def preprocess(data):
    return tokenizer(data['comment'], padding=True, truncation=True,max_length=50, add_special_tokens = True)

In [87]:
train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

  0%|          | 0/1 [00:00<?, ?it/s]

## Finetuning BERT for Sentiment Analysis 

### Train the model

In [88]:
batch_size = 1
epochs = 1
warmup_steps = 500
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir='./logs',
    save_steps=30
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 30
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 30


Step,Training Loss


Saving model checkpoint to ./results/checkpoint-30
Configuration saved in ./results/checkpoint-30/config.json
Model weights saved in ./results/checkpoint-30/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=30, training_loss=3.3007306416829425, metrics={'train_runtime': 8.6159, 'train_samples_per_second': 3.482, 'train_steps_per_second': 3.482, 'total_flos': 770840091000.0, 'train_loss': 3.3007306416829425, 'epoch': 1.0})

In [89]:
from transformers import TextClassificationPipeline

pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device  = 0
)

In [118]:
pipeline("I dont like the hospital")

[[{'label': 'neutral', 'score': 0.0062800380401313305},
  {'label': 'positive', 'score': 0.012196362018585205},
  {'label': 'negative', 'score': 0.9815235733985901}]]

In [95]:
pipeline("I like you")

[[{'label': 'neutral', 'score': 0.00038969668094068766},
  {'label': 'positive', 'score': 0.9992244839668274},
  {'label': 'negative', 'score': 0.0003858081763610244}]]

In [92]:
pipeline("rubbish get appointment monday morning emergency appointment available per day reception staff ask problem order screen call though trained emergency appointment left got appointment available week advised go walk centre lri")

[[{'label': 'neutral', 'score': 0.04122542962431908},
  {'label': 'positive', 'score': 0.9437763690948486},
  {'label': 'negative', 'score': 0.014998206868767738}]]

In [93]:
pipeline("visited minor injury clinic yesterday afternoon whilst service received nurse fine person reception appalling manner turn give detail opening comment problem found insensitive upsetting would expect anyone matter job well anyway comment doubly thoughtless")

[[{'label': 'neutral', 'score': 0.011190050281584263},
  {'label': 'positive', 'score': 0.03933887556195259},
  {'label': 'negative', 'score': 0.9494710564613342}]]