`Importing the necessary libraries`

In [None]:
import os
# Set before torch/transformers are imported further down in this notebook.
# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, so a device-side
# error is reported at the call that triggered it (useful for debugging, slower to run).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
!pip install -U accelerate
!pip install -U transformers
!pip install -U datasets

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Down

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the CSV read below lives in MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importing the data with minor changes

In [None]:
# Reuse the cleaned data prepared for the LSTM model; only the 'text' and
# 'target' columns are loaded.
csv_path = '/content/drive/MyDrive/ZUM/projekt/data2_clean.csv'
data = pd.read_csv(csv_path, usecols=['text', 'target'], index_col=False)

In [None]:
# Normalise column names: the classification target is called 'label' in the
# rest of this notebook.
data = data.rename(columns={'target': 'label'})

In [None]:
data

Unnamed: 0,text,label
0,kicker watchlist xide tit soq pnk cpw bpz aj t...,1
1,user aap movie number return feageed indicator...,1
2,user I have afraid short amzn looking like nea...,1
3,mnta number,1
4,oi number,1
...,...,...
5786,industry body cii say discom likely suffer net...,0
5787,gold price slip rs number investor book profit...,0
5788,worker bajaj auto agree number wage cut period...,1
5789,sharemarket live sensex day high number point ...,1


In [None]:
# Show the rows whose 'text' is missing.
data[data['text'].isna()]

Unnamed: 0,text,label
1250,,1


In [None]:
# Drop every row that contains a missing value; the remaining rows keep their
# original index labels.
data = data.dropna()

In [None]:
from datasets import Dataset

# preserve_index=False keeps the pandas index out of the Dataset, so there is no
# '__index_level_0__' column to strip afterwards. Removing that column by name
# fails whenever the frame has a default RangeIndex, because the column is then
# never created.
dataset_ = Dataset.from_pandas(data, preserve_index=False)

# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# (and therefore the reported accuracy) reproducible across runs.
dataset = dataset_.train_test_split(test_size=0.1, seed=42)

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5211
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 579
    })
})


In [None]:
dataset['train'][0]

{'text': 'maruti suzuki march sale plunge nearly number amid coronavirus outbreak httpstco05lzqly97h',
 'label': 0}

Fine-tuning

In [None]:
# Use the RoBERTa base checkpoint instead of BERT.
model_checkpoint = 'roberta-base'
# Per-device batch size, used for both training and evaluation below.
batch_size = 128

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint; use_fast=True
# requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def process(x):
  """Tokenize the 'text' field of a dataset example (or batch of examples).

  Args:
    x: mapping with a 'text' entry, either a single string or a list of
      strings when used with ``Dataset.map(..., batched=True)``.

  Returns:
    Whatever the module-level ``tokenizer`` returns for that text; for this
    notebook that adds 'input_ids' and 'attention_mask' to the dataset.
  """
  # truncation=True cuts over-long inputs at the tokenizer's maximum length so
  # they cannot exceed the model's position embeddings.
  return tokenizer(x['text'], truncation=True)

In [None]:
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 5211
})

In [None]:
# Tokenize both splits. batched=True passes lists of texts to the tokenizer,
# which lets the fast tokenizer process many examples per call; the resulting
# columns are the same as with per-example mapping.
train_ds = dataset['train'].map(process, batched=True)
test_ds = dataset['test'].map(process, batched=True)

Map:   0%|          | 0/5211 [00:00<?, ? examples/s]

Map:   0%|          | 0/579 [00:00<?, ? examples/s]

In [None]:
train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5211
})

In [None]:
# Model definition, using the Hugging Face transformers library.
# AutoModel is imported from the public package rather than from the internal
# transformers.pipelines.base module, which is not a supported import location.
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# There are two classes to predict (labels 0 and 1).
num_labels = 2
# Loads the pretrained encoder and attaches a classification head with
# num_labels outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration. Evaluation, checkpointing and logging all happen once
# per epoch so that the best checkpoint (by accuracy) can be restored at the end
# and the training loss is actually reported (with the default step-based logging
# interval, a run this short never reaches the first logging step).
args = TrainingArguments(
    output_dir = f'{model_checkpoint}_sentiment_analysis',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',           # must match evaluation_strategy for load_best_model_at_end
    logging_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 5,
    weight_decay = 0.01,
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy'  # key returned by compute_metrics
)

In [None]:
import numpy as np

# Accuracy is computed directly with numpy: datasets.load_metric is deprecated
# and no longer available in newer releases of `datasets`.
def compute_metrics(eval_preds):
  """Compute classification accuracy for the Trainer.

  Args:
    eval_preds: pair ``(logits, labels)``; logits have one score per class
      along the last axis, labels hold the true class ids.

  Returns:
    dict with a single key 'accuracy' (fraction of correct predictions).
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return {'accuracy': float(np.mean(predictions == np.asarray(labels)))}

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Wire the model, training configuration, tokenized splits and metric function
# together. Passing the tokenizer lets the Trainer pad each batch for us.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: evaluate on the test split before any fine-tuning has happened.
trainer.evaluate()

{'eval_loss': 0.738693356513977,
 'eval_accuracy': 0.40241796200345425,
 'eval_runtime': 1.2659,
 'eval_samples_per_second': 457.383,
 'eval_steps_per_second': 3.95}

In [None]:
# Fine-tune the model with the settings from `args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.604824,0.685665
2,No log,0.565655,0.749568
3,No log,0.542565,0.759931
4,No log,0.530937,0.768566
5,No log,0.543759,0.768566


TrainOutput(global_step=205, training_loss=0.47207139178020197, metrics={'train_runtime': 223.6126, 'train_samples_per_second': 116.518, 'train_steps_per_second': 0.917, 'total_flos': 467756984219400.0, 'train_loss': 0.47207139178020197, 'epoch': 5.0})

In [None]:
trainer.model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
import torch

# Sentence whose sentiment will be predicted; replace it to try other inputs,
# e.g. 'This is a positive example'.
text = 'The stocks will raise in next 2 years'

# Run on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenize to PyTorch tensors and move both tensors to the chosen device.
inputs = tokenizer(text, return_tensors='pt')
input_ids, attention_mask = (
    inputs[key].to(device) for key in ('input_ids', 'attention_mask')
)

In [None]:
# Save the model weights (state_dict only) so the model can be reused for other problems.
torch.save(model.state_dict(), 'robert_model.pt')

In [None]:
# Put the model in evaluation mode explicitly so dropout is disabled and the
# prediction does not depend on whatever mode the Trainer left the model in.
model.eval()

# No gradients are needed for inference.
with torch.no_grad():
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
  logits = outputs.logits
  # Index of the highest-scoring class.
  predictions = torch.argmax(logits, dim=-1)

# Class 0 is reported as negative, the other class as positive.
if predictions.item() == 0:
  print('This is a negative text')
else:
  print('This is a positive text')

This is a positive text
