### STEP 1 : Installations and Import

In [1]:
!pip install sentence-transformers -q
!pip install accelerate -U -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.9 MB/s[0m eta [

In [2]:
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score

### STEP 2 : Loading the Data

In [5]:
df = pd.read_csv('/content/twitter.csv').sample(5000).reset_index(drop=True)
df.head(3)

Unnamed: 0,id,label,tweet
0,11187,0,sad world we live in when you hope your kids d...
1,7505,0,you hu my feelings so much ð­ð­ you staing...
2,25718,0,"@user #bestsellers #inspiration #golf ""neve..."


### STEP 3: Load the Model

In [6]:
## For each model there is a specific tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
### To let the model pass into the GPU acceleration
model = model.to('cuda')

### STEP 4 : Tokenization

In [8]:
sample_data =['hi, how are you?']
tokenizer(sample_data, padding=True, truncation=True)

{'input_ids': [[101, 7632, 1010, 2129, 2024, 2017, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
### Tokenizing the data
X = list(df['tweet'])
y = list(df['label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=512)

In [10]:
print("Encoded Words of the first Tweet : \n",train_encodings['input_ids'][0])

Encoded Words of the first Tweet : 
 [101, 2269, 1005, 1055, 2154, 1012, 1012, 13008, 1006, 6358, 1007, 1998, 9122, 1006, 8600, 1007, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### STEP 5: Training

In [11]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [12]:
### Converting into torch Dataset
train_dataset = Dataset(train_encodings, y_train)
test_dataset = Dataset(test_encodings, y_test)

In [13]:
train_dataset[5]

{'input_ids': tensor([  101,  1001,  2047, 29100,  3366,  3726,  7087,  2039,  1024,  2017,
          2097, 16083,  2115,  7087,  1998,  2017,  2097,  3622,  2009,  3649,
          2017,  2215,  2009,  2000,  2079,  1012,  2043,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask':

In [14]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall,
            "f1_score":f1}

In [25]:
# Define Trainer
args = TrainingArguments(
    output_dir="/content/output",
    num_train_epochs=3,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [26]:
%%time
## Initialize the training
trainer.train()

Step,Training Loss
500,0.1751
1000,0.0868
1500,0.0231


CPU times: user 3min 48s, sys: 4.4 s, total: 3min 53s
Wall time: 4min 7s


TrainOutput(global_step=1500, training_loss=0.09499734687805175, metrics={'train_runtime': 246.8884, 'train_samples_per_second': 48.605, 'train_steps_per_second': 6.076, 'total_flos': 511833224880000.0, 'train_loss': 0.09499734687805175, 'epoch': 3.0})

In [27]:
### Evaluating the performance
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.18591611087322235,
 'eval_accuracy': 0.968,
 'eval_precision': 0.726027397260274,
 'eval_recall': 0.8153846153846154,
 'eval_f1_score': 0.7681159420289856,
 'eval_runtime': 4.2216,
 'eval_samples_per_second': 236.879,
 'eval_steps_per_second': 29.61,
 'epoch': 3.0}

### STEP 6: Evaluation

In [28]:
## save the model and load the saved model
trainer.save_model('/content/CustomModel')
loaded_model = BertForSequenceClassification.from_pretrained('/content/CustomModel').to('cuda')

In [29]:
## Testing on unseen data
test_text = 'that was a good point'
inputs = tokenizer(test_text,padding=True,truncation=True,return_tensors='pt').to('cuda')
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.99217176, 0.00782828]], dtype=float32)

In [31]:
## Testing on unseen data
test_text = 'It was a very bad experience'
inputs = tokenizer(test_text,padding=True,truncation=True,return_tensors='pt').to('cuda')
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.998555  , 0.00144504]], dtype=float32)