In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/smileannotaions/smile-annotations-final.csv


## Task 1: Introduction
What is BERT
BERT is a large-scale transformer-based Language Model that can be finetuned for a variety of tasks.

## Task 2: Exploratory Data Analysis and Preprocessing
We will use the SMILE Twitter dataset.


In [4]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [5]:
df = pd.read_csv("../input/smileannotaions/smile-annotations-final.csv", names = ['id','text','category'])

In [6]:
df.set_index('id',inplace = True)

In [7]:
df.head()
df.text.iloc[0]

'@aandraous @britishmuseum @AndrewsAntonio Merci pour le partage! @openwinemap'

In [8]:
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [9]:
df = df[-df.category.str.contains("\|")]
df = df[df.category != 'nocode']

In [10]:
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [11]:
possible_labels = df.category.unique()
possible_labels

array(['happy', 'not-relevant', 'angry', 'disgust', 'sad', 'surprise'],
      dtype=object)

In [12]:
label_dict = {}
for index,possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [13]:
df['label'] = df.category.replace(label_dict)

In [14]:
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


## Task 3: Training/Validation Split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train,x_val,y_train,y_val = train_test_split(
    df.index.values , df.label.values , test_size= 0.15 , random_state= 17 , stratify= df.label.values)
df['data_type'] = ['not_set'] * df.shape[0]
df.head()

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,not_set
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,not_set
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,not_set
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,not_set
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,not_set


In [17]:
df.loc[x_train ,'data_type'] = 'train'
df.loc[x_val ,'data_type'] = 'val'

In [18]:
df.groupby(['category' ,'label','data_type'] ).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


## Task 4: Loading Tokenizer and Encoding our Data

In [19]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased' , do_lower_case = True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [21]:
encoded_data_train =  tokenizer.batch_encode_plus(
            df[df.data_type == 'train'].text.values , 
            add_special_tokens=True , 
            return_attention_mask=True,
            pad_to_max_length = True,
            max_length=250 ,
            return_tensors='pt')

encoded_data_val =  tokenizer.batch_encode_plus(
            df[df.data_type == 'val'].text.values , 
            add_special_tokens=True , 
            return_attention_mask=True,
            pad_to_max_length = True,
            max_length=250 ,
            return_tensors='pt')


inputs_ids_train = encoded_data_train['input_ids']
attention_mask_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

inputs_ids_val = encoded_data_val['input_ids']
attention_mask_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [22]:
dataset_train = TensorDataset(inputs_ids_train , attention_mask_train , labels_train)
dataset_val   = TensorDataset(inputs_ids_val , attention_mask_val , labels_val)

In [23]:
len(dataset_train)

1258

In [24]:
len(dataset_val)

223

## Task 5: Setting up BERT Pretrained Model

In [25]:
from transformers import BertForSequenceClassification

In [26]:
model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels= len(label_dict),
        output_attentions = False ,
        output_hidden_states = False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Task 6: Creating Data Loaders

In [27]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [28]:
dataloader_train = DataLoader(dataset_train ,
                              sampler= RandomSampler(dataset_train),
                             batch_size = 4)

dataloader_val = DataLoader(dataset_val ,
                              sampler= RandomSampler(dataset_val),
                             batch_size = 32)

## Task 7: Setting Up Optimizer and Scheduler

In [29]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [30]:
optimizer = AdamW(model.parameters() ,lr = 1e-5 , eps = 1e-8)
epochs = 10

In [31]:
scheduler = get_linear_schedule_with_warmup(optimizer ,
                                            num_warmup_steps= 0,
                                           num_training_steps= len(dataloader_train)*epochs)

## Task 8: Defining our Performance Metrics

In [32]:
from sklearn.metrics import f1_score

In [33]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds , axis =1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat , preds_flat , average="weighted")

In [67]:
def accuracy_per_class(preds, labels):
    labels_dict_inverse = {v:k for k,v in label_dict.items()}
    
    preds_flat = np.argmax(preds , axis =1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true  = labels_flat[labels_flat == label]
        print(f'class:{labels_dict_inverse[label]}')
        print(f'accuracy:{len(y_preds[y_preds == label])}/{len(y_true)}\n')

## Task 9: Creating our Training Loop

In [35]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [37]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val)  
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [38]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train , 
                        desc = 'epoch {:I=1d}'.format(epoch),
                       leave = False , 
                       disable = False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs ={'input_ids' : batch[0],
                 'attention_mask':batch[1],
                 'labels' : batch[2]}
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(),1)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss':'{:3f}'.format(loss.item()/len(batch))})
   # torch.save(model.state_dict(),f'Models/Bert_ft_epoch{epoch}.model')
    tqdm.write('\nEpoch{epoch}')
    loss_train_avg = loss_train_total/len(dataloader_train)
        
    tqdm.write(f'training_loss :{loss_train_avg}')
    val_loss , predictions ,true_values = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions , true_values)
    tqdm.write(f'validation loss :{val_loss}')
    tqdm.write(f'F1_score(weighted) :{val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 1:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.7689728549548558


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.5855848448617118
F1_score(weighted) :0.7769166003988482


epoch 2:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.4481437216231984


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.6229808756283352
F1_score(weighted) :0.8484651828119184


epoch 3:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.2783280083296141


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.6635871529579163
F1_score(weighted) :0.8401985906470213


epoch 4:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.1899415168811434


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.6129864241395678
F1_score(weighted) :0.8546580244026889


epoch 5:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.12844385036511258


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.7755047785384315
F1_score(weighted) :0.8394140153643124


epoch 6:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.06722971337925022


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.7495512962341309
F1_score(weighted) :0.8534744513378119


epoch 7:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.0484527624299055


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.7906738945416042
F1_score(weighted) :0.8562845973159876


epoch 8:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.031391148478169706


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.8315566395010267
F1_score(weighted) :0.8547498447681376


epoch 9:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.023054983788391666


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.7830214330128261
F1_score(weighted) :0.8599325729086263


epoch 10:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch{epoch}
training_loss :0.022075138912011412


  0%|          | 0/7 [00:00<?, ?it/s]

validation loss :0.8066349284989494
F1_score(weighted) :0.8708976807159483


## save model 

In [39]:
torch.save(model.state_dict(),'../working/sentiment_Analysis.model')

## Task 10: Loading and Evaluating our Model


In [40]:
from transformers import BertForSequenceClassification

In [41]:
model2 = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=6,
                                                      output_attentions=False,
                                                      output_hidden_states=False
                                                      )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2.to(device)
print(device)
pass

cuda


In [56]:
model2.load_state_dict(torch.load('../working/sentiment_Analysis.model' ,map_location=torch.device('cuda') ) )

<All keys matched successfully>

In [61]:
_,predictions,true_vals = evaluate(dataloader_val)

  0%|          | 0/7 [00:00<?, ?it/s]

In [62]:
predictions

array([[ 7.7751017 , -1.2679998 , -2.2954752 , -1.8883604 , -1.2804353 ,
        -1.9283006 ],
       [ 7.789495  , -1.6507716 , -2.2226288 , -1.7088171 , -1.2414494 ,
        -1.7250088 ],
       [ 7.79996   , -1.218955  , -2.5107586 , -1.8783234 , -1.1751504 ,
        -1.8808656 ],
       ...,
       [ 7.9200635 , -1.6884208 , -2.2749364 , -1.7718617 , -1.1399211 ,
        -1.8092116 ],
       [ 7.746097  , -2.049114  , -2.20051   , -1.6845186 , -0.84045184,
        -1.6721164 ],
       [ 7.880602  , -1.8670471 , -2.1236322 , -1.6987044 , -1.1565515 ,
        -1.7241452 ]], dtype=float32)

In [59]:
true_vals

array([0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 5,
       4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 5, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 4,
       0, 2, 0, 0, 0, 5, 0, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 2, 0, 2, 0, 0, 0, 1, 0, 0, 0,
       5, 0, 0])

In [68]:
accuracy_per_class(predictions, true_vals)

class:happy
accuracy:164/171

class:not-relevant
accuracy:21/32

class:angry
accuracy:6/9

class:disgust
accuracy:0/1

class:sad
accuracy:2/5

class:surprise
accuracy:2/5

