In [56]:
!pip install tqdm transformers torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [3]:
import pandas as pd

In [6]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

df = pd.read_csv('/kaggle/input/classifierbert/classifier.csv')
df.head()

Unnamed: 0,Input,Query
0,"What is the meaning of ""Bhuvan"" in Sanskrit, a...",Bhuvan
1,"How can the public access Bhuvan, and what typ...",Bhuvan
2,What role does the National Remote Sensing Cen...,Bhuvan
3,Could you provide examples of the practical ap...,Bhuvan
4,How does Bhuvan contribute to ISRO's Earth Obs...,Bhuvan


In [7]:
df['Query'].value_counts()

Query
Bhuvan        10
Aadhaar       10
AddPoint      10
Drawing       10
NavMap        10
              ..
HarState      10
NARL          10
Ganga         10
Heatwave      10
NOEDAWater    10
Name: count, Length: 88, dtype: int64

In [8]:
possible_labels = df.Query.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'Bhuvan': 0,
 'Aadhaar': 1,
 'MGNREGA': 2,
 'NRSC': 3,
 'Bhuvan2D': 4,
 'Geospatial': 5,
 'Yuktdhara': 6,
 'Bhuvan3D': 7,
 'WBIS': 8,
 'RUSA': 9,
 'GI': 10,
 'Flycatch': 11,
 'NCERT': 12,
 'GIS': 13,
 'Tourism': 14,
 'Hpfor': 15,
 'TelFor': 16,
 'PunFor': 17,
 'UkFor': 18,
 'KarFor': 19,
 'NDEM': 20,
 'Thematic': 21,
 'Agro': 22,
 'PMKSY': 23,
 'Heatwave': 24,
 'Ganga': 25,
 'NARL': 26,
 'HarState': 27,
 'Saras': 28,
 'Satell': 29,
 'PMJVK': 30,
 'Anganwadi': 31,
 'Covid': 32,
 'Organiz': 33,
 'NOEDA': 34,
 'RBI': 35,
 'Geoportal': 36,
 'AmTourism': 37,
 'LudMun': 38,
 'Toll': 39,
 'KALAMTARI': 40,
 'Geomorph': 41,
 'AgroPortal': 42,
 'DeltaPortal': 43,
 'CDMAPortal': 44,
 'AndhraSat': 45,
 'Multilingual': 46,
 'GeoTaggingLight': 47,
 'GeoTaggingAgri': 48,
 'AIBP': 49,
 'MahaWater': 50,
 'PMGSY': 51,
 'Collab': 52,
 'PunHer': 53,
 'PunGIS': 54,
 'LudMunCollab': 55,
 'AIBPCollab': 56,
 'IMD': 57,
 'Register': 58,
 '2DHelp': 59,
 'BhuvanFind': 60,
 'Admin': 61,
 'NavMap': 62,
 'Drawing'

In [9]:
df['label'] = df.Query.replace(label_dict)



In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['Query', 'label', 'data_type']).count()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Input
Query,label,data_type,Unnamed: 3_level_1
2DHelp,59,train,8
2DHelp,59,val,2
3Don2D,74,train,8
3Don2D,74,val,2
AIBP,49,train,9
...,...,...,...
Weather2D,71,val,2
WebService,80,train,9
WebService,80,val,1
Yuktdhara,6,train,8


In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].Input.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].Input.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [16]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)




In [17]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [19]:
import numpy as np

cuda


cuda


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

RuntimeError: Parent directory data_volume does not exist.

In [23]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [25]:

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [27]:



for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'/kaggle/working/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')



  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 1
Training loss: 4.087772956848145
Validation loss: 4.022597871043465
F1 Score (Weighted): 0.08590222177178698


Epoch 2:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 2
Training loss: 3.9101081638336184
Validation loss: 3.9320993586020037
F1 Score (Weighted): 0.11381773306906996


Epoch 3:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 3
Training loss: 3.813375012397766
Validation loss: 3.8801670507951216
F1 Score (Weighted): 0.17916445153287258


Epoch 4:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 4
Training loss: 3.7830126409530638
Validation loss: 3.8801670507951216
F1 Score (Weighted): 0.17916445153287258


Epoch 5:   0%|          | 0/250 [00:00<?, ?it/s]


Epoch 5
Training loss: 3.7887882776260375
Validation loss: 3.8801670507951216
F1 Score (Weighted): 0.17916445153287258


In [29]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('/kaggle/working/finetuned_BERT_epoch_1.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: Bhuvan
Accuracy: 0/1

Class: Aadhaar
Accuracy: 0/2

Class: MGNREGA
Accuracy: 0/2

Class: NRSC
Accuracy: 0/2

Class: Bhuvan2D
Accuracy: 0/1

Class: Geospatial
Accuracy: 0/1

Class: Yuktdhara
Accuracy: 0/2

Class: Bhuvan3D
Accuracy: 1/1

Class: WBIS
Accuracy: 1/2

Class: RUSA
Accuracy: 1/1

Class: GI
Accuracy: 0/1

Class: Flycatch
Accuracy: 1/1

Class: NCERT
Accuracy: 0/1

Class: GIS
Accuracy: 1/1

Class: Tourism
Accuracy: 1/2

Class: Hpfor
Accuracy: 0/2

Class: TelFor
Accuracy: 1/1

Class: PunFor
Accuracy: 0/2

Class: UkFor
Accuracy: 1/1

Class: KarFor
Accuracy: 1/1

Class: NDEM
Accuracy: 0/2

Class: Thematic
Accuracy: 0/2

Class: Agro
Accuracy: 0/1

Class: PMKSY
Accuracy: 0/2

Class: Heatwave
Accuracy: 0/1

Class: Ganga
Accuracy: 0/1

Class: NARL
Accuracy: 0/1

Class: HarState
Accuracy: 0/2

Class: Saras
Accuracy: 0/1

Class: Satell
Accuracy: 0/2

Class: PMJVK
Accuracy: 0/1

Class: Anganwadi
Accuracy: 0/1

Class: Covid
Accuracy: 2/2

Class: Organiz
Accuracy: 0/1

Class: NOEDA
Ac

In [35]:
model.eval()

# Your input text
input_text = "how to register at bhuvan"

# Tokenize and encode the input text
inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Make predictions
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)

# Extract predicted probabilities or class labels
predicted_probabilities = torch.softmax(output.logits, dim=1).cpu().numpy()
predicted_class = np.argmax(predicted_probabilities, axis=-1)

# Print the results
print("Predicted probabilities:", predicted_probabilities)
print("Predicted class:", predicted_class)

Predicted probabilities: [[0.01205402 0.00678871 0.00580697 0.00866903 0.01225773 0.01121108
  0.01235777 0.00606379 0.01430028 0.0147843  0.01166353 0.01222698
  0.00690184 0.00899304 0.01864684 0.00728631 0.01337449 0.01198034
  0.01024485 0.01356276 0.00950969 0.01273645 0.01280288 0.00961967
  0.01187383 0.00675393 0.01757831 0.00904988 0.01593163 0.01607915
  0.01182196 0.01401507 0.00540755 0.008975   0.01375524 0.00863925
  0.00900889 0.00717433 0.00682692 0.01013758 0.00970669 0.01348936
  0.0126135  0.01725973 0.01246408 0.00629694 0.01153642 0.00897702
  0.00896007 0.00577681 0.00994997 0.00846006 0.00976122 0.01078334
  0.0141584  0.01439384 0.0191335  0.01608367 0.01216887 0.00917387
  0.0116261  0.01291068 0.0124333  0.00988087 0.01066578 0.00849536
  0.01092464 0.00977497 0.01470074 0.00858203 0.01504172 0.00772466
  0.00789098 0.009669   0.01038978 0.01101698 0.019224   0.01257592
  0.01256517 0.01201769 0.02039632 0.01239308 0.01257324 0.00931434
  0.01510507 0.01452705

In [37]:
key_list = list(label_dict.keys())
val_list = list(label_dict.values())
 
# print key with val 100
position = val_list.index(predicted_class)
print(key_list[position])

WebService


In [38]:
key_list

['Bhuvan',
 'Aadhaar',
 'MGNREGA',
 'NRSC',
 'Bhuvan2D',
 'Geospatial',
 'Yuktdhara',
 'Bhuvan3D',
 'WBIS',
 'RUSA',
 'GI',
 'Flycatch',
 'NCERT',
 'GIS',
 'Tourism',
 'Hpfor',
 'TelFor',
 'PunFor',
 'UkFor',
 'KarFor',
 'NDEM',
 'Thematic',
 'Agro',
 'PMKSY',
 'Heatwave',
 'Ganga',
 'NARL',
 'HarState',
 'Saras',
 'Satell',
 'PMJVK',
 'Anganwadi',
 'Covid',
 'Organiz',
 'NOEDA',
 'RBI',
 'Geoportal',
 'AmTourism',
 'LudMun',
 'Toll',
 'KALAMTARI',
 'Geomorph',
 'AgroPortal',
 'DeltaPortal',
 'CDMAPortal',
 'AndhraSat',
 'Multilingual',
 'GeoTaggingLight',
 'GeoTaggingAgri',
 'AIBP',
 'MahaWater',
 'PMGSY',
 'Collab',
 'PunHer',
 'PunGIS',
 'LudMunCollab',
 'AIBPCollab',
 'IMD',
 'Register',
 '2DHelp',
 'BhuvanFind',
 'Admin',
 'NavMap',
 'Drawing',
 'AddPoint',
 'AddLine',
 'AddPolygon',
 'Area',
 'Distance',
 'Pan',
 'Land',
 'Weather2D',
 'Ocean2D',
 'Disaster2D',
 '3Don2D',
 'Dissemination',
 'Visualization',
 'Statistics',
 'Analysis',
 'Metadata',
 'WebService',
 'Layers',
 'Info