In [1]:
!pip install transformers datasets

[0m

In [4]:
import numpy as np
import warnings
from transformers import (
    TFAutoModelForSequenceClassification, 
    AutoTokenizer,
    DataCollatorWithPadding
)
from datasets import load_dataset

# Ignore every Python warning raised from this point on.
warnings.filterwarnings('ignore')

In [5]:
!pip install gdown
!gdown --id 1mk8-xfc-8aq6dHRmKMRc_rnupcHcYwCl

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
Downloading...
From: https://drive.google.com/uc?id=1mk8-xfc-8aq6dHRmKMRc_rnupcHcYwCl
To: /kaggle/working/dataset.csv
100%|███████████████████████████████████████| 13.0M/13.0M [00:00<00:00, 167MB/s]


In [6]:
# Read the downloaded CSV into a DatasetDict (everything lands in the 'train' split).
csv_path = '/kaggle/working/dataset.csv'
df = load_dataset('csv', data_files=csv_path)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-46c3161adce4f42b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-46c3161adce4f42b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Hold out 20% of the rows. The fixed seed makes the split repeatable after a
# kernel restart, so reported metrics refer to the same examples every run.
df_t = df['train'].train_test_split(test_size=.2, seed=42)

In [8]:
# Halve the hold-out set into two equally sized parts (test and validation).
# Seeded so the two parts are the same on every run.
df_v = df_t['test'].train_test_split(test_size=.5, seed=42)

In [9]:
# Store the final splits in the DatasetDict: the large part of the first split for
# training, and the two halves of the hold-out as test and validation.
df.update({
    'train': df_t['train'],
    'test': df_v['train'],
    'validation': df_v['test'],
})

In [10]:
df

DatasetDict({
    train: Dataset({
        features: ['Text', 'language'],
        num_rows: 17600
    })
    test: Dataset({
        features: ['Text', 'language'],
        num_rows: 2200
    })
    validation: Dataset({
        features: ['Text', 'language'],
        num_rows: 2200
    })
})

Clean the dataset  
Remove the Latin letters and digits from the Chinese and Japanese texts.

In [11]:
# Distinct language names present in the training split. np.unique returns them
# sorted, and their position in this array is what the label ids are built from.
langs = np.unique(df['train']['language'])
langs

array(['Arabic', 'Chinese', 'Dutch', 'English', 'Estonian', 'French',
       'Hindi', 'Indonesian', 'Japanese', 'Korean', 'Latin', 'Persian',
       'Portugese', 'Pushto', 'Romanian', 'Russian', 'Spanish', 'Swedish',
       'Tamil', 'Thai', 'Turkish', 'Urdu'], dtype='<U10')

In [12]:
# Two-way lookup between integer class ids and language names.
id2label = dict(enumerate(langs))
label2id = {name: idx for idx, name in id2label.items()}

In [13]:
label2id[df['train']['language'][0]]

13

In [14]:
import re

#clean the database
def clean(example):
  """Strip ASCII letters and digits from Chinese and Japanese texts.

  Accepts either a single example (``example['language']`` is a string) or a
  batch as produced by ``map(..., batched=True)`` (every value is a list of
  equal length). The earlier version only handled the single-example form:
  in batched mode it compared the whole list of languages against the
  language names, which never matched, so no text was ever cleaned.

  Args:
    example: mapping with 'Text' and 'language' entries (scalars or parallel lists).

  Returns:
    The same mapping, with 'Text' cleaned wherever the language is Chinese or Japanese.
  """
  pattern = r'[a-zA-Z0-9]'
  targets = ('Chinese', 'Japanese')
  language = example['language']

  if isinstance(language, str):
    # Single-example mode.
    if language in targets:
      example['Text'] = re.sub(pattern, '', example['Text'])
    return example

  # Batched mode: clean row by row and leave every other language untouched.
  example['Text'] = [
      re.sub(pattern, '', text) if lang in targets else text
      for text, lang in zip(example['Text'], language)
  ]
  return example

#map each label to its id 
def lang_to_id(example):
  """Swap the language name of one example for its integer id from `label2id`."""
  language_name = example['language']
  example['language'] = label2id[language_name]
  return example

In [15]:
# `clean` runs in batched mode, so it receives column lists rather than single
# values; `lang_to_id` then runs once per example.
# NOTE(review): not idempotent - after one run 'language' holds ids, so running
# this cell a second time makes the label2id lookup fail.
df = df.map(clean, batched=True).map(lang_to_id)

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/17600 [00:00<?, ?ex/s]

  0%|          | 0/2200 [00:00<?, ?ex/s]

  0%|          | 0/2200 [00:00<?, ?ex/s]

In [28]:
class Config:
  """Model and training settings used throughout the notebook."""
  # Pretrained checkpoint used for both the tokenizer and the classifier.
  checkpoint = 'xlm-roberta-base'
  # Number of target classes (one per language).
  num_labels = 22
  # Sequence length used for truncation and padding.
  max_length = 331
  # Optimisation settings.
  batch_size = 8
  epochs = 4
  learning_rate = 2e-5

In [17]:
# Load the tokenizer that belongs to the checkpoint selected in Config.

tokenizer = AutoTokenizer.from_pretrained(Config.checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [18]:
# Length of every training text, measured in whitespace-separated words.
# NOTE(review): these are word counts, whereas the tokenizer's max_length is
# applied to sub-word tokens - confirm the limit in Config is what is intended.
_len = list(map(lambda text: len(text.split()), df['train']['Text']))
max_length, average_len = max(_len), np.mean(_len)
print(f'max : {max_length}', f'mean : {average_len}', sep='\n')

max : 331
mean : 53.32028409090909


In [19]:
def tokenize(example):
  """Encode the 'Text' field, truncating to at most Config.max_length tokens."""
  text = example['Text']
  return tokenizer(text, truncation=True, max_length=Config.max_length)

In [20]:
# Tokenize in batches: the tokenizer accepts a list of texts per call, which gives
# the same per-example encodings (no padding is applied here) with far fewer calls
# than mapping one row at a time.
df = df.map(tokenize, batched=True)

  0%|          | 0/17600 [00:00<?, ?ex/s]

  0%|          | 0/2200 [00:00<?, ?ex/s]

  0%|          | 0/2200 [00:00<?, ?ex/s]

In [21]:
df.column_names

{'train': ['Text', 'language', 'input_ids', 'attention_mask'],
 'test': ['Text', 'language', 'input_ids', 'attention_mask'],
 'validation': ['Text', 'language', 'input_ids', 'attention_mask']}

In [22]:
# Rename the target column from 'language' to 'labels' in every split.
df = df.rename_column('language', 'labels')

In [23]:
# Collator that pads every example of a batch up to Config.max_length.
datacollator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=Config.max_length,
)

In [24]:
def _to_tf(split, shuffle):
  """Convert one split of `df` into a batched tf.data.Dataset.

  Args:
    split: name of the split in `df` ('train', 'test' or 'validation').
    shuffle: whether the examples are shuffled.

  Returns:
    A dataset yielding 'input_ids', 'attention_mask' and 'labels' in batches of
    Config.batch_size, collated with `datacollator`.
  """
  return df[split].to_tf_dataset(
      batch_size=Config.batch_size,
      columns=['input_ids', 'attention_mask', 'labels'],
      shuffle=shuffle,
      collate_fn=datacollator,
  )

# Only the training data is shuffled; test and validation keep their order so the
# predictions can be lined up with the labels afterwards.
train_ds = _to_tf('train', shuffle=True)
test_ds = _to_tf('test', shuffle=False)
val_ds = _to_tf('validation', shuffle=False)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
# Print a single training batch to inspect its keys and padding.
for sample_batch in train_ds.take(1).as_numpy_iterator():
  print(sample_batch)

{'labels': array([12, 17, 18,  0, 15,  5, 11,  2]), 'input_ids': array([[     0,   1152,   2408, ...,      1,      1,      1],
       [     0,    193,   3911, ...,      1,      1,      1],
       [     0, 100789,  89306, ...,      1,      1,      1],
       ...,
       [     0,     95,  54427, ...,      1,      1,      1],
       [     0,   2977,  79077, ..., 162593,  61724,      2],
       [     0,     72,     83, ...,      1,      1,      1]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0]])}


# Modeling

In [26]:
# Pretrained encoder with a classification head sized for the language labels;
# the id/name mappings are passed along with the model configuration.
model = TFAutoModelForSequenceClassification.from_pretrained(
    Config.checkpoint,
    num_labels=Config.num_labels,
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFXLMRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
model.summary()

Model: "tfxlm_roberta_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFXLMRobertaMainLa  multiple                 277453056 
 yer)                                                            
                                                                 
 classifier (TFXLMRobertaCla  multiple                 607510    
 ssificationHead)                                                
                                                                 
Total params: 278,060,566
Trainable params: 278,060,566
Non-trainable params: 0
_________________________________________________________________


In [29]:
import tensorflow as tf
from tensorflow.keras.optimizers import AdamW

# No loss is passed to compile(), so the model's internal loss computation is used.
optimizer = AdamW(learning_rate=Config.learning_rate)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [30]:
# Fine-tune for Config.epochs epochs, evaluating on the validation split after each one.
model.fit(train_ds, epochs=Config.epochs, validation_data=val_ds)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7b9b86e88f10>

In [31]:
from sklearn.metrics import classification_report

In [32]:
# Predict on the test batches; the returned object exposes the raw class scores as `.logits`.
tst_prds = model.predict(test_ds)



In [33]:
# Predicted class id per test example: the index of the highest logit in each row.
tst = tst_prds.logits.argmax(axis=1)

In [34]:
# Gather the ground-truth ids in the same order as the predictions (test_ds is not shuffled).
# The per-batch label arrays are concatenated directly: stacking them first (np.squeeze on
# the list) only works when every batch has the same length and there is more than one batch.
true_labels = np.concatenate(
    [batch['labels'] for batch in test_ds.as_numpy_iterator()],
    axis=0,
)
true_labels

array([ 6,  1, 19, ..., 19,  5, 12])

In [35]:
# Per-class precision / recall / F1 on the test split (rows are class ids).
report = classification_report(y_true=true_labels, y_pred=tst)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       103
           1       1.00      0.99      0.99        86
           2       0.97      1.00      0.98        94
           3       0.96      1.00      0.98       108
           4       1.00      0.98      0.99        96
           5       1.00      0.96      0.98        91
           6       1.00      0.99      0.99       100
           7       1.00      0.99      0.99        94
           8       1.00      0.99      0.99        98
           9       1.00      0.99      1.00       106
          10       0.88      1.00      0.94        94
          11       1.00      1.00      1.00        98
          12       0.98      0.97      0.97        98
          13       0.99      0.95      0.97       110
          14       1.00      1.00      1.00       118
          15       0.98      0.99      0.99       105
          16       1.00      0.97      0.99       104
          17       1.00    

In [38]:
# Save the fine-tuned model to the Kaggle working directory.
# NOTE(review): only the model is saved here, not the tokenizer - confirm that
# whoever reloads it will take the tokenizer from Config.checkpoint.
model.save_pretrained('/kaggle/working')