In [None]:
# Install the HuggingFace transformers and datasets packages
!pip install transformers datasets

Tamil hate speech classification - dataset

In [None]:
#import datasets from local colab drive
#import transformers
#from datasets import load_dataset
#dataset=load_dataset('csv',data_files={'train':'sample_data/train.csv','test':'sample_data/eval.csv'})

In [None]:
# Mount Google Drive at /content/drive/ so the CSV files stored there can be read.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import transformers
from datasets import load_dataset

# Read the train and eval CSV splits from the mounted Google Drive folder.
csv_splits = {
    'train': 'drive/MyDrive/train.csv',
    'eval': 'drive/MyDrive/eval.csv',
}
dataset = load_dataset('csv', data_files=csv_splits)

# New Section

In [None]:
# Spot-check a single record of the train split (row 201).
dataset['train'][201]

In [None]:
# Spot-check a single record of the eval split (row 101).
dataset['eval'][101]

In [None]:
# List the column names available in the train split.
dataset['train'].column_names

In [None]:
# Number of rows in the train split.
dataset['train'].num_rows

In [None]:
# Tokenization with the pre-trained cased BERT tokenizer: each text is split
# into (sub-)word tokens and mapped to ids for the model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padding/truncating to 100 tokens."""
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=100,
    )
    return encoded


# Preview the encoding of the first two training rows.
tokenize_function(dataset["train"][:2])

In [None]:
# Apply tokenize_function to every split in batches, then show the resulting splits.
tokenized_datasets=dataset.map(tokenize_function,batched=True)
print(tokenized_datasets)

In [None]:
# Shuffle each split with a fixed seed and keep the first 500 rows of each.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
shuffled_eval = tokenized_datasets['eval'].shuffle(seed=42)
train_dataset = shuffled_train.select(range(500))
eval_dataset = shuffled_eval.select(range(500))

# Show the raw text of one sampled training row.
sample_text = train_dataset['text'][201]
print(sample_text)

In [None]:
# Decode one row of token ids back to text to sanity-check the tokenization.
sample_ids = tokenized_datasets['train']['input_ids'][601]
decode_sentence = tokenizer.decode(sample_ids)
print(decode_sentence)

In [None]:
# Tokenization with the pre-trained XLM-RoBERTa tokenizer: each text is split
# into (sub-)word tokens and mapped to ids for the model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padding/truncating to 50 tokens."""
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=50,
    )
    return encoded


# Preview the encoding of the first two training rows.
tokenize_function(dataset["train"][:2])

In [None]:
# input_ids are the only required parameter to be passed to the model as input.

# Apply tokenize_function to every split in batches, then show the resulting
# splits and the raw text of train row 201.
tokenized_datasets=dataset.map(tokenize_function,batched=True)
print(tokenized_datasets)
print(tokenized_datasets['train']['text'][201])

In [None]:
# The tokens are either words or sub-words; they are converted into ids that
# the model understands.
# Shuffle each split with a fixed seed and keep the first 500 rows of each.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
shuffled_eval = tokenized_datasets['eval'].shuffle(seed=42)
train_dataset = shuffled_train.select(range(500))
eval_dataset = shuffled_eval.select(range(500))

# Show the raw text of one sampled training row.
sample_text = train_dataset['text'][201]
print(sample_text)

In [None]:
# Token ids of sampled training row 201.
print(train_dataset['input_ids'][201])

In [None]:
# Decode the token ids of sampled row 201 back to text as a round-trip check.
row_ids = train_dataset['input_ids'][201]
decode_sentence = tokenizer.decode(row_ids)
print(decode_sentence)

In [None]:
# Attention mask of sampled row 201: a binary tensor indicating the positions
# of the padded indices so that the model does not attend to them.
print(train_dataset['attention_mask'][201])

In [None]:
# Fine-tuning setup: load the TensorFlow XLM-RoBERTa sequence classifier with
# a 2-label head; the tokenized inputs feed this model for the downstream task.

import tensorflow as tf
from transformers import TFXLMRobertaForSequenceClassification

# Alternative loader (not used): TFAutoModelForSequenceClassification with the
# same checkpoint and num_labels=2.
checkpoint = 'jplu/tf-xlm-roberta-base'
model = TFXLMRobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Drop the raw 'text' column and have the datasets return TensorFlow tensors.
tf_train = train_dataset.remove_columns(['text'])
tf_train = tf_train.with_format('tensorflow')
tf_eval = eval_dataset.remove_columns(['text'])
tf_eval = tf_eval.with_format('tensorflow')

In [None]:
# Turn the train/eval datasets into batched tf.data pipelines of
# (model inputs, labels) for the TF-based classifier.
input_names = tokenizer.model_input_names

train_features = {name: tf_train[name] for name in input_names}
train_labels = tf_train['labels']
train_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, train_labels))
    .shuffle(len(tf_train))
    .batch(8)
)

eval_features = {name: tf_eval[name] for name in input_names}
eval_labels = tf_eval['labels']
eval_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((eval_features, eval_labels))
    .shuffle(len(tf_eval))
    .batch(8)
)

In [None]:
# Compile the classifier and fine-tune it for two epochs, validating on the
# eval pipeline after each epoch.

adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits, hence from_logits=True.
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=adam, loss=cross_entropy, metrics=accuracy)
model.fit(
    train_tf_dataset,
    validation_data=eval_tf_dataset,
    epochs=2,
)


In [None]:
# Prepare the held-out test split for inference with the fine-tuned model:
# load the CSV, tokenize it, and build a batched tf.data pipeline.
dataset = load_dataset('csv', data_files={'test': 'drive/MyDrive/test1.csv'})
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padding/truncating to 100 tokens."""
    # NOTE(review): the XLM-R training tokenization in this notebook uses
    # max_length=50 while this one uses 100 -- confirm the difference is intended.
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=100)


tokenized_datasets = dataset.map(tokenize_function, batched=True)
test_dataset = tokenized_datasets['test']
tf_test = test_dataset.remove_columns(['text']).with_format('tensorflow')
test_features = {x: tf_test[x] for x in tokenizer.model_input_names}
# NOTE(review): the label column is read as 'label' here but as 'labels' for the
# train/eval splits -- confirm this matches the CSV headers.
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features, tf_test['label']))
# Do NOT shuffle the test pipeline: the predictions are compared against
# tf_test['label'] in its stored order, so shuffling here would misalign
# predictions and references and make the reported accuracy meaningless.
test_tf_dataset = test_tf_dataset.batch(8)


In [None]:
# Run the model over the test batches and keep the 'logits' entry of its output.
preds = model.predict(test_tf_dataset)['logits']
# Predicted class = index of the largest logit along axis 1.
class_preds = tf.math.argmax(preds,axis=1)

In [None]:
# Show the predicted class index for every test row.
print(class_preds)

In [None]:
# Accuracy of the predicted classes against the test labels.
# class_preds must be aligned row-for-row with tf_test['label'] for this score
# to be meaningful.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases (metrics
# moved to the separate `evaluate` package) -- confirm the installed version
# still provides it.
from datasets import load_metric
metric = load_metric('accuracy')
metric.compute(predictions=class_preds,references=tf_test['label'])