<a href="https://colab.research.google.com/github/ModithaSubasinghe/Analysis_and_Visualisation_Basics/blob/main/Transformers_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [None]:
# classifier = pipeline('sentiment-analysis')

In [None]:
# seq = pipeline(task="text-classification", model='nlptown/bert-base-multilingual-uncased-sentiment')

In [None]:
# Load the NER checkpoint and its matching tokenizer, then wrap both in a
# token-classification pipeline.
ner_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample sentence to tag.
seq = "I am Moditha, and I live in Matara. I am a Data Scientist, and I work at DataDisca."

# Print every token the pipeline flagged, next to its predicted entity tag.
for tagged_token in nlp(seq):
    print(tagged_token['word'], tagged_token['entity'])

# Fine-Tuning a Pretrained Model on a Custom Dataset Using Transformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch 

In [None]:
# Read only the two columns used below: the label ('author') and the input ('text').
wanted_columns = ['author', 'text']
df = pd.read_csv('dataset_01.csv', usecols=wanted_columns)
df

In [None]:
# Show how many rows each distinct author has (class balance check).
df['author'].value_counts()

In [None]:
# Pull the raw inputs and their author labels out of the frame as plain lists.
x = df["text"].tolist()
y = df["author"].tolist()

In [None]:
# Fit the encoder on the author names, then map every name to its integer id.
le = LabelEncoder()
le.fit(y)
y1 = list(le.transform(y))

In [None]:
# `le` was already fitted when the labels were encoded, so read the learned
# class names directly instead of fitting the encoder a second time.
# Position i in this array is the author name that was encoded as integer i.
label_index = le.classes_
label_index

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y1, test_size=0.2, random_state=0)

In [None]:
from transformers import BertTokenizerFast

# Rebinds `tokenizer` (previously the NER tokenizer) to the uncased BERT
# tokenizer used for the fine-tuning steps below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize both splits with truncation and padding enabled.
# NOTE(review): each call pads independently, so the train and test encodings
# presumably end up with different sequence lengths — confirm this is acceptable.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [None]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification

### Convert these encodings into Dataset objects

In [None]:
def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output with its labels as a sliced ``tf.data.Dataset``.

    Each element of the resulting dataset is a ``(features_dict, label)`` tuple.
    """
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _as_tf_dataset(train_encodings, y_train)
test_dataset = _as_tf_dataset(test_encodings, y_test)

In [None]:
from transformers import TFTrainingArguments,TFTrainer

# Hyperparameters and output locations for the TFTrainer run configured below.
training_args = TFTrainingArguments(
    output_dir='./result',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # NOTE(review): 500 warmup steps may exceed the total number of training
    # steps on a small dataset (2 epochs, batch size 4) — confirm against the
    # dataset size.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): eval_steps presumably only takes effect when step-based
    # evaluation is enabled; no evaluation strategy is set here — confirm.
    eval_steps = 2
)

# training_args = TrainingArguments(output_dir="test_trainer")

In [None]:
# Build the model inside the distribution strategy scope provided by the
# training arguments. This rebinds `model`, which earlier held the NER model.
with training_args.strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        # Size the classification head from the fitted label set rather than a
        # hard-coded 5, so it always matches the number of authors in the CSV.
        num_labels=len(label_index),
    )

# NOTE(review): TFTrainer is, as far as I know, deprecated in later
# transformers releases in favour of Keras `model.fit` — confirm against the
# installed version.
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run fine-tuning with the arguments and datasets the trainer was built with.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the held-out split.
trainer.evaluate(test_dataset)

In [None]:
# Inspect the full prediction result for the held-out split.
trainer.predict(test_dataset)

In [None]:
# Check the shape of the first element of the prediction result.
# Note: this calls predict() again rather than reusing the previous result.
trainer.predict(test_dataset)[0].shape


In [None]:
# Keep the first element of the prediction result for the steps below.
# NOTE(review): presumably per-class scores of shape (n_samples, n_labels),
# since the next step takes argmax over axis 1 — confirm.
output=trainer.predict(test_dataset)[0]

In [None]:
# Display the raw prediction array.
output

In [None]:
import numpy as np
# Collapse each row to the index of its largest value (the predicted class id).
# NOTE(review): this rebinds `output`, so re-running this cell without first
# re-running the predict cell would apply argmax to the already-reduced array.
output = np.argmax(output, axis = 1)

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the label set to every encoded class id so the matrix always has one
# row/column per entry of `label_index`, even when a class is missing from both
# the test labels and the predictions. Without this the matrix can be smaller
# than the display labels used when plotting.
# normalize='pred' scales each column (predicted class) to sum to 1.
cm = confusion_matrix(
    y_test,
    output,
    labels=list(range(len(label_index))),
    normalize='pred',
)
cm

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Render the confusion matrix with author names on the axes; x tick labels are
# rotated so longer names stay readable.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=label_index,
).plot(cmap='Greens', xticks_rotation=45)
plt.show()

In [None]:
# Report the micro-averaged F1 score of the predictions on the held-out split.
print(f"F1_score: {f1_score(y_test, output, average='micro')!s}")

In [None]:
# Persist the fine-tuned model under the 'predicted_model' path.
trainer.save_model('predicted_model')