<a href="https://colab.research.google.com/github/HarriG109/Dissertation/blob/main/Predictions_BERT_OLID_UNT_TIN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Running Predictions with Finalised Model

Check available GPU:

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the expected device string is treated as "no GPU":
# stop here with an error instead of continuing.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

In [None]:
import torch

# Choose the PyTorch device once: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the one at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

Install transformers package:

In [None]:
!pip install transformers

Mount Google Drive:

In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive; later cells read the saved model
# and the input data from there and move the predictions back onto it.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Load the saved tokenizer:

In [None]:
from transformers import BertTokenizer

# Directory on the mounted Drive that holds the saved files; the same name is
# reused when the model is loaded.
output_dir = "/content/drive/MyDrive/Model/BERT Model/UNTTIN"

# Announce the step, then load the tokenizer from that directory.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(output_dir)

Load the saved BERT model:

In [None]:
from transformers import BertForSequenceClassification, BertConfig

# Load the saved BertForSequenceClassification model (BERT with a single
# linear classification layer on top) from the same directory as the tokenizer.
model = BertForSequenceClassification.from_pretrained(output_dir)

# Move the model to the device selected earlier in the notebook (GPU when
# available, otherwise CPU) instead of calling .cuda() unconditionally,
# which raises on a machine without CUDA.
model.to(device)

# Make inference mode explicit so dropout is disabled during prediction.
model.eval()

Import data for running predictions:

In [None]:
import pandas as pd

# Read the first-stage predictions. The file is parsed without a header row,
# so the two columns are named explicitly.
results = pd.read_csv(
    "/content/drive/MyDrive/Data/Reddit Data/Predictions_OFF/skyrim_predictions_all.csv",
    delimiter=',',
    header=None,
    names=['sentence', 'label_a'],
)

# Keep every row whose label is not 'NOT'; only those posts are passed on to
# the targeted/untargeted classifier.
label_a = results['label_a'].values
keep = label_a != 'NOT'
results_f = results.loc[keep]

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(len(results_f)))

# Texts to classify, as an array.
sentences = results_f['sentence'].values

# Class names, looked up by the predicted class id.
# NOTE(review): assumes id 0 = 'TIN' and id 1 = 'UNT' — confirm against the
# label encoding used when the model was trained.
target_names = ['TIN', 'UNT']

Create function for applying model:

In [None]:
def get_prediction(text):
    """Classify text with the loaded model and return its class name(s).

    Args:
        text: A single string, or a list of strings classified as one batch.

    Returns:
        The matching entry of ``target_names`` when ``text`` is a single
        string; otherwise a list with one entry per input string.
    """
    single = isinstance(text, str)

    # Tokenize (with padding and truncation, max_length=510) and place the
    # tensors on whatever device the model lives on, rather than assuming CUDA.
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=510, return_tensors="pt"
    ).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
        # outputs[0] holds the class scores; softmax over dimension 1 turns
        # them into per-class probabilities for each input row.
        probs = outputs[0].softmax(1)

    # Take the arg-max per row. Without dim=1 the tensor is flattened, which
    # only gives a valid class id when exactly one text was passed in.
    class_ids = probs.argmax(dim=1).tolist()
    labels = [target_names[class_id] for class_id in class_ids]
    return labels[0] if single else labels

Loop through the posts, applying the function to get a classification for each post:

In [None]:
import pandas as pd

# Classify every post first, then build the DataFrame in a single step.
# Appending one row at a time with .loc makes pandas redo work for each row
# as the frame grows.
predictions = [str(get_prediction(sent)) for sent in sentences]
final = pd.DataFrame(
    {'sent': list(sentences), 'unt_tin': predictions},
    columns=['sent', 'unt_tin'],
)

Output prediction results:

In [None]:
# Save the predictions as a CSV in the current working directory, writing
# neither a header row nor the index column.
final.to_csv(
    './skyrim_predictions_TIN.csv',
    header=False,
    index=False,
)

In [None]:
!mv "/content/skyrim_predictions_TIN.csv" "/content/drive/MyDrive/Data/Reddit Data/Predictions_TIN"