<a href="https://colab.research.google.com/github/MAbuTalha/Research-Paper/blob/main/BertTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Stop early unless the runtime reports a GPU under the expected device string.
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Any other value (including an empty string) is treated as "no GPU".
  raise SystemError('GPU device not found')

In [None]:
import torch

# Check availability first: the per-device queries below raise on a CPU-only
# runtime, which would abort the cell before the availability flag is shown.
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))

    # Release cached allocator blocks that are no longer in use.
    torch.cuda.empty_cache()

In [None]:
# memory_summary() returns a multi-line string; print it so the table renders
# with real line breaks instead of a one-line repr full of "\n" escapes.
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
import pandas as pd
import numpy as np
# Suppress all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")
import re
import re
import string
import sklearn
import itertools


In [None]:
!pip install simpletransformers
from simpletransformers.classification import ClassificationModel


In [None]:
from simpletransformers.classification import ClassificationModel

In [None]:
from google.colab import files
# Upload files through google.colab; the result is indexed by file name in the
# loop over TestData.keys() in the next cell.
# NOTE(review): despite its name, TestData presumably has to contain both
# train.csv and test.csv, since both are read by name later — confirm.
TestData = files.upload()

In [None]:
# Report the name and size of every uploaded file.
for fn, payload in TestData.items():
  summary = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload))
  print(summary)

In [None]:
# Read train.csv (path relative to the current working directory) into train_df.
train_df =pd.read_csv('train.csv')

In [None]:
# Display the loaded training frame for a visual check.
train_df

In [None]:
# Read test.csv (path relative to the current working directory) into test_df.
test_df =pd.read_csv('test.csv')

In [None]:
# Display the loaded test frame for a visual check.
test_df

In [None]:
# Lookup table: lowercase contraction (or shorthand) -> expanded form.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "thx": "thanks",
}

In [None]:
def remove_contractions(text, mapping=None):
    """Expand every known contraction that occurs inside ``text``.

    Looking up only the whole string in the table would leave a sentence such
    as "I don't know" untouched, so each word-like token (a run of letters
    and apostrophes) is looked up on its own.

    Args:
        text: Input string. Non-string values (for example a missing CSV
            cell) are returned unchanged instead of raising.
        mapping: Optional dict of lowercase contraction -> expansion.
            Defaults to the module-level ``contractions`` table.

    Returns:
        ``text`` with each matching token replaced by its expansion; every
        other character is left as it was.
    """
    if not isinstance(text, str):
        return text
    if mapping is None:
        mapping = contractions

    def _expand(match):
        token = match.group(0)
        # Lookup is case-insensitive; unknown tokens pass through unchanged.
        return mapping.get(token.lower(), token)

    # The token pattern also covers keys that start with an apostrophe,
    # such as "'cause".
    return re.sub(r"[A-Za-z']+", _expand, text)

In [None]:
# Expand contractions in the text column of both frames (assigned back in place).
for frame in (train_df, test_df):
    frame.text = frame.text.apply(remove_contractions)

In [None]:
def clean_dataset(text):
    """Reduce one raw tweet-like string to space-separated alphanumeric tokens.

    Steps, in order: drop '#' markers, HTML entities, '$' tickers, URLs,
    retweet prefixes and @mentions; drop words of 1-4 characters; drop
    characters above U+FFFF; squeeze runs of 3+ identical characters to 2;
    pass the text through ``emoji.demojize``; finally replace everything that
    is not an ASCII letter or digit with a space and collapse the spacing.

    Relies on the module-level names ``re``, ``itertools`` and ``emoji``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Drop the '#' marker but keep the hashtag word itself.
    text = re.sub(r'#', '', text)
    # Remove HTML entities such as "&amp;" or "&lt;". Because they are gone
    # from here on, no later "&amp;" -> "and" style rewrite is needed.
    text = re.sub(r'&\w*;', '', text)
    # Remove tickers such as "$AAPL".
    text = re.sub(r'\$\w*', '', text)
    # Remove hyperlinks. Each URL ends at the next whitespace, so text that
    # sits between two URLs is kept (a greedy ".*" would swallow it).
    text = re.sub(r'https?://\S+', '', text)
    # Collapse runs of whitespace (including newlines) to ONE space, so the
    # neighbouring words stay separated.
    text = re.sub(r'\s{2,}', ' ', text)
    # Remove any remaining token that starts with "http".
    text = re.sub(r'http(\S)+', '', text)
    # NOTE(review): the dots are regex wildcards, so this also removes the
    # three characters that follow "http " — confirm that a literal "..."
    # was not intended.
    text = re.sub(r'http ...', '', text)
    # Remove retweet prefixes and @mentions.
    text = re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+', '', text)
    text = re.sub(r'RT[ ]?@', '', text)
    text = re.sub(r'@[\S]+', '', text)
    # Remove words with 4 or fewer characters.
    text = re.sub(r'\b\w{1,4}\b', '', text)
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # NOTE(review): this runs before demojize, so characters above U+FFFF
    # never reach it — confirm the order is intended.
    text = ''.join(c for c in text if c <= '\uFFFF')
    text = text.strip()
    # Squeeze any character repeated 3+ times down to 2 ("coool" -> "cool").
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    # Turn remaining emoji into their textual names, then drop the ':' marks.
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    # Replace runs of non-ASCII characters with a space.
    text = re.sub("([^\x00-\x7F])+", " ", text)
    # Keep only letters and digits; everything else becomes a separator and
    # the resulting extra spaces are collapsed.
    text = ' '.join(re.sub("[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text).split())
    return text

In [None]:
# The emoji package must already be installed in the runtime (e.g. via pip).
import emoji

# Run the cleaning function over the text column of both frames (in place).
for frame in (train_df, test_df):
    frame.text = frame.text.apply(clean_dataset)

In [None]:
# Pull the text column and the label column out of each cleaned frame.
X_train_clean, y_train_clean = train_df.text, train_df.label
X_test_clean, y_test_clean = test_df.text, test_df.label

In [None]:
# Put the cleaned text and its labels side by side as a two-column frame.
train_parts = [X_train_clean, y_train_clean]
train_df_clean = pd.concat(train_parts, axis="columns")
print("Shape of training data set: ", train_df_clean.shape)
print("View of data set: ", train_df_clean.head())

In [None]:
# Same layout for the test split: text and label as two columns.
eval_parts = [X_test_clean, y_test_clean]
eval_df_clean = pd.concat(eval_parts, axis="columns")
print("Shape of Eval data set: ", eval_df_clean.shape)

In [None]:
# Training options handed to ClassificationModel through its args parameter.
bert_train_args = {
    # Optimisation
    'learning_rate': 5e-5,
    'num_train_epochs': 1,
    'train_batch_size': 32,
    'fp16': False,
    'n_gpu': 1,
    # Evaluation while training
    'evaluate_during_training': True,
    'evaluate_during_training_steps': 100,
    'eval_batch_size': 64,
    'save_eval_checkpoints': False,
    # Logging and output
    'logging_steps': 100,
    'wandb_project': "Bert-Model-Test",
    # NOTE(review): Windows-style drive path; on a Linux/Colab runtime this is
    # presumably treated as a relative directory named "D:" — confirm.
    'output_dir': 'D:/Output-Bert-Test',
    'overwrite_output_dir': True,
}

In [None]:
# Build the two-label 'bert-base-cased' classifier on CUDA device 0, configured
# with bert_train_args.
model_BertTest = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=2,
    use_cuda=True,
    cuda_device=0,
    args=bert_train_args,
)

In [None]:
# Train on the cleaned training frame; eval_df_clean is passed as eval_df.
# NOTE(review): eval_df_clean is built from test.csv, so the same rows are used
# for evaluation during training and for the final scores — confirm that this
# is intended.
model_BertTest.train_model(train_df_clean, eval_df=eval_df_clean)

In [None]:
# Evaluate on eval_df_clean and unpack the three returned values.
# model_outputs is consumed by the argmax loop that computes the metrics.
result, model_outputs, wrong_predictions = model_BertTest.eval_model(eval_df_clean)

In [None]:
from sklearn.metrics import f1_score,accuracy_score,classification_report, confusion_matrix

In [None]:
# Predicted class = index of the largest entry in each model output.
predictions = [np.argmax(scores) for scores in model_outputs]

true_labels = eval_df_clean['label']
print('f1 score:', f1_score(true_labels, predictions))
print('Accuracy score:', accuracy_score(true_labels, predictions))

# **RoBERTa**