### Note:
This code, which applies a fine-tuned XLNet model for hate speech detection, is based on the following example, published on Medium: <br>

link to article: https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85 <br>
author: Shanay Ghag <br>
published at: Jun 16, 2020 <br>
link to GitHub: https://github.com/shanayghag/Sentiment-classification-using-XLNet <br>

In [None]:
import pandas as pd
import re
import torch
import sentencepiece
from transformers import XLNetForSequenceClassification
from transformers import XLNetTokenizer
from keras.preprocessing.sequence import pad_sequences
import torch.nn.functional as F

# Define Functions

In [None]:
# define function for text preprocessing
def prepare_text(text):
    """Reduce a raw tweet to alphanumeric words separated by single spaces.

    Steps, in order:
      1. replace @user mentions with a space,
      2. replace links with a space (the whole URL up to the next whitespace,
         so hyphens, query strings etc. do not leave fragments behind),
      3. replace every run of non-alphanumeric characters (smileys, '#',
         punctuation, tabs, repeated blanks, ...) with one space.

    Leading/trailing single spaces are kept as they are; callers that need a
    trimmed string have to strip it themselves.

    Args:
        text: the raw tweet text (str).

    Returns:
        The cleaned text (str).
    """
    text = re.sub(r"@[A-Za-z0-9_]+", ' ', text)  # remove @user
    text = re.sub(r"https?://\S+", ' ', text)  # remove links, including path/query characters
    # one pass covers smileys, hash signs, tabs and repeated whitespace:
    # every run of non-alphanumeric characters collapses into a single space
    text = re.sub(r"[^A-Za-z0-9]+", ' ', text)
    return text

In [None]:
# define function for hate speech prediction
def predict_hate(text):
    """Classify one (already preprocessed) text with the fine-tuned XLNet model.

    Relies on the notebook-level globals ``tokenizer``, ``model``, ``device``,
    ``MAX_LEN`` and ``class_names``.

    Args:
        text: the text to classify (str).

    Returns:
        A one-row ``pandas.DataFrame`` with the columns
        ``positive_score`` (softmax probability of class index 1),
        ``negative_score`` (softmax probability of class index 0),
        ``text`` (the input text) and
        ``hate_value`` (``class_names`` entry of the most likely class).
    """
    encoded_review = tokenizer.encode_plus(text,
                                           max_length=MAX_LEN,
                                           truncation=True,
                                           add_special_tokens=True,
                                           return_token_type_ids=False,
                                           padding=False,
                                           return_attention_mask=True,
                                           return_tensors='pt',)

    input_ids = encoded_review['input_ids']
    attention_mask = encoded_review['attention_mask']

    # Pad manually on the right with zeros up to MAX_LEN. This reproduces what
    # the earlier keras pad_sequences(padding="post") call produced, so the
    # model receives the same input layout as before.
    # NOTE(review): tokenizer-side padding may use a different padding side /
    # pad id - confirm against the training notebook before switching to it.
    pad_len = MAX_LEN - input_ids.shape[1]
    if pad_len > 0:
        input_ids = F.pad(input_ids, (0, pad_len), value=0)
        attention_mask = F.pad(attention_mask, (0, pad_len), value=0)

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # inference only: no autograd graph needed
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # first output element holds the logits; take the single sequence of the batch
    logits = outputs[0][0].cpu()

    probs = F.softmax(logits, dim=-1).tolist()
    _, prediction = torch.max(logits, dim=-1)

    result = {'positive_score': probs[1],
              'negative_score': probs[0],
              'text': text,
              'hate_value': class_names[int(prediction)]}
    # build the row directly (DataFrame.append no longer exists in pandas >= 2.0)
    return pd.DataFrame([result],
                        columns=['positive_score', 'negative_score', 'text', 'hate_value'])

# Prepare Data

In [None]:
# column types used when reading the tweet dataset:
# identifiers and free-text columns stay strings, the two scores are floats
dtype = {
    'text': str,
    'id': str,
    'tweet_id': str,
    'title': str,
    'outlet': str,
    'twitter_handle': str,
    'article_url': str,
    'adfontes_url': str,
    'bias_score': float,
    'reliability_score': float,
}

# note: this script was implemented using Kaggle's GPU. The dataset 'all_tweets_final'
# was loaded into the Kaggle repo and accessed from there
all_tweets = pd.read_csv(
    '../input/all-tweets-final/all_tweets_final.csv',
    dtype=dtype,
)

In [None]:
# run every tweet text through prepare_text and keep the cleaned version in its own column
raw_texts = all_tweets['text']
all_tweets['text_prepared'] = raw_texts.map(prepare_text)

In [None]:
# bare expression: in a notebook this shows the DataFrame as the cell output
all_tweets

# Load XLNet & Define Parameters

In [None]:
# choose the compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [None]:
# build the XLNet sequence classifier with two output labels and move it to the selected device
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels=2,
).to(device)

In [None]:
# the model is only used for prediction in this notebook, so keep it in evaluation mode
model.eval()
# load the fine-tuned weights; map_location=device remaps the stored tensors so that
# a checkpoint saved on a GPU can also be loaded when the notebook runs on a CPU
model.load_state_dict(torch.load('../input/new-hate-model/xlnet_model_hate.bin', map_location=device))

In [None]:
# load the tokenizer of the same 'xlnet-base-cased' checkpoint the model was created from
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [None]:
# notebook-wide settings used by predict_hate
# label names, indexed by the predicted class id: 0=non-hate; 1=hate
class_names = ['non-hate', 'hate']
# length (in tokens) every encoded text is truncated / padded to
MAX_LEN = 512

# Run for all Comments

In [None]:
tweets = all_tweets['text_prepared'].tolist() # 175807 comments

# Classify every tweet, then concatenate the one-row frames in a single step.
# Appending inside the loop copies the whole result frame on every iteration
# (quadratic in the number of tweets), and DataFrame.append no longer exists
# in pandas >= 2.0.
frames = [predict_hate(tweet) for tweet in tweets]

# pd.concat raises on an empty list, so fall back to an empty frame in that case
results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# put the tweet ids in front of the prediction columns (as the first column, named "id");
# the ids are taken from all_tweets in row order
tweet_ids = all_tweets['id'].tolist()
results.insert(0, "id", tweet_ids)

In [None]:
# reset_index returns a new frame instead of changing `results` in place,
# so assign it back to actually keep the fresh 0..n-1 index
results = results.reset_index(drop=True)
results

In [None]:
# note: this script was implemented using Kaggle's GPU. The classified hate dataset
# was saved into the Kaggle repo and downloaded manually
# write the predictions with a header row and without the index column
results.to_csv('all_tweets_hate.csv', index=False)