### Note:
This code for the fine-tuning process of XLNet for hate speech detection is based on the following example, published on Medium: <br>

link to article: https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85 <br>
author: Shanay Ghag <br>
published on: Jun 16, 2020 <br>
link to GitHub: https://github.com/shanayghag/Sentiment-classification-using-XLNet <br>

In [1]:
import pandas as pd
import re
import torch
import sentencepiece
from transformers import XLNetForSequenceClassification
from transformers import XLNetTokenizer
from keras.preprocessing.sequence import pad_sequences
import torch.nn.functional as F

# Define Functions

In [2]:
# define function for text preprocessing
def prepare_text(text):
    """Reduce a raw tweet to ASCII letters/digits separated by single spaces.

    Steps, in order:
      1. replace ``@user`` mentions with a space,
      2. replace ``http(s)://`` links with a space,
      3. replace every run of characters other than ASCII letters and digits
         (emojis, punctuation, '#', tabs, newlines, repeated blanks) with
         exactly one space.

    Step 3 already covers what the earlier separate "smileys", '#', tab and
    multi-whitespace substitutions did, so those are no longer spelled out;
    the output for string input is unchanged.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a float NaN
        coming from a missing CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so a single leading and/or
        trailing space may remain (e.g. where a mention started the tweet).
    """
    if not isinstance(text, str):
        # missing value: nothing to clean, and re.sub would raise TypeError
        return ''

    text = re.sub(r"@[A-Za-z0-9_]+", ' ', text)  # remove @user
    # NOTE: the link pattern only accepts letters, digits, '.' and '/' after the
    # scheme; any other URL character (e.g. '-', '?', '=') ends the match and
    # the remainder of the link is kept as ordinary text.
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)  # remove links
    # collapse everything that is not an ASCII letter or digit into one space
    text = re.sub(r"[^A-Za-z0-9]+", ' ', text)
    return text

In [3]:
# define function for hate speech prediction
def predict_hate(text, threshold=0.99):
    """Classify one (preprocessed) text as hate / non-hate with the loaded XLNet model.

    Uses the notebook-level globals ``tokenizer``, ``model``, ``device``,
    ``MAX_LEN`` and ``class_names``.

    Parameters
    ----------
    text : str
        The text to classify.
    threshold : float, default 0.99
        The text is labelled ``class_names[1]`` only if the softmax
        probability of class index 1 is at least this value; otherwise it is
        labelled ``class_names[0]``. The default is the value that used to be
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns
        ``positive_score`` (probability of class index 1),
        ``negative_score`` (probability of class index 0),
        ``text`` (the input text) and
        ``hate_value`` (the thresholded class name).
    """
    columns = ['positive_score', 'negative_score', 'text', 'hate_value']

    encoded_review = tokenizer.encode_plus(text,
                                           max_length=MAX_LEN,
                                           truncation=True,
                                           add_special_tokens=True,
                                           return_token_type_ids=False,
                                           padding=False,
                                           return_attention_mask=True,
                                           return_tensors='pt',)

    # Right-pad ids and mask with zeros up to MAX_LEN. This reproduces the
    # former keras pad_sequences(padding="post") call; tokenizer-side padding
    # is deliberately not used because it may pad on a different side / with a
    # different id and would change the inputs the model sees.
    pad_len = MAX_LEN - encoded_review['input_ids'].shape[-1]
    input_ids = F.pad(encoded_review['input_ids'], (0, pad_len), value=0).to(device)
    attention_mask = F.pad(encoded_review['attention_mask'], (0, pad_len), value=0).to(device)

    # inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # first output element holds the logits; take the single sequence in the batch
    logits = outputs[0][0].cpu()
    probs = F.softmax(logits, dim=-1).tolist()

    hatespeech_prob = probs[1]
    # a plain comparison always yields 0 or 1 (a NaN probability maps to 0)
    pred_thresh = int(hatespeech_prob >= threshold)

    result = {'positive_score': probs[1], 'negative_score': probs[0], 'text': text, 'hate_value': class_names[pred_thresh]}
    # build the one-row frame directly; DataFrame.append no longer exists in pandas 2.x
    return pd.DataFrame([result], columns=columns)

# Prepare Data

In [4]:
# read dataset with all tweets:
# text-like columns (including the numeric-looking ids) are parsed as str, the two score columns as float
dtype = {
    **dict.fromkeys(
        ['text', 'id', 'tweet_id', 'title', 'outlet', 'twitter_handle', 'article_url', 'adfontes_url'],
        str,
    ),
    **dict.fromkeys(['bias_score', 'reliability_score'], float),
}

# note: this script was implemented using Kaggle's GPU. The dataset 'all_tweets_final' was loaded into the Kaggle repo and accessed from there
all_tweets = pd.read_csv('../input/all-tweets/all_tweets_final.csv', dtype=dtype)

In [5]:
# clean every tweet with prepare_text and store the result next to the raw text
all_tweets['text_prepared'] = all_tweets['text'].map(prepare_text)

In [6]:
# display the frame to check the new 'text_prepared' column against the raw 'text'
all_tweets

Unnamed: 0,id,text,tweet_id,title,outlet,twitter_handle,article_url,adfontes_url,bias_score,reliability_score,text_prepared
0,1411899288750223368,@19thnews Cool. Never going back to work.\n\n#...,1411877241647206401,"COVID Delta variant puts men, people of color ...",19th News,19thnews,https://19thnews.org/2021/07/the-covid-delta-v...,https://adfontesmedia.com/19th-news-bias-and-r...,-1.00,48.00,Cool Never going back to work maskedforever
1,1327396633118695425,"Bass: ""In 30 minutes, we have a phone call wit...",1327395318825181189,Karen Bass addresses the question of filling K...,19th News,19thnews,https://19thnews.org/2020/11/karen-bass-addres...,https://adfontesmedia.com/19th-news-bias-and-r...,-6.00,44.83,Bass In 30 minutes we have a phone call with t...
2,1327447530100191233,Q: “Do you think that [Harris’s] vacancy shoul...,1327395318825181189,Karen Bass addresses the question of filling K...,19th News,19thnews,https://19thnews.org/2020/11/karen-bass-addres...,https://adfontesmedia.com/19th-news-bias-and-r...,-6.00,44.83,Q Do you think that Harris s vacancy should be...
3,1328117283357552640,Excellent choice as is @BLeeForCongress. I fee...,1328091662468460551,Karen Bass addresses the question of filling K...,19th News,19thnews,https://19thnews.org/2020/11/karen-bass-addres...,https://adfontesmedia.com/19th-news-bias-and-r...,-6.00,44.83,Excellent choice as is I feel so lucky as a Ca...
4,1331090575613861889,"She’d be great, @GavinNewsom. https://t.co/TKr...",1331090228489162752,Karen Bass addresses the question of filling K...,19th News,19thnews,https://19thnews.org/2020/11/karen-bass-addres...,https://adfontesmedia.com/19th-news-bias-and-r...,-6.00,44.83,She d be great
...,...,...,...,...,...,...,...,...,...,...,...
175802,1278768989968236545,"@zerohedge She doesn't know it yet, but she is...",1278765704028868609,"Ghislaine Maxwell Arrested, May Be Sent To Sam...",ZeroHedge,zerohedge,https://www.zerohedge.com/political/ghislane-m...,https://adfontesmedia.com/zerohedge-bias-and-r...,5.33,33.33,She doesn t know it yet but she is going to h...
175803,1278769750395543552,@zerohedge https://t.co/UGA3XMx9fD,1278765704028868609,"Ghislaine Maxwell Arrested, May Be Sent To Sam...",ZeroHedge,zerohedge,https://www.zerohedge.com/political/ghislane-m...,https://adfontesmedia.com/zerohedge-bias-and-r...,5.33,33.33,
175804,1278771968024760324,@zerohedge its all because of $NFLX,1278765704028868609,"Ghislaine Maxwell Arrested, May Be Sent To Sam...",ZeroHedge,zerohedge,https://www.zerohedge.com/political/ghislane-m...,https://adfontesmedia.com/zerohedge-bias-and-r...,5.33,33.33,its all because of NFLX
175805,1278767022873198593,@zerohedge Hope we get ALL the names...,1278765704028868609,"Ghislaine Maxwell Arrested, May Be Sent To Sam...",ZeroHedge,zerohedge,https://www.zerohedge.com/political/ghislane-m...,https://adfontesmedia.com/zerohedge-bias-and-r...,5.33,33.33,Hope we get ALL the names


# Load XLNet & Define Parameters

In [7]:
# select the compute device: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [8]:
# instantiate pretrained 'xlnet-base-cased' with a 2-label classification head and move it to the selected device
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2).to(device)

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [9]:
# load the fine-tuned weights into the model.
# map_location=device remaps the stored tensors onto the device selected above,
# so the checkpoint also loads when the notebook falls back to the CPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load('../input/xlnet-hate-model/xlnet_model_hate.bin', map_location=device)
)

<All keys matched successfully>

In [10]:
# load the tokenizer that belongs to the same 'xlnet-base-cased' checkpoint as the model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

In [11]:
# define variables
# MAX_LEN: sequence length in tokens; predict_hate truncates longer inputs to it and pads shorter ones up to it
MAX_LEN = 512
# class_names: label text per class index, looked up by predict_hate for the 'hate_value' column
class_names = ['non-hate', 'hate'] # 0=non-hate; 1=hate

# Run for all Comments

In [12]:
tweets = all_tweets['text_prepared'].tolist()  # one preprocessed text per tweet (about 175k in the original run)

# Classify every tweet and collect the rows as plain dicts, then build the
# frame once. Growing a DataFrame inside the loop copies it on every
# iteration (quadratic) and DataFrame.append no longer exists in pandas 2.x.
records = []
for tweet in tweets:
    records.extend(predict_hate(tweet).to_dict('records'))

# rows are numbered 0..n-1 in the order of `tweets`
results = pd.DataFrame(records)

In [13]:
# add tweet id as the first column (rows of `results` are in the same order as `all_tweets`)
# dropping a previously inserted 'id' first keeps the cell re-runnable: insert() raises if the column already exists
results = results.drop(columns='id', errors='ignore')
results.insert(loc=0, column="id", value=all_tweets['id'].tolist())

In [14]:
# renumber the rows 0..n-1 and keep the result: reset_index returns a new frame,
# so without the assignment `results` itself would keep its previous index
results = results.reset_index(drop=True)
results

Unnamed: 0,id,positive_score,negative_score,text,hate_value
0,1411899288750223368,0.000774,0.999226,Cool Never going back to work maskedforever,non-hate
1,1327396633118695425,0.944241,0.055759,Bass In 30 minutes we have a phone call with t...,non-hate
2,1327447530100191233,0.983609,0.016391,Q Do you think that Harris s vacancy should be...,non-hate
3,1328117283357552640,0.001051,0.998949,Excellent choice as is I feel so lucky as a Ca...,non-hate
4,1331090575613861889,0.002564,0.997436,She d be great,non-hate
...,...,...,...,...,...
175802,1278768989968236545,0.009136,0.990864,She doesn t know it yet but she is going to h...,non-hate
175803,1278769750395543552,0.000960,0.999040,,non-hate
175804,1278771968024760324,0.001706,0.998294,its all because of NFLX,non-hate
175805,1278767022873198593,0.001921,0.998079,Hope we get ALL the names,non-hate


In [15]:
# note: this script was implemented using Kaggle's GPU. The classified hate dataset was saved into the Kaggle repo and downloaded manually
# write the classification results with a header row and without the index column
results.to_csv('all_tweets_hate.csv', header=True, index=None)

In [16]:
# number of tweets labelled "hate" and their share of all classified tweets
hate_values = int((results['hate_value'] == "hate").sum())
hate_share = hate_values / len(results)
print(hate_values, hate_share)

10475 0.05958238295403482
