# Neural Networks and Deep Learning Project

## XLNet Model for detecting Cyberbullying Tweets

### Team Members:

Hemanth Chenna

Srinivas Akhil Mallela

### Installs and imports

In [None]:
!pip install simpletransformers

In [None]:
!pip install emoji

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval
import tensorflow as tf
from sklearn.model_selection import train_test_split
import os

### Reading data from CSV

In [None]:
# Load the raw dataset from the Kaggle-style input directory.
df = pd.read_csv('../input/nndl-data/cyberbullying_tweets.csv', sep=',')
# Give the columns short, fixed names used by the rest of the notebook.
# NOTE(review): this assumes the CSV has exactly two columns, presumably the
# tweet text followed by the class name -- confirm against the file.
df.columns = ['tweet', 'label']

Renaming the labels of all data as non-negative integers to make them compatible with the SimpleTransformers module

In [None]:
def smush_labels(label):
    """Translate a class name into its integer id.

    The five names listed in ``known`` are numbered by position (0-4).
    Every other value -- 'ethnicity' included -- yields 5.
    """
    known = ('not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age')
    for class_id, class_name in enumerate(known):
        if label == class_name:
            return class_id
    return 5  # ethnicity, and anything not listed above

In [None]:
# Encode the class names as integer ids.
# The guard keeps this cell safe to re-run: smush_labels() sends every value it
# does not recognise to 5, so applying it to an already-encoded column would
# silently turn all labels into 5.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = df['label'].apply(smush_labels)

### Logging and GPU operations

In [None]:
# Logging setup: cap the "transformers" logger at WARNING so its INFO records
# are dropped, then configure the root logger at INFO for everything else.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

logging.basicConfig(level=logging.INFO)

In [None]:
# check gpu
# The flag is passed to ClassificationModel(use_cuda=...) further down, so the
# model is only placed on the GPU when PyTorch reports one.
cuda_available = torch.cuda.is_available()
print('Cuda available? ',cuda_available)

In [None]:
# Get the GPU device name.
# NOTE(review): device_name is not referenced again in the visible notebook, and
# this call appears to be the only use of the TensorFlow import -- confirm
# before removing either.
device_name = tf.test.gpu_device_name()

### Trimming tweets

Removing emojis, links, @usernames, non-ASCII characters, stopwords, hashtag symbols and repeated whitespace, then stemming the remaining words, to make the tweets easier to process.

In [None]:
import nltk
# Resources used by the cleaning functions below: the English stopword list and
# the Punkt tokenizer models behind nltk.word_tokenize. Recent NLTK releases
# look the tokenizer up under 'punkt_tab' while older ones use 'punkt', so both
# are fetched to keep the notebook runnable with an unpinned NLTK install.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
#Text cleaning
import re, string
import emoji
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
# English stopwords as a set, so the per-token membership test in
# strip_all_entities() is a constant-time lookup.
stop_words = set(stopwords.words('english'))

import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
#Clean emojis from text
def strip_emoji(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated in emoji 1.7 and removed in
    2.0, so with an unpinned ``pip install emoji`` the original call raises
    AttributeError. ``emoji.replace_emoji`` is the supported replacement and
    is used whenever the installed package provides it; the regexp path is
    kept only as a fallback for older installs.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    return re.sub(emoji.get_emoji_regexp(), r"", text)

#Remove links, mentions, non-ASCII characters, stopwords, over-long tokens and line breaks
def strip_all_entities(text):
    """Lower-case ``text`` and strip tokens that carry no signal.

    In order: carriage returns are deleted and newlines become spaces, the
    text is lower-cased, @mentions and http(s) links are removed, every
    non-ASCII character is removed, and finally only the whitespace-separated
    tokens that are not in ``stop_words`` and are shorter than 14 characters
    are kept. Punctuation is left untouched. The surviving tokens are joined
    with single spaces.
    """
    lowered = text.replace('\r', '').replace('\n', ' ').lower()
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", lowered)
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_links)
    kept_tokens = [
        token for token in ascii_only.split()
        if token not in stop_words and len(token) < 14  # drops tokens of 14+ characters
    ]
    return ' '.join(kept_tokens)

#Expand contractions
def decontract(text):
    """Expand common English contractions in ``text``.

    Matching is case-insensitive because this step runs before the text is
    lower-cased; previously "Can't" fell through to the generic "n't" rule and
    became "Ca not". The replacement text is always lower-case.

    Rules are applied in order. The irregular negations ("won't", "can't")
    must come before the generic "n't" rule, which would otherwise produce
    "wo not" / "ca not".
    """
    rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the "#" symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and un-hash the remaining ones.

    Pass 1 removes every hashtag that is followed only by other hashtags up
    to the end of the tweet; the literal tag "#hashtag" is exempt, as the
    pattern's negative lookahead specifies. Pass 2 splits on the remaining
    "#" and "_" characters, so the words of in-sentence hashtags and
    snake_case tokens are kept, separated by spaces.

    Both patterns are raw strings: in a normal string literal ``\\b`` is a
    backspace character rather than a regex word boundary, which disabled the
    lookahead, and ``\\w`` / ``\\s`` are invalid escape sequences.
    """
    trailing_tags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_tweet = " ".join(word.strip() for word in re.split(trailing_tags, tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as "&" and "$" present in some words
def filter_chars(a):
    """Blank out every space-separated word that contains '$' or '&'.

    A dropped word is replaced by an empty string, so its surrounding spaces
    remain in the result.
    """
    return ' '.join(
        '' if ('$' in word or '&' in word) else word
        for word in a.split(' ')
    )

#Remove multiple sequential spaces
def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character is left as it is. The pattern is a raw
    string because ``\\s`` in a normal literal is an invalid escape sequence,
    which newer Python versions warn about.
    """
    return re.sub(r"\s\s+", " ", text)

#Stemming
def stemmer(text):
    """Tokenize ``text`` with NLTK and return the Porter stems joined by spaces."""
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in nltk.word_tokenize(text))

#Lemmatization
#NOTE:Stemming seems to work better for this dataset
def lemmatize(text):
    """Tokenize ``text`` with NLTK and return the WordNet lemmas joined by spaces.

    Not part of deep_clean(); kept as an alternative to stemmer().
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in nltk.word_tokenize(text))

#Then we apply all the defined functions in the following order
def deep_clean(text):
    """Run one tweet through the full cleaning pipeline and return the result.

    The steps are applied strictly in the order listed; each one receives the
    output of the previous step.
    """
    pipeline = (
        strip_emoji,
        decontract,
        strip_all_entities,
        clean_hashtags,
        filter_chars,
        remove_mult_spaces,
        stemmer,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Run the full cleaning pipeline over every tweet, overwriting the raw text.
# NOTE(review): the column is replaced in place, so re-running this cell cleans
# (and stems) already-cleaned text; re-run the CSV-loading cell first.
df['tweet'] = df['tweet'].apply(deep_clean)

### Preparing the model and data

Applying an 80/20 train/test split

In [None]:
# Hold out 20% of the cleaned tweets as the test set. The fixed random_state
# makes the shuffle -- and therefore every metric reported below --
# reproducible across kernel restarts.
df_train, df_test = train_test_split(df, test_size = 0.20, shuffle = True, random_state = 42)

XLNet model args set to be run for 10 epochs

In [None]:
xlnet_model_args = ClassificationArgs(
    # training schedule
    num_train_epochs=10,
    train_batch_size=32,
    # checkpointing / caching switches
    no_save=True,
    no_cache=True,
    overwrite_output_dir=True,
    save_steps=-1,
    save_model_every_epoch=False,
    # evaluation while training
    evaluate_during_training=True,
    evaluate_during_training_steps=1000,
    evaluate_during_training_verbose=True,
    use_cached_eval_features=True,
    # early stopping on the "mcc" metric (higher is better)
    use_early_stopping=True,
    early_stopping_metric="mcc",
    early_stopping_metric_minimize=False,
    early_stopping_delta=0.01,
    early_stopping_patience=3,
)

Defining the model and number of output classes

In [None]:
# XLNet base (cased) classifier with one output per class id (0-5); it is placed
# on the GPU only when the earlier CUDA check succeeded.
xlnet_model = ClassificationModel(
    model_type="xlnet",
    model_name='xlnet-base-cased',
    num_labels=6,
    args=xlnet_model_args,
    use_cuda=cuda_available,
)

### Training the model with train and validation data

In [None]:
# Early stopping is driven by the score on eval_df, so that data must not be the
# test set used for the final report; otherwise the reported metrics are tuned
# on the very data they are measured on. Carve a validation set out of the
# training data instead: 25% of the 80% training portion, i.e. 20% of all tweets.
xlnet_train_df, xlnet_val_df = train_test_split(
    df_train[['tweet', 'label']], test_size=0.25, shuffle=True, random_state=42
)
xlnet_model.train_model(xlnet_train_df, eval_df=xlnet_val_df)

### Testing the model

Get the predictions of the test set and compare with the correct labels

In [None]:
# Predicted class id for every test tweet; the second value returned by
# predict() is not used in this notebook.
test_texts = df_test['tweet'].tolist()
preds_xlnet, _ = xlnet_model.predict(test_texts)

In [None]:
# True class ids of the test tweets, in the same row order as the predictions.
correct_xlnet = df_test['label'].tolist()

In [None]:
# Number of test tweets whose predicted class equals the true class.
count = sum(
    int(truth == preds_xlnet[position])
    for position, truth in enumerate(correct_xlnet)
)

In [None]:
# Test-set size followed by the number of correct predictions.
for caption, value in (('Total = ', len(correct_xlnet)), ('Count = ', count)):
    print(caption, value)

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the true labels with the predictions on the test set.
xlnet_report = classification_report(correct_xlnet, preds_xlnet)
print(xlnet_report)

In [None]:
from sklearn.metrics import confusion_matrix
# Pin the class order and matrix size explicitly. Without `labels`, the matrix
# only covers the classes that actually occur, so a class missing from both the
# truth and the predictions would shrink it and misalign the six class names
# used by the plots below.
cm = confusion_matrix(correct_xlnet, preds_xlnet, labels=list(range(6)))

In [None]:
!pip install seaborn

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
# Class names in class-id order (0-5), matching smush_labels().
labels = ['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying','age', 'ethnicity']

# Confusion matrix drawn with scikit-learn's own display helper.
disp = ConfusionMatrixDisplay(cm, display_labels=labels)
disp.plot(cmap=plt.cm.Blues)
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt 

# Same confusion matrix as an annotated seaborn heatmap.
index = ['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying','age', 'ethnicity']
columns = list(index)
# DataFrame's positional order is (data, index, columns); the original passed
# (cm, columns, index), which only worked because both lists are identical.
# Keywords make the row/column assignment explicit.
cm_df = pd.DataFrame(cm, index=index, columns=columns)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cm_df, annot=True, fmt='g', ax=ax)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('XLNet Confusion Matrix')
plt.show()

### Saving the model as necessary

In [None]:
# Serialise the whole xlnet_model object to the file 'xlnet_model' in the
# working directory.
# NOTE(review): the load cell below reads a different path
# ('../input/nndl-data/mymodel') -- presumably a previously uploaded copy; confirm.
torch.save(xlnet_model, 'xlnet_model')

In [None]:
# Restore a previously saved model object.
# The original call passed an undefined name (xlnet_model1) as the file argument,
# put the path in the map_location slot, and discarded the result.
# SECURITY: torch.load unpickles arbitrary Python objects -- only load files you
# created yourself or otherwise trust.
# weights_only=False is needed because the file holds a full pickled model
# object, not just tensors (newer PyTorch defaults to weights_only=True; the
# keyword requires torch >= 1.13).
# map_location falls back to the CPU when no GPU is available, so a model saved
# on a GPU machine can still be loaded.
xlnet_model1 = torch.load(
    '../input/nndl-data/mymodel',
    map_location=None if torch.cuda.is_available() else 'cpu',
    weights_only=False,
)