In [1]:
import torch
import torchtext
import torch.functional as F
import torch.nn as nn
import spacy
import pandas as pd
import numpy as np
import random
import time
from torch.utils.data import Dataset, DataLoader
from spacy.lang.en.stop_words import STOP_WORDS
import re

# Seed every random number generator the notebook relies on so that a fresh
# run is repeatable: torch is used for the model, `random` picks the sample
# index inspected further down, and numpy is seeded for completeness.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Raw dataset; the rest of the notebook reads its 'tweet' and 'label' columns.
df = pd.read_csv('dataset.csv')

In [2]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and integer indices.

    The special tokens are always registered right after any pre-existing
    mapping, in the order ``unk_token``, ``begin_seq_token``,
    ``end_seq_token``. In an empty vocabulary they therefore receive the
    indices 0, 1 and 2.
    """

    def __init__(self, token_to_idx = None, add_unk = True, unk_token = "<UNK>", mask_token="<MASK>", begin_seq_token="<BEGIN>",end_seq_token="<END>"):
        """
        Args:
            token_to_idx (dict): Optional pre-existing token -> index mapping.
                It is copied, so the caller's dictionary is never modified.
            add_unk (bool): If True, ``lookup_token`` returns ``unk_index`` for
                tokens that are not in the vocabulary; if False it returns
                None for them. The UNK token itself is registered either way.
            unk_token (str): How the unknown token is represented.
            mask_token (str): Accepted so existing calls stay valid; it is not
                registered (masking is disabled in this class).
            begin_seq_token (str): Token marking the start of a sequence.
            end_seq_token (str): Token marking the end of a sequence.
        """
        if token_to_idx is None:
            token_to_idx = {}

        # Work on a private copy: add_token() below mutates the mapping.
        self._token_to_idx = dict(token_to_idx)
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # add_token() returns the existing index when the token is already
        # present, so rebuilding from a serialized mapping keeps the indices.
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        """
        Returns:
            contents (dict): Keyword arguments from which ``from_serializable``
                rebuilds an equivalent vocabulary.
        """
        return {
            "token_to_idx": dict(self._token_to_idx),
            "add_unk": self._add_unk,
            "unk_token": self._unk_token,
            "begin_seq_token": self._begin_seq_token,
            "end_seq_token": self._end_seq_token,
        }

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vocabulary from the output of ``to_serializable``.

        Args:
            contents (dict): Keyword arguments accepted by the constructor.
        Returns:
            An instance of the class this method is called on.
        """
        return cls(**contents)

    def add_token(self, token):
        """ Update mapping dictionaries given the token
        Args:
            token (str): Token to add to the vocabulary
        Returns:
            index (int): The index corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens take the next free position at the end.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token

        return index

    def lookup_token(self, token):
        """ Retrieves the index associated with the token
            or the UNK index if the token isn't present in the vocabulary

        Args:
            token (str): The token for which the index has to be retrieved
        Returns:
            index (int): The index associated with the token. For a token
                that is not in the vocabulary this is ``unk_index`` when the
                vocabulary was built with ``add_unk=True`` and None otherwise.
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx.get(token)

    def lookup_index(self, index):
        """Retrieve the token associated with the index
        Args:
            index (int): The index to look up
        Returns:
            token (str): The token associated with the index
        Raises:
            KeyError: if the index is not in the vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index %d is not in the vocabulary" % index)
        else:
            return self._idx_to_token[index]

    def __len__(self):
        """Returns the number of tokens, special tokens included
        """
        return len(self._token_to_idx)


In [3]:
class Hate_Speech_Tweets(Dataset):
    """Dataset that turns the tweets of a dataframe into index tensors."""

    def __init__(self, df, nlp, vocab):
        """Initializing
        Args:
            df (Pandas DataFrame): Dataframe consisting of tweets and labels
                (columns 'tweet' and 'label')
            nlp (spacy object): For preprocessing
            vocab (Vocabulary Object): To vectorize the tweets
        """
        self.df = df
        self.nlp = nlp
        self.vocab = vocab
        # Longest raw tweet, counted in pieces separated by a single space,
        # plus one.
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, self.df.tweet)) + 1

    def __len__(self):
        """Number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the idx-th row (by position) as a training item.

        Returns:
            dict: 'tweet' is a LongTensor of vocabulary indices, 'label' is
                the value of the 'label' column for that row.
        """
        tweet = self.df['tweet'].iloc[idx]
        label = self.df['label'].iloc[idx]
        return {'tweet':torch.LongTensor(self.preprocess(tweet)), 'label':label}

    def preprocess(self, sent):
        """Clean, lemmatize and vectorize one raw tweet.

        Args:
            sent (str): Raw tweet text
        Returns:
            vector (list): Vocabulary indices, see ``vectorize``
        """
        # Strip, in order: the 'RT' marker (wherever it occurs), @mentions,
        # http:// and www. links, numeric HTML entities, digits, punctuation.
        sent =  re.sub(r'RT','',sent)
        sent =  re.sub('@[a-zA-z0-9_]+','',sent)
        sent =  re.sub(r'http://[^\s<>"]+|www\.[^\s<>"]+','',sent)
        sent =  re.sub(r'&#[0-9]+','',sent)
        sent =  re.sub(r'[0-9]+','',sent)
        sent =  re.sub(r'[^\w\s]', '',sent)
        sent =  sent.strip()
        # Collapse runs of whitespace into single spaces.
        sent =  " ".join(sent.split())
        # Keep the lemma of every token whose text is not a stop word.
        sent = [token.lemma_ for token in self.nlp(sent) if token.text not in STOP_WORDS]
        sent = self.vectorize(sent)
        return sent

    def vectorize(self, sent):
        """Converts a token sequence to vocabulary indices
        Args:
            sent (list): The tokens of the tweet to be vectorized
        Returns:
            vector (list): begin-of-sequence index, one index per token,
                end-of-sequence index
        """
        vector = [self.vocab.begin_seq_index]
        for token in sent:
            # Use the vocabulary handed to this dataset, not a notebook
            # global that merely happens to share the name `vocab`.
            vector.append(self.vocab.lookup_token(token))
        vector.append(self.vocab.end_seq_index)

        return vector
    

In [4]:
# Start from a vocabulary that holds only the special tokens, and load the
# small English spaCy pipeline that clean() uses to tokenize and lemmatize.
vocab = Vocabulary(token_to_idx=None, add_unk=True)
nlp = spacy.load(name='en_core_web_sm')

def clean(t):
    """Normalise one raw tweet and return its lemmatised tokens.

    Args:
        t (str): Raw tweet text
    Returns:
        list: Lemmas of the tokens whose text is not a stop word
    """
    # Removed in this order: 'RT' marker, @mentions, http:// and www. links,
    # numeric HTML entities, remaining digits, punctuation.
    removal_patterns = (
        r'RT',
        '@[a-zA-z0-9_]+',
        r'http://[^\s<>"]+|www\.[^\s<>"]+',
        r'&#[0-9]+',
        r'[0-9]+',
        r'[^\w\s]+',
    )
    for pattern in removal_patterns:
        t = re.sub(pattern, '', t)

    # Trim the ends and collapse every run of whitespace to one space.
    t = " ".join(t.strip().split())

    lemmas = []
    for token in nlp(t):
        if token.text not in STOP_WORDS:
            lemmas.append(token.lemma_)
    return lemmas

from collections import Counter

# Tokenize every tweet once; each entry is a list of lemmas.
cleaned_tweets = df['tweet'].apply(clean)

# Pool all tokens into one flat list. Iterate over the Series values directly:
# cleaned_tweets[i] is a label-based lookup and breaks as soon as df does not
# carry the default 0..n-1 index (for example after filtering rows).
text = [word for tokens in cleaned_tweets for word in tokens]

# (token, count) pairs for the 6000 most frequent tokens.
count_dict = Counter(text).most_common(6000)


In [5]:
# Register the frequent tokens; every entry of count_dict is (token, count).
for token, _count in count_dict:
    vocab.add_token(token)

In [6]:
# Number of (token, count) pairs kept by most_common(6000).
len(count_dict)

6000

In [7]:
# Dataset over the raw dataframe; tweets are preprocessed and vectorized on
# every item access, not up front.
data = Hate_Speech_Tweets(df,nlp,vocab)

In [8]:
# text = []

# for i in range(len(data)):
#     for word in data[i]['tweet']:
#         text.append(word)

# text = " ".join(text)

# x =  ''.join(text)

# plt.figure(figsize = (5,5))
# wc = WordCloud(width=3500, height=3500).generate(x)
# plt.imshow(wc)

In [9]:
# Vocabulary size, special tokens included.
len(vocab)

6003

In [10]:
def load_glove(filepath):
    """Loads the glove embeddings

    Each non-empty line is expected to hold a word followed by its vector
    components, separated by single spaces.

    Args:
        filepath (str): path to the glove embeddings file
    Return:
        word_to_index (dict): Maps each word to its row in ``embeddings``
        embeddings (np.array): One row per word, in file order
    """
    word_to_index = {}
    embeddings = []
    with open(filepath, 'r', encoding='utf-8') as fp:
        for line in fp:
            # line = word num1 num2 .......
            # rstrip() drops the newline. A blank line (for example a final
            # empty line) would otherwise add a zero-length vector and make
            # the stacked array ragged.
            fields = line.rstrip().split(" ")
            if len(fields) < 2:
                continue
            # Index by the number of rows stored so far, so skipped lines
            # never shift the word -> row alignment.
            word_to_index[fields[0]] = len(embeddings)
            embeddings.append(np.array([float(val) for val in fields[1:]]))

    return word_to_index, np.array(embeddings)

In [11]:
def make_embedding_matrix(filepath, words):
    """Create embedding matrix for a specific set of words
    Args:
        filepath (str): path to the glove embeddings file, passed to load_glove
        words (list): List of words; row i of the result belongs to words[i]
    Returns:
        final_embeddings (np.array): matrix of shape
            (len(words), embedding_size); words that are missing from the
            embeddings file keep an all-zero row
    """
    word_to_idx, embeddings = load_glove(filepath)
    embedding_size = embeddings.shape[1]
    # Every row starts at zero, so words absent from the file need no
    # further work.
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        row = word_to_idx.get(word)
        if row is not None:
            final_embeddings[i, :] = embeddings[row]

    return final_embeddings

In [12]:
# Vocabulary tokens ordered by index, so words[i] is the token with index i.
words = [vocab.lookup_index(idx) for idx in range(len(vocab))]

In [13]:
# NOTE(review): absolute, machine-specific location of the GloVe vectors;
# adjust it before running the notebook elsewhere.
GLOVE_PATH = r"C:\Users\win10\Documents\glove.6B\glove.6B.50d.txt"
embs = make_embedding_matrix(GLOVE_PATH, words)

In [14]:
# Inspect the token stored at index 0 together with its embedding row.
i=0
words[i], embs[i]

('<UNK>',
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [15]:
# Pick a random item from anywhere in the dataset. The bound follows
# len(data), whereas a fixed upper limit can point past the last row or never
# reach the final ones.
i = random.randrange(len(data))
print(i)
# Fetch the item once: every data[i] access re-runs the spaCy preprocessing.
sample = data[i]
sample['tweet'], sample['label']

1074


(tensor([   1,   99,   69, 1371,  163,  184,    5,   28,  223,   61,  468, 1068,
          404, 2193,  313,    2]),
 2)

In [16]:
# train_iterator, test_iterator = BucketIterator.splits(
#     (train_data,test_data),
#     batch_size = 8,
#     sort = False,
#     device = 'cuda')

In [17]:
# Shape of the embedding matrix: (number of words, embedding size).
embs.shape

(6003, 50)

In [18]:
# One entry per vocabulary index; should match the first axis of embs.
len(words)

6003

In [19]:
class LSTMClassifier(nn.Module):
    """LSTM classifier on top of pre-trained, fine-tunable word embeddings.

    The final hidden state of a single-layer LSTM is passed through one
    linear layer that produces a score per label.
    """

    def __init__(self, embedding_dim, hidden_dim, label_size, batch_size, embedding_weights, bidirectional = False):
        """
        Args:
            embedding_dim (int): size of one embedding vector; must match the
                second dimension of ``embedding_weights``
            hidden_dim (int): size of the LSTM hidden state per direction
            label_size (int): number of output classes
            batch_size (int): stored on the instance; not used by the layers
            embedding_weights (FloatTensor): pre-trained embedding matrix
            bidirectional (bool): run the LSTM in both directions
        """
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.bidirectional = bidirectional
        # freeze=False keeps the embeddings trainable. padding_idx=0 marks
        # index 0 as padding, so that row receives no gradient updates.
        # NOTE(review): in this notebook index 0 is also the <UNK> token;
        # confirm that sharing the index is intended.
        self.word_embeddings = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            bidirectional = bidirectional,batch_first=True)
        # A bidirectional LSTM contributes one final state per direction.
        fc_in_features = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in_features, label_size)

    def forward(self, sentences, train = True):
        """Score a batch of index sequences.

        Args:
            sentences (LongTensor): token indices of shape (batch, seq_len)
            train (bool): unused; kept so existing calls stay valid
        Returns:
            Tensor of shape (1, batch, label_size) holding unnormalised
            class scores.
        """
        embeds = self.word_embeddings(sentences)
        # NOTE(review): sequences are not packed, so for padded batches the
        # final state is read after the padding positions.
        _, (hidden, _) = self.lstm(embeds)
        if self.bidirectional:
            # hidden is (2, batch, hidden_dim): join the last forward and
            # backward states so the width matches self.fc, and restore the
            # leading axis to keep the same output layout as the
            # unidirectional case.
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1).unsqueeze(0)
        return self.fc(hidden)

In [20]:
# Hyper-parameters
BATCH_SIZE = 4
nlabel = 3
hidden_dim = 50
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Classifier initialised from the embedding matrix and moved to the device.
model = LSTMClassifier(
    embedding_dim=embs.shape[1],
    hidden_dim=hidden_dim,
    label_size=nlabel,
    batch_size=BATCH_SIZE,
    embedding_weights=torch.from_numpy(embs).float(),
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()
 
def categorical_accuracy(preds, y):
    """
    Fraction of rows in the batch whose highest-scoring class equals the label.

    Args:
        preds: class scores, one row per sample
        y: true class index per sample
    Returns:
        Scalar float tensor: correct predictions divided by y.shape[0]
    """
    predicted = preds.argmax(dim=1, keepdim=True)
    # Reshape the labels to the (batch, 1) layout of the predictions.
    n_correct = (predicted == y.view_as(predicted)).sum()
    return n_correct.float() / y.shape[0]

In [21]:
# Custom collate function: its input is the list of samples that make up one batch.
def collate_fn(batch):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch (list): items shaped like {'tweet': 1-D LongTensor, 'label': int}
    Returns:
        dict: 'token_list' holds the tweets stacked batch-first and
            right-padded with zeros to the longest tweet in the batch;
            'label' holds the labels as a LongTensor
    """
    # Tweets differ in length, so gather them in a list before padding.
    token_lists, labels = [], []
    for item in batch:
        token_lists.append(item['tweet'])
        labels.append(item['label'])

    padded = torch.nn.utils.rnn.pad_sequence(token_lists, batch_first=True)
    return {'token_list': padded, 'label': torch.LongTensor(labels)}


# Hand the custom collate function to the DataLoader so that the
# variable-length tweets of each batch are padded together.
trainset = DataLoader(
    data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
)

In [22]:
# Peek at the first batch only: the padded token matrix and its labels.
for batch in trainset:
    print(batch['token_list'])
    print(batch['label'])
    break

tensor([[   1,   67,  107,  123,   26, 1689,  116,    8,    2,    0,    0,    0,
            0,    0,    0,    0],
        [   1, 1237,    0,    0,  564,    0,   37,   34,    0,    2,    0,    0,
            0,    0,    0,    0],
        [   1,  195,  376,  176,   13,  220,  158, 2499,  211,  176, 1187,  241,
          123, 2397,  151,    2],
        [   1, 1100,  355, 1959,    2,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
tensor([2, 2, 0, 2])


In [23]:
from tqdm import tqdm

epochs=15
for epoch in range(epochs):
    # NOTE(review): presumably gives the previous epoch's prints time to
    # appear before the next progress bar is drawn; confirm it is needed.
    time.sleep(1)
    # predict_class() puts the model in eval mode, so switch back explicitly
    # in case this cell is re-run after a prediction.
    model.train()
    total_loss = 0.0
    total_acc = 0.0
    for batch in tqdm(trainset):
        feature, label = batch['token_list'].to(device), batch['label'].to(device)
        optimizer.zero_grad()
        # The model returns (1, batch, n_labels). Drop only that leading axis:
        # a bare squeeze() would also remove the batch axis when the final
        # batch holds a single sample, and the loss call would then fail.
        output = model(feature).squeeze(0)
        loss = loss_function(output, label)
        acc = categorical_accuracy(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()

    # Averages over batches, not over individual samples.
    print(f"loss on epoch {epoch+1} = {total_loss/len(trainset)}")
    print(f"accuracy on epoch {epoch+1} = {total_acc/len(trainset)}")

100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:30<00:00, 34.36it/s]


loss on epoch 1 = 0.79414727111355
accuracy on epoch 1 = 0.6207580082650189


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:27<00:00, 37.79it/s]


loss on epoch 2 = 0.5272630900676255
accuracy on epoch 2 = 0.7965429749634014


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:26<00:00, 40.10it/s]


loss on epoch 3 = 0.410305385314793
accuracy on epoch 3 = 0.8498255629748191


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:30<00:00, 34.28it/s]


loss on epoch 4 = 0.3283729544768977
accuracy on epoch 4 = 0.8800348874277211


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:29<00:00, 35.26it/s]


loss on epoch 5 = 0.25179480255973996
accuracy on epoch 5 = 0.9133365049348572


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:29<00:00, 36.17it/s]


loss on epoch 6 = 0.19364922524179082
accuracy on epoch 6 = 0.936964795432921


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:28<00:00, 37.37it/s]


loss on epoch 7 = 0.15067251934714218
accuracy on epoch 7 = 0.9523469711575022


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:31<00:00, 33.28it/s]


loss on epoch 8 = 0.1193287891803662
accuracy on epoch 8 = 0.964954012070918


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:28<00:00, 37.16it/s]


loss on epoch 9 = 0.10106188634156403
accuracy on epoch 9 = 0.9676498572787822


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:29<00:00, 35.47it/s]


loss on epoch 10 = 0.08678261530804904
accuracy on epoch 10 = 0.9757373929590866


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:28<00:00, 36.79it/s]


loss on epoch 11 = 0.07247877514659831
accuracy on epoch 11 = 0.9804947668886774


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:28<00:00, 37.21it/s]


loss on epoch 12 = 0.05466890645306026
accuracy on epoch 12 = 0.9835870599429115


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:27<00:00, 38.62it/s]


loss on epoch 13 = 0.0474337928500711
accuracy on epoch 13 = 0.9871550903901046


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:27<00:00, 38.82it/s]


loss on epoch 14 = 0.033360494873031485
accuracy on epoch 14 = 0.9897716460513797


100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [00:27<00:00, 38.90it/s]

loss on epoch 15 = 0.037771118129498335
accuracy on epoch 15 = 0.9883444338725024





In [24]:
import spacy
# Reload the spaCy pipeline that predict_class() reads from the notebook
# namespace, and map class indices to readable label names.
nlp = spacy.load('en_core_web_sm')
itol = {0: 'Hate Speech', 1: 'Offensive', 2: 'Neither'}

In [39]:
def predict_class(model, sent, min_len = 4):
    """Classify one raw sentence and print the predicted class.

    Args:
        model: classifier called as ``model(tensor)`` with a (1, seq_len)
            LongTensor; it must return scores of shape (1, 1, n_classes)
        sent (str): raw text to classify
        min_len (int): currently unused; kept so existing calls stay valid
    Returns:
        None. The predicted class index and its name are printed.
    """
    model.eval()

    # Same cleaning and lemmatisation that was used to build the vocabulary.
    tokens = clean(sent)

    # Wrap the sequence in the begin/end markers exactly as the training
    # dataset's vectorize() does, so inference inputs look like training
    # inputs. This also keeps the sequence non-empty when every token of the
    # sentence was filtered out.
    vector = [vocab.begin_seq_index]
    vector.extend(vocab.lookup_token(token) for token in tokens)
    vector.append(vocab.end_seq_index)

    # Add the batch axis: (seq_len,) -> (1, seq_len).
    tensor = torch.LongTensor(vector).unsqueeze(0)

    # No gradients are needed for inference.
    with torch.no_grad():
        preds = model(tensor.to(device))
    pred_class = preds[0].argmax(dim = 1)

    print(f'Predicted class is: {pred_class.item()} = {itol[int(pred_class)]}.')

In [40]:
# Two inputs that share an opening sentence, to compare the predictions for
# the short and the extended text.
predict_class(model=model, sent="Hate is a strong word")
predict_class(model=model, sent="Hate is a strong word. I would use it to describe my feelings towards noun")

Predicted class is: 2 = Neither.
Predicted class is: 0 = Hate Speech.


In [41]:
# Another example sentence run through the classifier.
test_sen = 'What a wonderful world'
predict_class(model, test_sen)

Predicted class is: 2 = Neither.
