# Dataset Information
The objective of this task is to detect hate speech in tweets. For the sake of simplicity, we say a tweet contains hate speech if it has a racist or sexist sentiment associated with it. So, the task is to classify racist or sexist tweets from other tweets.

Formally, given a training sample of tweets and labels, where label '1' denotes the tweet is racist/sexist and label '0' denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset.

For training the models, we provide a labelled dataset of 31,962 tweets. The dataset is provided in the form of a csv file with each line storing a tweet id, its label and the tweet.

## Import modules

In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle
import pickle
import h5py
import json
import matplotlib.pyplot as plt 

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk import word_tokenize

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback
from keras.models import model_from_json


# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook



## Loading the dataset

In [2]:
def ingest():
    data = pd.read_csv('./trainingandtestdata/training.1600000.processed.noemoticon.csv', encoding='latin-1') # Enter your file location
    data.columns=["Sentiment","ItemID","Date","Blank","SentimentSource","SentimentText"]
    data.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map( {4:1, 0:0}) #Converting 4 to 1
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape  )  
    return data

data = ingest()

dataset loaded with shape (1599999, 4)


In [3]:
data.head()

Unnamed: 0,Sentiment,Date,Blank,SentimentText
0,0,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,is upset that he can't update his Facebook by ...
1,0,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,@Kenichan I dived many times for the ball. Man...
2,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,my whole body feels itchy and like its on fire
3,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,"@nationwideclass no, it's not behaving at all...."
4,0,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,@Kwesidei not the whole crew


## Preprocessing the dataset

In [4]:
tokenizer = TweetTokenizer()
def tokenize(tweet):
    try:
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'

In [5]:
def postprocess(data):
#     , n=1000000
#     data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)

progress-bar: 100%|████████████████████████████████████████████████████████| 1599999/1599999 [06:17<00:00, 4238.25it/s]


## Labelise Data

In [6]:
LabeledSentence = gensim.models.doc2vec.TaggedDocument # we'll talk about this down below

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [7]:
#Splitting for training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(1000000).tokens),
                                                    np.array(data.head(1000000).Sentiment), test_size=0.2)

In [8]:
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST') 

800000it [00:08, 92624.76it/s] 
200000it [00:00, 244486.36it/s]


In [9]:
data_labellised= labelizeTweets(np.array(data.tokens), 'data')

1599999it [00:20, 76741.35it/s] 


## Exploratry data analysis

In [10]:
# # frequent words visualization for -ve
# all_words = " ".join([sentence for sentence in df['clean_tweet'][df['label']==1]])

# wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)

# # plot the graph
# plt.figure(figsize=(15,8))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

## Building word2vec vocabulary and training

In [11]:
n=1000000
n_dim = 200
tweet_w2v = Word2Vec(vector_size=n_dim, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(data_labellised)])

100%|███████████████████████████████████████████████████████████████████| 1599999/1599999 [00:01<00:00, 1414229.45it/s]


In [12]:
tweet_w2v.train([x.words for x in tqdm(data_labellised)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs) 

100%|███████████████████████████████████████████████████████████████████| 1599999/1599999 [00:01<00:00, 1364532.12it/s]


(86481786, 118492055)

In [13]:
#convert any word(present in vocabulary) to vector.
tweet_w2v.wv['bye']

array([-1.456789  ,  0.17298372, -0.42566878, -1.0395752 , -0.81388956,
       -0.5286573 , -0.15369165,  1.416732  , -0.8759864 , -0.5698073 ,
       -0.0680962 , -0.73051476, -0.08978359, -1.7200352 ,  0.01341879,
        0.29176313, -0.35773915,  0.42538205, -0.7328192 , -2.4816663 ,
        1.1601367 , -1.5728831 ,  1.905467  ,  0.31764805,  1.105683  ,
        0.8333067 ,  1.6343863 , -0.2759056 , -0.40420222,  0.4828367 ,
       -0.6261319 ,  1.2176775 , -0.86758333, -0.6807841 , -1.1975994 ,
       -0.20793377,  1.7313544 , -0.7382337 ,  0.9589271 , -0.11782512,
       -1.1019429 , -2.485898  , -0.20628533, -0.05414274, -1.0182922 ,
        2.4717777 ,  0.56440985,  0.32911605,  2.9891093 ,  0.35594702,
        0.7188384 ,  0.58216184, -1.3354557 ,  0.31988502, -1.3287048 ,
        0.48533145,  0.3246286 ,  0.15365058,  0.26543778, -1.4515023 ,
       -1.7204909 ,  1.1772506 ,  0.7430171 , -0.08569822,  1.6623672 ,
        0.02137739, -0.28818232,  0.31419274, -1.7343674 ,  1.47

In [14]:
#Save the w2v model
# tweet_w2v.save('w2vmodel')
#Load the w2v model
#new_w2vmodel = gensim.models.Word2Vec.load('w2vmodel')

In [15]:
tweet_w2v.wv.most_similar('happy')

[('happpy', 0.6302089691162109),
 ('thrilled', 0.6099652051925659),
 ('pleased', 0.5953126549720764),
 ('unhappy', 0.5685697793960571),
 ('blessed', 0.5559052228927612),
 ('hapy', 0.5432155132293701),
 ('upset', 0.5414510369300842),
 ('sad', 0.5407065153121948),
 ('thankful', 0.5387083888053894),
 ('depressed', 0.5338698625564575)]

## Plotting the vectors

In [16]:
# # defining the chart
# output_notebook()
# plot_tfidf = bp.figure(width=700, height=600, title="A map of 10000 word vectors",
#                         tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
#                         x_axis_type=None, y_axis_type=None, min_border=1)

# # getting a list of word vectors. limit to 10000. each is of 200 dimensions
# word_vectors = [tweet_w2v.wv[w] for w in list(tweet_w2v.wv.vocab.keys())[:5000]]

# # dimensionality reduction. converting the vectors to 2d vectors
# from sklearn.manifold import TSNE
# tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
# tsne_w2v = tsne_model.fit_transform(word_vectors)

# # putting everything in a dataframe
# tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
# tsne_df['words'] = list(tweet_w2v.wv.vocab.keys())[:5000]

# # plotting. the corresponding word appears when you hover on the data point.
# plot_tfidf.scatter(x='x', y='y', source=tsne_df)
# hover = plot_tfidf.select(dict(type=HoverTool))
# hover.tooltips={"word": "@words"}
# show(plot_tfidf)

## TF-IDF matrix of Data

In [17]:
print ('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in data_labellised])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 34893




In [18]:
#Save the tfidf 
# with open("tfidfdict.txt", "wb") as myFile:
#     pickle.dump(tfidf, myFile)
# with open("tfidfdict.txt", "rb") as myFile:
#     tfidf = pickle.load(myFile)

## Build tweet vector to give input to FFNN

In [19]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v.wv[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [20]:
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

800000it [04:05, 3255.11it/s]
200000it [01:11, 2784.49it/s]


## Training 3 layered FFNN

In [21]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=10000, verbose=2)

Epoch 1/100
80/80 - 4s - loss: 0.4469 - accuracy: 0.8116 - 4s/epoch - 49ms/step
Epoch 2/100
80/80 - 1s - loss: 0.3711 - accuracy: 0.8462 - 1s/epoch - 15ms/step
Epoch 3/100
80/80 - 1s - loss: 0.3548 - accuracy: 0.8528 - 1s/epoch - 14ms/step
Epoch 4/100
80/80 - 1s - loss: 0.3474 - accuracy: 0.8556 - 1s/epoch - 15ms/step
Epoch 5/100
80/80 - 1s - loss: 0.3430 - accuracy: 0.8575 - 1s/epoch - 15ms/step
Epoch 6/100
80/80 - 1s - loss: 0.3400 - accuracy: 0.8586 - 1s/epoch - 15ms/step
Epoch 7/100
80/80 - 1s - loss: 0.3377 - accuracy: 0.8595 - 1s/epoch - 14ms/step
Epoch 8/100
80/80 - 1s - loss: 0.3358 - accuracy: 0.8604 - 1s/epoch - 15ms/step
Epoch 9/100
80/80 - 1s - loss: 0.3343 - accuracy: 0.8609 - 1s/epoch - 16ms/step
Epoch 10/100
80/80 - 1s - loss: 0.3330 - accuracy: 0.8615 - 1s/epoch - 15ms/step
Epoch 11/100
80/80 - 1s - loss: 0.3319 - accuracy: 0.8620 - 1s/epoch - 14ms/step
Epoch 12/100
80/80 - 1s - loss: 0.3308 - accuracy: 0.8626 - 1s/epoch - 15ms/step
Epoch 13/100
80/80 - 1s - loss: 0.330

<keras.callbacks.History at 0x1b834d354e0>

In [22]:
# Evaluating accuracy score

score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

1563/1563 - 4s - loss: 0.3272 - accuracy: 0.8639 - 4s/epoch - 3ms/step
loss :  0.3271777033805847 
 accuracy :  0.8638700246810913


In [23]:
# model = Sequential()
# model.add(Dense(64, activation='relu', input_dim=200))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(8, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(optimizer='rmsprop',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

# model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=10000, verbose=2)

In [24]:
# # Evaluating accuracy score

# score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
# print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

## Saving Model

In [25]:
#Saving the model
model_json = model.to_json() # serialize model to JSON
with open("model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("smodel.h5") # serialize weights to HDF5
print("Saved model to disk")

#Loading the model
# newmodel = model_from_json(open('model.json').read())
# newmodel.load_weights('smodel.h5')

Saved model to disk


## Predicting for test file (Validation)

In [26]:
def ingesttest():
    testdata = pd.read_csv('./trainingandtestdata/testdata.manual.2009.06.14.csv', encoding='latin-1')
    testdata.columns=["Sentiment","ItemID","Date","Blank","SentimentSource","SentimentText"]
    testdata.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    testdata = testdata[testdata.Sentiment.isnull() == False]
    testdata['Sentiment'] = testdata['Sentiment'].map( {4:1, 0:0, 2:1})
    testdata = testdata[testdata['SentimentText'].isnull() == False]
    testdata.reset_index(inplace=True)
    testdata.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', testdata.shape  )  
    return testdata

testdata = ingesttest()

dataset loaded with shape (497, 4)


In [27]:
testdata = postprocess(testdata)
testdata.head(5)

progress-bar: 100%|████████████████████████████████████████████████████████████████| 497/497 [00:00<00:00, 3527.92it/s]


Unnamed: 0,Sentiment,Date,Blank,SentimentText,tokens
0,1,Mon May 11 03:18:03 UTC 2009,kindle2,Reading my kindle2... Love it... Lee childs i...,"[reading, my, kindle, 2, ..., love, it, ..., l..."
1,1,Mon May 11 03:18:54 UTC 2009,kindle2,"Ok, first assesment of the #kindle2 ...it fuck...","[ok, ,, first, assesment, of, the, ..., it, fu..."
2,1,Mon May 11 03:19:04 UTC 2009,kindle2,@kenburbary You'll love your Kindle2. I've had...,"[you'll, love, your, kindle, 2, ., i've, had, ..."
3,1,Mon May 11 03:21:41 UTC 2009,kindle2,@mikefish Fair enough. But i have the Kindle2...,"[fair, enough, ., but, i, have, the, kindle, 2..."
4,1,Mon May 11 03:22:00 UTC 2009,kindle2,@richardebaker no. it is too big. I'm quite ha...,"[no, ., it, is, too, big, ., i'm, quite, happy..."


In [28]:
test_X = np.array(testdata.tokens)
test_y = np.array(testdata.Sentiment)

In [31]:
test_w2v_vecs = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x:x, test_X))])
test_w2v_vecs.shape

497it [00:00, 1485.13it/s]


(497, 200)

In [32]:
# model.predict_classes(test_w2v_vecs)
score = model.evaluate(test_w2v_vecs,test_y, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

4/4 - 0s - loss: 0.5987 - accuracy: 0.6962 - 61ms/epoch - 15ms/step
loss :  0.5987269282341003 
 accuracy :  0.696177065372467
