In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import numpy as np
import re
import string
from nltk import PorterStemmer, RegexpTokenizer
from sklearn.model_selection import train_test_split
import itertools

Load corpus with labels

In [2]:
with open('dataset/training_set_clean_only_text.txt', 'r', encoding="utf8") as text_file:
    text_lines = text_file.readlines()
with open('dataset/training_set_clean_only_tags.txt', 'r') as tags_file:
    tags = tags_file.readlines()

Load word embedings found in http://dsmodels.nlp.ipipan.waw.pl/ 
continuous Skip-gram (SG)

In [3]:
wv_from_text = KeyedVectors.load_word2vec_format(datapath(r"C:\Users\Anna Marciniec\dataset\nkjp-forms-all-300-skipg-hs.txt.gz"), binary=False)

In [4]:
def clean_sentence(sentence):
    cleaned =  ' '.join(re.sub("(@[A-Za-z0-9]_+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+) |@anonymized_account ", " ", sentence).split())
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    return cleaned.lower()

In [5]:
regexp_tokenizer = RegexpTokenizer(r'\w+')

In [6]:
def clean_tweet(tweet):
    cleaned_tweet = clean_sentence(tweet)
    new_tweet_list = []
    tokenized =  regexp_tokenizer.tokenize(cleaned_tweet.lower())
    for word in tokenized:
        if word in wv_from_text.vocab:
            new_tweet_list.append(word)
            continue
        if word.capitalize() in wv_from_text.vocab:
            new_tweet_list.append(word.capitalize())
            continue
        removed = word[:-2]
        if removed in wv_from_text.vocab:
            new_tweet_list.append(removed)
            continue
    return ' '.join(new_tweet_list)

In [7]:
cleaned_tweets = []

In [8]:
for tweet in text_lines:
    new_tweet = clean_tweet(tweet)
    cleaned_tweets.append(new_tweet)

Split dataset for test and training

In [9]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_tweets, tags, test_size=0.25)

In [10]:
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [11]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [12]:
token = Tokenizer()
token.fit_on_texts(cleaned_tweets)
word_index = token.word_index


In [13]:
from keras.preprocessing import sequence

In [14]:
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=70)

In [15]:
lexicon = set(itertools.chain(*list(map(lambda x :x.split(' ') ,cleaned_tweets))))

In [31]:
embedding_matrix = np.zeros((len(lexicon) + 1, 300))


In [32]:
for word, i in word_index.items():
    if word in wv_from_text.vocab:
        embedding_vector = wv_from_text.get_vector(word)
    else:
        embedding_vector = wv_from_text.get_vector(word.capitalize())

    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [53]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [33]:
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, metrics
from sklearn import metrics

In [69]:
def create_cnn():
    with tf.device('/GPU:0'):
        # Add an Input Layer
        input_layer = layers.Input((70, ))

        # Add the word embedding Layer
        embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
        embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

        # Add the convolutional Layer
        conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

        # Add the pooling Layer
        pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

        # Add the output Layers
        output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
        output_layer1 = layers.Dropout(0.25)(output_layer1)
        output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

        # Compile the model
        model = models.Model(inputs=input_layer, outputs=output_layer2)
        model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, np.array(y_train), valid_seq_x, is_neural_net=True)
print ("Accuracy",  accuracy)

Train on 7530 samples
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
Accuracy 0.9203504579848666


In [68]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    with tf.device('/GPU:0'):
        # fit the training dataset on the classifier
        classifier.fit(feature_vector_train, label, epochs = 11)

        # predict the labels on validation dataset
        predictions = classifier.predict(feature_vector_valid)

        if is_neural_net:
            predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, y_test)

In [102]:
def predict(sentence:str):
    cleaned_tweet = clean_tweet(sentence)
    seq = sequence.pad_sequences(token.texts_to_sequences([cleaned_tweet]), maxlen=70)
    prediction = loaded_classifier.predict(seq)
    text = ''
    if prediction[0] >= 0.5:
        text = 'cyberbulling'
    else:
        text = 'not cyberbulling'
    print(f'with classification of {prediction[0][0]} sentence "{sentence}" is classified as {text}')

Source: https://twitter.com/RobertBiedron/status/1216358527222108161

In [96]:
predict('do końca świata i o jeden dzień dłużej')

with classification of 0.00016966761904768646 sentence "do końca świata i o jeden dzień dłużej" is classified as not cyberbulling


Source : https://twitter.com/KingaBezKorony/status/1216447484169543682, 

In [99]:
predict('Ty małostkowy, podły, bezduszny, wstrętny, posiorski katolu!!! siema! ')

with classification of 0.777967631816864 sentence "Ty małostkowy, podły, bezduszny, wstrętny, posiorski katolu!!! siema! " is classified as cyberbulling


In [100]:
classifier.save('cyberbulling_classifier.h5')

In [101]:
loaded_classifier = keras.models.load_model('cyberbulling_classifier.h5')

Source https://twitter.com/jurema4444/status/1217071156760629249

In [106]:
predict('Donosiciele, zdrajcy najwieksze łajzy')

with classification of 0.6486673951148987 sentence "Donosiciele, zdrajcy najwieksze łajzy" is classified as cyberbulling
