# Importing pickle for opening the files
The pickle files store encoded versions of our text files. The first file contains incel comments, the second contains Reddit comments, and the third contains hate speech. The texts were encoded with BERT and ELECTRA. The encoding of the Reddit data is done in this notebook.


In [1]:
import pickle

# Importing tensorflow
TensorFlow is used for creating the models, and we also used tensorflow_hub to load ELECTRA and BERT from the web.

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Importing time
We used the time package just to get more information out of our function.

In [3]:
from time import time

# Importing pandas
This helps us open the existing CSV file that contains the Reddit comments.

In [4]:
import pandas as pd

## Loading the pickle files
The next three cells load all the pickle files that we have. We print the length of each loaded object just to check that everything is fine.

In [5]:
# Load the pickled encodings stored in 'encoded_c1_incels.pkl' into `rr`
# (used further down as the class-1 data).
# NOTE(review): pickle.load can execute arbitrary code while loading;
# only open pickle files that come from a trusted source.
with open('encoded_c1_incels.pkl', 'rb') as outp:
    rr = pickle.load(outp)

# Sanity check: number of top-level items in the loaded object.
print(len(rr))

159


In [6]:
# Load the pickled encodings stored in 'encoded_baseline.pkl' into `ba`
# (used further down as the class-0 data).
# NOTE(review): pickle.load can execute arbitrary code while loading;
# only open pickle files that come from a trusted source.
with open('encoded_baseline.pkl', 'rb') as outp:
    ba = pickle.load(outp)

# Sanity check: number of top-level items in the loaded object.
print(len(ba))

84


## Creating datasets for validation and testing
Here we take the existing data and set parts of it aside for two different tasks: validation and testing.

In [10]:
from random import shuffle
import random

In [11]:
# Hold-out split, done separately for each class on whole list entries.
# The last `chunks` shuffled entries of each class are held out and then
# halved into a validation part and a test part; everything else is training.
# (Original author's note: entries are chunks of 50 rows, so 10 entries
# give 500 held-out rows per class -- not verifiable from this cell alone.)
chunks = 10

# Shuffle index lists instead of the data itself. Baseline is shuffled
# before incels so the random stream is consumed in a fixed order.
ind_b = list(range(len(ba)))
ind_r = list(range(len(rr)))
shuffle(ind_b)
shuffle(ind_r)

# Everything except the last `chunks` shuffled indices is training data.
train0 = [ba[k] for k in ind_b[:-chunks]]
train1 = [rr[k] for k in ind_r[:-chunks]]

# The last `chunks` shuffled indices are held out ...
held0 = [ba[k] for k in ind_b[-chunks:]]
held1 = [rr[k] for k in ind_r[-chunks:]]

# ... first half for validation, second half for testing.
half0 = len(held0) // 2
val0, test0 = held0[:half0], held0[half0:]

half1 = len(held1) // 2
val1, test1 = held1[:half1], held1[half1:]

# Report the size of every split as a quick sanity check.
for i in (train0, train1, test0, test1, val0, val1):
    print(len(i))

74
149
5
5
5
5


## Importing numpy
We used numpy to reshape our data arrays and to create some supporting arrays (such as arrays of all zeros or all ones).

In [12]:
import numpy as np

## More on data preprocessing
Here we create a function that concatenates and shuffles our data. It also one-hot encodes the class labels so that they are represented as numeric vectors.

In [32]:
def preprocess(ba, rr):
    """Merge two classes of encoded chunks into one shuffled, labelled set.

    Every element of ``ba`` and ``rr`` is converted with ``np.asarray`` and
    the results are concatenated along axis 0, so each element must be an
    array-like whose first axis indexes samples (NumPy arrays work; eager
    TensorFlow tensors are expected to convert as well -- confirm for the
    tensor type actually stored in the pickles).

    Args:
        ba: Non-empty sequence of array-likes holding the class-0 samples.
        rr: Non-empty sequence of array-likes holding the class-1 samples.

    Returns:
        Tuple ``(X, y)``: ``X`` is a NumPy array with all samples of both
        classes in one common random order; ``y`` is the matching float
        one-hot label matrix of shape ``(n_samples, 2)`` (column 0 = ``ba``,
        column 1 = ``rr``).

    Raises:
        ValueError: If ``ba`` or ``rr`` is empty (nothing to concatenate).
    """
    cl0 = np.concatenate([np.asarray(chunk) for chunk in ba], axis=0)
    cl1 = np.concatenate([np.asarray(chunk) for chunk in rr], axis=0)
    features = np.concatenate([cl0, cl1], axis=0)
    labels = np.concatenate(
        [np.zeros(cl0.shape[0]), np.ones(cl1.shape[0])]
    ).astype(int)

    # Shuffle with the `random` module (as before) so that random.seed()
    # still controls the order; one permutation is applied to both arrays
    # with a single vectorised index instead of a per-row Python loop.
    order = list(range(labels.shape[0]))
    shuffle(order)
    train_new = features[order]
    y_new = labels[order]

    # One-hot encode the labels. The width is fixed to the two classes this
    # function handles instead of being derived from the largest label seen.
    train_y = np.zeros((y_new.size, 2))
    train_y[np.arange(y_new.size), y_new] = 1

    return (train_new, train_y)

In [33]:
# Build (features, one-hot labels) for the training, validation and test
# splits. Each preprocess() call shuffles through the shared `random`
# generator, so the order of these three calls affects the resulting orderings.
X_train, y_train =  preprocess(train0, train1)
X_val, y_val = preprocess(val0,val1)
X_test, y_test =  preprocess(test0, test1)

## Importing sklearn and keras
Sklearn is the best package for machine learning in Python. We used many of its functions to create good models and also to do some training, testing, etc.
From Keras we only needed the function for loading the models.

In [34]:
import sklearn
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from os import makedirs
from keras.models import load_model

In [35]:
def stacked_dataset(members, inputX):
    """Build the meta-learner feature matrix from the members' predictions.

    Args:
        members: Non-empty iterable of models exposing
            ``predict(inputX, verbose=0)``.
        inputX: Input batch, passed unchanged to every member.

    Returns:
        2-D array with one row per sample. For 2-D predictions of shape
        ``(rows, probabilities)`` the columns are grouped by probability
        index: ``[p0 of member 1, p0 of member 2, ..., p1 of member 1, ...]``.

    Raises:
        ValueError: If ``members`` is empty.
    """
    predictions = [model.predict(inputX, verbose=0) for model in members]
    if not predictions:
        raise ValueError('stacked_dataset needs at least one ensemble member')

    # A single np.dstack joins all predictions along a third axis, giving
    # [rows, probabilities, members]; this avoids re-copying the growing
    # stack once per member.
    stacked = np.dstack(predictions)
    # Flatten everything but the row axis into one feature vector per sample.
    return stacked.reshape(stacked.shape[0], -1)

In [36]:
def fit_stacked_model(members, inputX, inputy):
    """Fit the stacking meta-classifier on the members' predictions.

    Args:
        members: Base models, forwarded to ``stacked_dataset``.
        inputX: Inputs the base models predict on.
        inputy: Label array; ``argmax`` over its last axis gives the class
            index used as the training target.

    Returns:
        The fitted ``RandomForestClassifier`` (max_depth=2, random_state=0).
    """
    # Base-model predictions become the meta-classifier's features.
    meta_features = stacked_dataset(members, inputX)

    meta_model = RandomForestClassifier(max_depth=2, random_state=0)
    # Collapse the label rows to class indices for the forest.
    class_ids = inputy.argmax(axis=-1)
    meta_model.fit(meta_features, class_ids)
    return meta_model

In [37]:
def stacked_prediction(members, model, inputX):
    """Predict with the stacking meta-classifier.

    Args:
        members: Base models, forwarded to ``stacked_dataset``.
        model: Fitted meta-classifier exposing ``predict``.
        inputX: Inputs the base models predict on.

    Returns:
        Whatever ``model.predict`` returns for the stacked features.
    """
    # Features for the meta-classifier are the base models' predictions.
    meta_features = stacked_dataset(members, inputX)
    return model.predict(meta_features)

In [38]:
def load_all_models(n_models, model_dir='new_models_50'):
    """Load saved ensemble members ``model_1.h5`` .. ``model_<n_models>.h5``.

    Args:
        n_models: Number of models to load; file numbering starts at 1.
        model_dir: Directory containing the ``model_<i>.h5`` files. Defaults
            to ``'new_models_50'``, the location that used to be hard-coded.

    Returns:
        List of the loaded models in file-number order (empty when
        ``n_models`` is 0).
    """
    all_models = []
    for number in range(1, n_models + 1):
        # '/' is kept as the separator so the printed names are unchanged.
        filename = model_dir + '/model_' + str(number) + '.h5'
        all_models.append(load_model(filename))
        print('>loaded %s' % filename)
    return all_models

In [39]:
# Load the saved base models that make up the ensemble; load_all_models
# prints one line per file, and the count is printed afterwards as a check.
n_members = 10
members = load_all_models(n_members)
print('Loaded %d models' % len(members))

>loaded new_models_50/model_1.h5
>loaded new_models_50/model_2.h5
>loaded new_models_50/model_3.h5
>loaded new_models_50/model_4.h5
>loaded new_models_50/model_5.h5
>loaded new_models_50/model_6.h5
>loaded new_models_50/model_7.h5
>loaded new_models_50/model_8.h5
>loaded new_models_50/model_9.h5
>loaded new_models_50/model_10.h5
Loaded 10 models


In [40]:
# Notebook display: shapes of the validation features and their labels.
X_val.shape, y_val.shape

((500, 1024), (500, 2))

In [41]:
# Validation split: report each base model's accuracy on its own.
for model in members:
    _, acc = model.evaluate(X_val, y_val, verbose=0)
    print('Model Accuracy: %.3f' % acc)

# Train the stacking meta-classifier on the validation split.
model_s = fit_stacked_model(members, X_val, y_val)

# Score the meta-classifier on the same validation data it was just fitted
# on. Despite the printed "Test" label this is an in-sample figure and is
# therefore optimistic.
yhat = stacked_prediction(members, model_s, X_val)
val_true = y_val.argmax(axis=-1)
acc = accuracy_score(val_true, yhat)
print('Stacked Test Accuracy: %.3f' % acc)

Model Accuracy: 0.910
Model Accuracy: 0.994
Model Accuracy: 0.992
Model Accuracy: 0.924
Model Accuracy: 0.926
Model Accuracy: 0.994
Model Accuracy: 0.976
Model Accuracy: 0.992
Model Accuracy: 0.930
Model Accuracy: 0.994
Stacked Test Accuracy: 0.996


In [42]:
# Notebook display: shapes of the test features and their labels.
X_test.shape, y_test.shape

((500, 1024), (500, 2))

In [43]:
# Test split: report each base model's accuracy on its own.
for model in members:
    _, acc = model.evaluate(X_test, y_test, verbose=0)
    print('Model Accuracy: %.3f' % acc)

# Score the already-fitted stacking classifier (model_s) on the test split;
# nothing is re-fitted here.
yhat = stacked_prediction(members, model_s, X_test)
test_true = y_test.argmax(axis=-1)
acc = accuracy_score(test_true, yhat)
print('Stacked Test Accuracy: %.3f' % acc)

Model Accuracy: 0.916
Model Accuracy: 0.976
Model Accuracy: 0.980
Model Accuracy: 0.926
Model Accuracy: 0.922
Model Accuracy: 0.978
Model Accuracy: 0.972
Model Accuracy: 0.976
Model Accuracy: 0.930
Model Accuracy: 0.974
Stacked Test Accuracy: 0.972
