In [1]:
import pickle
import keras
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from time import time
from random import shuffle
import random

In [2]:
# Load the pickled encodings that are later used as class 1 (`rr`).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('encoded_c1_incels.pkl', 'rb') as pkl_file:
    rr = pickle.load(pkl_file)

In [3]:
# Load the pickled baseline encodings that are later used as class 0 (`ba`).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('encoded_baseline.pkl', 'rb') as pkl_file:
    ba = pickle.load(pkl_file)

In [4]:
# Number of loaded items per source: (class-1 list, baseline list)
(len(rr), len(ba))

(159, 84)

In [5]:
# Seed every RNG the notebook draws from, not only Python's `random`:
#   - random     -> the shuffle() calls in the split and in preprocess()
#   - numpy      -> NumPy-based sampling
#   - tensorflow -> weight initialisation, dropout
SEED = 69
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [6]:
# Train / validation / test split, done on whole list items ("chunks").
# The last `chunks` shuffled items of each class are held out and cut in half:
# first half -> validation, second half -> test (5 + 5 per class for chunks = 10).
# Everything before that is training data.
# NOTE(review): an earlier note said each chunk holds 50 rows, which would make
# val and test 250 rows per class each - confirm against the encoded data.
chunks = 10

# Shuffle the indices of each class (baseline first, so the RNG is consumed in a fixed order).
ind_b = list(range(len(ba)))
ind_r = list(range(len(rr)))
shuffle(ind_b)
shuffle(ind_r)

train0 = [ba[k] for k in ind_b[:-chunks]]
train1 = [rr[k] for k in ind_r[:-chunks]]

held0 = [ba[k] for k in ind_b[-chunks:]]
held1 = [rr[k] for k in ind_r[-chunks:]]

half0 = len(held0) // 2
half1 = len(held1) // 2
val0, test0 = held0[:half0], held0[half0:]
val1, test1 = held1[:half1], held1[half1:]

# Sanity check: number of items in each split
for split in (train0, train1, test0, test1, val0, val1):
    print(len(split))

74
149
5
5
5
5


In [7]:
def preprocess(ba,rr):
    """Merge two lists of per-class arrays into one shuffled, one-hot labelled set.

    Parameters
    ----------
    ba : sequence of array-likes
        Class-0 items; they are concatenated along axis 0.
    rr : sequence of array-likes
        Class-1 items; they are concatenated along axis 0.

    Returns
    -------
    (features, labels) : tuple of np.ndarray
        `features` holds all rows of both classes in shuffled order.
        `labels` is the matching one-hot matrix of shape (n_rows, 2):
        column 0 marks class 0, column 1 marks class 1.

    The row order comes from `random.shuffle`, so it is controlled by
    `random.seed`.
    """
    n_classes = 2

    # np.asarray also accepts tensor-like items, so no backend concatenate is needed.
    cl0 = np.concatenate([np.asarray(chunk) for chunk in ba], axis=0)
    cl1 = np.concatenate([np.asarray(chunk) for chunk in rr], axis=0)
    features = np.concatenate([cl0, cl1], axis=0)
    labels = np.concatenate([np.zeros(cl0.shape[0]), np.ones(cl1.shape[0])]).astype(int)

    # Shuffle rows and labels with one shared permutation.
    order = list(range(labels.shape[0]))
    shuffle(order)
    train_new = features[order]   # one vectorised gather instead of a per-row Python loop
    y_new = labels[order]

    # One-hot encode with a fixed width so the label matrix always has two columns,
    # even when one class contributes zero rows.
    train_y = np.zeros((y_new.size, n_classes))
    train_y[np.arange(y_new.size), y_new] = 1

    return (train_new, train_y)

In [8]:
# Build shuffled feature / one-hot label arrays for each split.
# Evaluation order is train, val, test, so the shuffle RNG is consumed in that order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    preprocess(class0, class1)
    for class0, class1 in ((train0, train1), (val0, val1), (test0, test1))
]

In [9]:
def create_model(size = 10):
    """Build a funnel-shaped fully connected classifier.

    Architecture (input is a 1024-dimensional vector):
      * Dense(2**size, relu)
      * for i = size-1 down to 2:
            Dense(int(0.7 * 2**(i+1)), relu) -> Dropout(0.2) -> Dense(2**i, relu)
      * Dense(2) with no activation, so the model outputs two raw scores.

    Parameters
    ----------
    size : int
        Exponent of the first hidden layer width; must be at least 2.

    Returns
    -------
    keras.Model
        The uncompiled model.

    Raises
    ------
    ValueError
        If `size` is smaller than 2 (no hidden layer could be built).
    """
    if size < 2:
        raise ValueError("size must be >= 2, got %r" % (size,))

    inputs = keras.Input(shape=(1024,))

    # Widest layer first, then halve the width step by step.
    x = layers.Dense(2**size, activation="relu")(inputs)
    for i in range(size - 1, 1, -1):
        x = layers.Dense(int(0.7*(2**(i+1))), activation="relu")(x)
        x = layers.Dropout(0.2)(x)
        x = layers.Dense(2**i, activation="relu")(x)

    # Linear head: two outputs without softmax.
    outputs = layers.Dense(2)(x)

    return keras.Model(inputs=inputs, outputs=outputs)

In [10]:
import sklearn

In [11]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Ensemble model

In [12]:
from os import makedirs
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [13]:
def stacked_dataset(members, inputX):
    """Build the meta-learner feature matrix from the members' predictions.

    Every member's `predict(inputX, verbose=0)` output is stacked along a new
    last axis, giving [rows, outputs, members], and then flattened per row.
    The columns of the result are therefore ordered output-major:
    (out0, member0), (out0, member1), ..., (out1, member0), ...

    Parameters
    ----------
    members : iterable
        Objects exposing `predict(inputX, verbose=0)`.
    inputX : array-like
        Input passed unchanged to every member.

    Returns
    -------
    np.ndarray
        Array of shape (rows, outputs * n_members).

    Raises
    ------
    ValueError
        If `members` is empty.
    """
    predictions = [np.asarray(model.predict(inputX, verbose=0)) for model in members]
    if not predictions:
        raise ValueError("stacked_dataset needs at least one ensemble member")

    # Stack once instead of re-copying the growing array for every member.
    stackX = np.stack(predictions, axis=-1)
    return stackX.reshape(stackX.shape[0], -1)

In [14]:
def fit_stacked_model(members, inputX, inputy):
    """Fit the meta-learner on top of the ensemble members.

    The members' stacked predictions for `inputX` are the features; the
    targets are the class indices taken as the argmax of `inputy` along its
    last axis.

    Returns the fitted RandomForestClassifier (max_depth=2, random_state=0).
    """
    meta_features = stacked_dataset(members, inputX)
    meta_targets = inputy.argmax(axis = -1)

    # A LogisticRegression meta-learner was tried as an alternative here.
    meta_learner = RandomForestClassifier(max_depth=2, random_state=0)
    meta_learner.fit(meta_features, meta_targets)
    return meta_learner

In [15]:
def stacked_prediction(members, model, inputX):
    """Predict with the stacked ensemble.

    Builds the members' stacked predictions for `inputX` and returns what the
    meta-learner `model` predicts from them.
    """
    meta_features = stacked_dataset(members, inputX)
    return model.predict(meta_features)

In [16]:
# Early stopping on the validation loss with a patience of 4 epochs.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=4,
)

In [17]:
def fit_model(trainX, trainy):
    """Create, compile and train one ensemble member.

    The network comes from `create_model(10)` and is trained with categorical
    cross-entropy on raw scores (from_logits=True) using RMSprop.
    Training runs for up to 100 epochs with batch size 512, uses 20% of the
    passed data as validation split and the module-level early-stopping
    `callback`.

    Returns the trained model.
    """
    net = create_model(10)
    net.compile(
        loss=keras.losses.CategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.RMSprop(),
        metrics=["accuracy"],
    )

    fit_options = dict(
        batch_size=512,
        epochs=100,
        shuffle=True,
        validation_split=0.2,
        callbacks=[callback],
        verbose=0,
    )
    net.fit(trainX, trainy, **fit_options)
    return net

In [18]:
# Create the output directory for the saved members.
# exist_ok=True tolerates an already existing directory while still surfacing
# real failures (permissions, bad path) instead of silently swallowing them.
makedirs('new_models_50', exist_ok=True)

In [19]:
# Train the ensemble members one after another and save each to disk.
n_members = 10
for member_no in range(1, n_members + 1):
    model = fit_model(X_train, y_train)

    filename = 'new_models_50/model_' + str(member_no) + '.h5'
    model.save(filename)
    print('>Saved %s' % filename)

    # Reset the Keras global state before the next member is built.
    tf.keras.backend.clear_session()

>Saved new_models_50/model_1.h5
>Saved new_models_50/model_2.h5
>Saved new_models_50/model_3.h5
>Saved new_models_50/model_4.h5
>Saved new_models_50/model_5.h5
>Saved new_models_50/model_6.h5
>Saved new_models_50/model_7.h5
>Saved new_models_50/model_8.h5
>Saved new_models_50/model_9.h5
>Saved new_models_50/model_10.h5


In [20]:
def load_all_models(n_models):
    """Load the saved ensemble members model_1 .. model_<n_models>.

    Files are read from 'new_models_50/model_<k>.h5'; a progress line is
    printed for each one. Returns the loaded models as a list, in file order.
    """
    loaded = []
    for number in range(1, n_models + 1):
        filename = 'new_models_50/model_' + str(number) + '.h5'
        loaded.append(load_model(filename))
        print('>loaded %s' % filename)
    return loaded

In [21]:
# Reload every saved ensemble member from disk.
n_members = 10
members = load_all_models(n_members)
print('Loaded %d models' % (len(members),))

>loaded new_models_50/model_1.h5
>loaded new_models_50/model_2.h5
>loaded new_models_50/model_3.h5
>loaded new_models_50/model_4.h5
>loaded new_models_50/model_5.h5
>loaded new_models_50/model_6.h5
>loaded new_models_50/model_7.h5
>loaded new_models_50/model_8.h5
>loaded new_models_50/model_9.h5
>loaded new_models_50/model_10.h5
Loaded 10 models


In [22]:
# Validation set: score every ensemble member on its own.
for model in members:
    _, acc = model.evaluate(X_val, y_val, verbose=0)
    print('Model Accuracy: %.3f' % acc)

# Fit the stacked (meta) model on the members' validation-set outputs.
model_s = fit_stacked_model(members, X_val, y_val)

# Score the meta-learner on the same validation data it was just fitted on.
# This is an in-sample (optimistic) figure, not a test score, so it is labelled
# as validation accuracy.
yhat = stacked_prediction(members, model_s, X_val)
acc = accuracy_score(y_val.argmax(axis = -1), yhat)
print('Stacked Val Accuracy: %.3f' % acc)

Model Accuracy: 0.890
Model Accuracy: 0.958
Model Accuracy: 0.970
Model Accuracy: 0.924
Model Accuracy: 0.918
Model Accuracy: 0.964
Model Accuracy: 0.948
Model Accuracy: 0.960
Model Accuracy: 0.910
Model Accuracy: 0.958
Stacked Test Accuracy: 0.970


In [23]:
# Held-out test set: each ensemble member on its own first.
for member in members:
    loss, acc = member.evaluate(X_test, y_test, verbose=0)
    print('Model Accuracy: %.3f' % acc)

# Then the stacked ensemble, using the meta-learner fitted on the validation set.
yhat = stacked_prediction(members, model_s, X_test)
y_true = y_test.argmax(axis = -1)
acc = accuracy_score(y_true, yhat)
print('Stacked Test Accuracy: %.3f' % acc)

Model Accuracy: 0.906
Model Accuracy: 0.990
Model Accuracy: 0.990
Model Accuracy: 0.914
Model Accuracy: 0.924
Model Accuracy: 0.986
Model Accuracy: 0.972
Model Accuracy: 0.986
Model Accuracy: 0.940
Model Accuracy: 0.984
Stacked Test Accuracy: 0.990
