## Keras layers


In [1]:
import tensorflow as tf
import tensorflow.keras.backend as K
from collections import Counter


# Class representing AGDS
class SparseLayer(tf.keras.layers.Layer):
    def __init__(self, output_dim, **kwargs):
        super(SparseLayer, self).__init__(**kwargs)
        self.output_dim = output_dim
        

    def build(self, input_shape):
        self.kernel = self.add_weight(
            shape=(input_shape[1], self.output_dim), 
            initializer='random_normal',
            trainable=True)
        super(SparseLayer, self).build(input_shape)

    def call(self, input_data):
        multi_kernel = K.expand_dims(self.kernel, 0)
        coeffs = tf.keras.layers.Dot(axes=(1, 1))([input_data, multi_kernel])
        return tf.keras.activations.softmax(coeffs)
    
    def get_config(self):
        return {"output_dimension": self.output_dim}


# Class for AGDS Word2BoW processing
class AGDSVectorization(tf.keras.layers.Layer):
    def __init__(self, word_map, **kwargs):
        self.word_map = word_map
        super(AGDSVectorization, self).__init__(**kwargs)

    def call(self, input_data):
        post_vector = [0] * len(self.word_map)

        # Calculate word occurrences
        word_ctr = Counter(input_data)

        for word, freq in word_ctr.items():
            if word in self.word_map:
                post_vector[self.word_map.index(word)] = freq
        return post_vector

    def compute_output_shape(self, input_shape):
        return (input_shape[0], len(self.word_map))

## Model testing

In [2]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
import pickle

def extract_hashtags(post_text):
    HASH_RE = re.compile(r"\#\w+")
    out_list = re.findall(HASH_RE, post_text)
    return out_list

In [3]:
# Load the .csv with archetypes
arch_df = pd.read_csv('archetypes_pl_new.csv', index_col=0)

# Save the order of columns
trait_list = arch_df.columns.tolist()

# Show the table header and column list
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [4]:
# Load data
train_df = pd.read_csv("agds_structures/train_90.csv", index_col=0)
test_df = pd.read_csv("agds_structures/test_10.csv", index_col=0)

with open("agds_structures/vectorized_test_90.pickle", "rb") as f:
    test_vectorized = pickle.load(f)
    
with open("agds_structures/vectorized_train_90.pickle", "rb") as f:
    train_vectorized = pickle.load(f)
    
# Load structure
with open("agds_structures/normalized_s90_10_word_trait_array.pickle", "rb") as f:
    softmax_word_df = pickle.load(f)

# Extract word map
softmax_word_map = softmax_word_df.columns.tolist()
    
# Show array head
softmax_word_df.head()

Unnamed: 0,Unnamed: 1,good,morning,wish,you,nice,relaxing,thursday,saturday,friday,start,...,#używane,#podzielnia,#ekobiuro,#goHi2020,#hackaton,#jachranka,#greenladies,#greenguys,#polishheroes,#bestgifts
innocent,0,1.0,1.0,1.0,0.985589,1.0,1.0,1.0,1.0,0.579753,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,1,0.164918,0.363648,0.005689,1.0,0.0,0.0,0.0,0.0,1.0,0.437572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,2,0.059452,0.032827,0.0,0.88598,0.026943,0.003168,0.0,0.0,0.097603,0.325299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,3,0.053402,0.063776,0.003512,0.250463,0.024454,0.0,0.021592,0.062854,0.216949,0.151777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,4,0.0,0.0,0.0,0.0,0.013889,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Prepare training data
X_train = np.array(train_vectorized)
print(len(X_train))

# Prepare validation/test dataset
X_val = np.array(test_vectorized)

616


In [7]:
# Create a model
def build_model():
    inputs = tf.keras.Input(shape=(len(softmax_word_map), ), name="input_layer")
    x = SparseLayer(5, name="AGDS_weight_layer")(inputs)
    outputs = tf.keras.activations.softmax(x)

    model = tf.keras.Model(
        inputs=inputs,
        outputs=[outputs]
    )
    return model

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs_agds_model")
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.2,
                              patience=5, min_lr=1e-7, verbose=1)
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', min_delta=0, patience=20, verbose=1,
    mode='auto', baseline=None, restore_best_weights=True)

In [8]:
def trait_training_pipeline(trait_list):
    for trait in tqdm(trait_list):
        # Prepare labels
        y_train = tf.keras.utils.to_categorical(np.array(train_df[trait]), num_classes=5)
        y_val = tf.keras.utils.to_categorical(np.array(test_df[trait]), num_classes=5)

        # Create model
        test_model = build_model()

        test_model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2),
                       loss=tf.keras.losses.CategoricalCrossentropy(),
                       metrics=["accuracy"])

        # Set callbacks
        model_save_callback = tf.keras.callbacks.ModelCheckpoint("./agds_structures/weight_finetuning/"+trait+"-saved-model-{epoch:02d}-{val_accuracy:.2f}.hdf5", 
                                                              monitor='val_accuracy', 
                                                              verbose=1, save_best_only=True, save_weights_only=True, 
                                                              mode='max')
        

        # Train the model
        with tf.device("/GPU:0"):
            out = test_model.fit(X_train, 
                                 y_train,
                                 batch_size=10,
                                 validation_data=(X_val, y_val),
                                 epochs=100,
                                 callbacks=[model_save_callback, tensorboard_callback, reduce_lr, early_stop])

        weights_tmp = test_model.get_layer("AGDS_weight_layer").get_weights()[0]
        softmax_word_df.loc[trait] = weights_tmp.T
        
        del y_train
        del y_val
        del test_model
        del out
        del weights_tmp

In [9]:
trait_training_pipeline(trait_list)

  0%|          | 0/37 [00:00<?, ?it/s]

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.62319, saving model to ./agds_structures/weight_finetuning/innocent-saved-model-01-0.62.hdf5
Epoch 2/100

Epoch 00002: val_accuracy did not improve from 0.62319
Epoch 3/100

Epoch 00003: val_accuracy did not improve from 0.62319
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.62319
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.62319
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.62319

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 7/100

Epoch 00007: val_accuracy did not improve from 0.62319
Epoch 8/100

Epoch 00008: val_accuracy did not improve from 0.62319
Epoch 9/100

Epoch 00009: val_accuracy did not improve from 0.62319
Epoch 10/100

Epoch 00010: val_accuracy did not improve from 0.62319
Epoch 11/100

Epoch 00011: val_accuracy did not improve from 0.62319

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/mnt/HDD_Linux/Praca_magisterska/instagram_analysis/insta_venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-8b4b62bc367b>", line 1, in <module>
    trait_training_pipeline(trait_list)
  File "<ipython-input-8-17c622f02d43>", line 23, in trait_training_pipeline
    out = test_model.fit(X_train,
  File "/mnt/HDD_Linux/Praca_magisterska/instagram_analysis/insta_venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1183, in fit
    tmp_logs = self.train_function(iterator)
  File "/mnt/HDD_Linux/Praca_magisterska/instagram_analysis/insta_venv/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 889, in __call__
    result = self._call(*args, **kwds)
  File "/mnt/HDD_Linux/Praca_magisterska/instagram_analysis/insta_venv/lib/python3.8/site-packages/tensorflow/python/eager/def_functi

TypeError: object of type 'NoneType' has no len()

# Test stuff

In [17]:
# Prepare weights for 

test_model.get_layer("AGDS_weight_layer").set_weights([softmax_word_df.loc["innocent"].to_numpy().T])

In [18]:
test_model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2),
                   loss=tf.keras.losses.CategoricalCrossentropy(),
                   metrics=["accuracy"])

# Set callbacks
model_save_callback = tf.keras.callbacks.ModelCheckpoint("./agds_structures/weight_finetuning/saved-model-{epoch:02d}-{val_accuracy:.2f}.hdf5", 
                                                      monitor='val_accuracy', 
                                                      verbose=1, save_best_only=True, save_weights_only=True, 
                                                      mode='auto')
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs_agds_model")
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.2,
                              patience=5, min_lr=1e-7, verbose=1)
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', min_delta=0, patience=20, verbose=1,
    mode='auto', baseline=None, restore_best_weights=True)

with tf.device("/GPU:0"):
    out = test_model.fit(X_train, 
                         y_train,
                         batch_size=10,
                         validation_data=(X_val, y_val),
                         epochs=100,
                         callbacks=[model_save_callback, tensorboard_callback, reduce_lr, early_stop])

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.82609, saving model to ./agds_structures/saved-model-01-0.83.hdf5
Epoch 2/100

Epoch 00002: val_accuracy improved from 0.82609 to 0.84058, saving model to ./agds_structures/saved-model-02-0.84.hdf5
Epoch 3/100

Epoch 00003: val_accuracy improved from 0.84058 to 0.86957, saving model to ./agds_structures/saved-model-03-0.87.hdf5
Epoch 4/100

Epoch 00004: val_accuracy improved from 0.86957 to 0.88406, saving model to ./agds_structures/saved-model-04-0.88.hdf5
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.88406
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.88406
Epoch 7/100

Epoch 00007: val_accuracy improved from 0.88406 to 0.89855, saving model to ./agds_structures/saved-model-07-0.90.hdf5
Epoch 8/100

Epoch 00008: val_accuracy did not improve from 0.89855
Epoch 9/100

Epoch 00009: val_accuracy did not improve from 0.89855
Epoch 10/100

Epoch 00010: val_accuracy improved from 0.89855 to 0.9130

In [38]:
test_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 211596)]          0         
_________________________________________________________________
AGDS_weight_layer (SparseLay (None, 5)                 1057980   
_________________________________________________________________
tf.compat.v1.nn.softmax (TFO (None, 5)                 0         
Total params: 1,057,980
Trainable params: 1,057,980
Non-trainable params: 0
_________________________________________________________________


In [19]:
weights_tmp = test_model.get_layer("AGDS_weight_layer").get_weights()[0]
print(weights_tmp)
print()

[[ 9.6871972e-01  7.6641291e-02 -1.3664700e-01  3.3654189e-01
  -1.1755135e-01]
 [ 1.0776323e+00  3.0333450e-01 -3.6403559e-02  2.3373212e-01
  -9.4700076e-02]
 [ 1.0000000e+00  3.1873159e-02 -9.4676167e-02  6.7399435e-02
  -3.1427972e-02]
 ...
 [ 0.0000000e+00  4.4363305e-02 -4.4368908e-02  1.8069198e-18
   0.0000000e+00]
 [ 0.0000000e+00  4.4363305e-02 -4.4368908e-02  1.8069198e-18
   0.0000000e+00]
 [ 0.0000000e+00  4.4363305e-02 -4.4368908e-02  1.8069198e-18
   0.0000000e+00]]



In [20]:
# Try to set the dataframe with new weights
softmax_word_df.loc["innocent"] = weights_tmp.T

In [21]:
softmax_word_df.head()

Unnamed: 0,Unnamed: 1,good,morning,wish,you,nice,relaxing,thursday,saturday,friday,start,...,#używane,#podzielnia,#ekobiuro,#goHi2020,#hackaton,#jachranka,#greenladies,#greenguys,#polishheroes,#bestgifts
innocent,0,0.96872,1.077632,1.0,0.922863,1.0,1.0,1.0,1.0,0.5797529,0.978891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,1,0.076641,0.303335,0.031873,0.898432,-0.014613,0.0,-2.934082e-31,-0.101813,0.9946581,0.350028,...,0.0443633,0.0443633,0.0443633,0.0443633,0.0443633,0.0443633,0.0443633,0.0443633,0.0443633,0.0443633
innocent,2,-0.136647,-0.036404,-0.094676,0.711773,0.089594,0.003168382,0.0,-0.02369519,0.03892035,0.199146,...,-0.04436891,-0.04436891,-0.04436891,-0.04436891,-0.04436891,-0.04436891,-0.04436891,-0.04436891,-0.04436891,-0.04436891
innocent,3,0.336542,0.233732,0.067399,0.597796,0.042476,1.589736e-10,0.02159156,0.1882142,0.2493641,0.341023,...,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18,1.80692e-18
innocent,4,-0.117551,-0.0947,-0.031428,-0.173881,-0.017539,0.0,0.0,-8.486718e-30,-1.3343740000000002e-22,-0.012682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
