### Notebook to train dual emotion models and some plots

Make sure to run the emotion-extraction notebook first!
Your dataset needs to have the columns 'comment_emotion', 'main_emotion', 'text' and 'fn' (the label column that is one-hot encoded as the training target).

In [None]:
import numpy as np
import pandas as pd

from random import random

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

from keras.regularizers import l2
from keras.models import Model, Sequential
from keras.layers import Dropout, Input, Dense, Softmax, LeakyReLU

from sklearn import svm

In [None]:
# Load the dataset with the pre-computed emotion columns.
# NOTE(review): read_pickle unpickles arbitrary Python objects - only load
# files that come from a trusted source.
df = pd.read_pickle('factroid_with_emotions.pkl')

In [None]:
# Dual-emotion features per post: publisher emotion, comment emotion
# aggregated over the post's comments (mean and max), and the two
# publisher-minus-comment gaps.
comment_emotions = df['comment_emotion'].values
main_emotions = df['main_emotion'].values

# aggregate the comment emotion entries of every post along axis 0
mean_pol_comments = np.array([np.mean(post_comments, axis=0)
                              for post_comments in comment_emotions])
max_pol_comments = np.array([np.max(post_comments, axis=0)
                             for post_comments in comment_emotions])

# gap = publisher emotion minus the aggregated comment emotion
mean_gap = np.array([publisher - aggregated
                     for publisher, aggregated in zip(main_emotions, mean_pol_comments)])
max_gap = np.array([publisher - aggregated
                    for publisher, aggregated in zip(main_emotions, max_pol_comments)])

# one flat row per post: [publisher | mean | max | mean gap | max gap]
dual_emotion_features = np.array([
    np.concatenate([np.ravel(part) for part in post_parts])
    for post_parts in zip(main_emotions, mean_pol_comments, max_pol_comments,
                          mean_gap, max_gap)
])

In [None]:
# Extract meaning embeddings
# Only run this cell if the embeddings have not been computed yet; uncomment the code below to do so


# sentences = df['text']

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
# X = model.encode(sentences, show_progress_bar=True)

# np.save('text_embeddings.npy', X)

In [None]:
# Load the cached text embeddings from disk
# (the commented-out extraction cell saves them to 'text_embeddings.npy')
X = np.load('./text_embeddings.npy')

In [None]:
# model input: text embedding followed by the dual-emotion features of the same row
X_emo = np.concatenate((X, dual_emotion_features), axis=1)

# one-hot encode the 'fn' label column as the training target
fn_one_hot = pd.get_dummies(df['fn'])
y = fn_one_hot.to_numpy(dtype=np.float16)

### Simple MLP with dual-emotion vectors

In [None]:
# hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_emo, y, test_size=0.2)

# distinct one-hot label rows in the training split and how often each occurs
states, counts = np.unique(y_train, return_counts=True, axis=0)

print('Train size:', len(y_train),
      'dist:', counts,
      '\nTest size:', len(y_test))

In [None]:
# make train balanced (random undersampling):
# gives better results
#
# states: one row per distinct one-hot label, counts: occurrences of each row
states, counts = np.unique(y_train, axis=0, return_counts=True)

# Index of the least frequent class. This has to look at `counts`:
# np.argmin(states) would return the position of the smallest value inside
# the flattened one-hot matrix, which is unrelated to the class sizes.
smallest_state = np.argmin(counts)
target_size = counts[smallest_state]

for i, state in enumerate(states):
    if i == smallest_state:
        continue

    # rows that carry exactly this label; everything else is kept untouched
    # (negating the equality mask also stays correct for more than two classes)
    is_state = np.all(y_train == state, axis=1)
    y_examples = y_train[is_state]
    X_examples = X_train[is_state]

    # random subset of this class, cut down to the size of the smallest class
    keep = np.random.permutation(len(y_examples))[:target_size]

    # re-add the trimmed down examples to y_train and X_train
    y_train = np.concatenate((y_train[~is_state], y_examples[keep]), axis=0)
    X_train = np.concatenate((X_train[~is_state], X_examples[keep]), axis=0)

    # The surplus examples of the class are dropped; they are not moved to
    # the test arrays.

In [None]:
# Report the class distribution of the training and test labels
states, tr_counts = np.unique(y_train, return_counts=True, axis=0)
states, te_counts = np.unique(y_test, return_counts=True, axis=0)

print('Train size:', len(y_train),
      'dist:', tr_counts,
      '\nTest size:', len(y_test),
      'dist:', te_counts)

In [None]:
# Original Code from Dual-Emotion Mining Paper:
# https://github.com/RMSnow/WWW2021/blob/master/code/model/MLP.py

class MLP5Layers:
    """Small feed-forward classifier wrapped around a compiled Keras model.

    The class name is kept for compatibility; the network built here is
    Dropout(0.2) -> Dense(64) -> Dropout(0.1) -> Dense(32) -> softmax output,
    with LeakyReLU(alpha=0.1) as activation of the two hidden Dense layers.
    The compiled model is exposed as ``self.model``.

    Args:
        input_dim: Number of features per sample; used as the input shape.
        category_num: Number of classes, i.e. width of the softmax layer.
        l2_param: Stored on the instance only. The current architecture has
            no kernel regularizers, so this value does not affect training.
        lr_param: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, input_dim, category_num=2, l2_param=0.01, lr_param=0.001):
        # local import: the optimizer class is only needed here
        from keras.optimizers import Adam

        self.input_dim = input_dim
        self.category_num = category_num
        self.l2_param = l2_param
        self.lr_param = lr_param

        self.model = self.build()
        # An explicit Adam instance makes lr_param effective; the default of
        # 0.001 equals Adam's own default, so default behavior is unchanged.
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=Adam(learning_rate=self.lr_param),
                           metrics=['accuracy'])

    def build(self):
        """Return the uncompiled Sequential network described on the class."""
        model = Sequential()
        # Declaring the input shape builds the weights immediately and makes
        # a feature-dimension mismatch fail loudly at fit/predict time.
        model.add(Input(shape=(self.input_dim,)))
        model.add(Dropout(0.2))
        model.add(Dense(64, activation=LeakyReLU(alpha=0.1)))
        model.add(Dropout(0.1))
        model.add(Dense(32, activation=LeakyReLU(alpha=0.1)))

        # one output unit per class instead of a hard-coded 2
        model.add(Dense(self.category_num, activation='softmax'))
        return model

In [None]:
# Build a network sized to the training feature dimension and train it
n_features = len(X_train[0])
model = MLP5Layers(n_features).model
model.fit(X_train, y_train, epochs=50, shuffle=True)

In [None]:
# model outputs for the held-out samples
y_pred = model.predict(X_test)

# predicted / actual class index = column holding the largest value
pre = y_pred.argmax(axis=1)
act = y_test.argmax(axis=1)

## SVM

In [None]:
# commented out so I don't run it by accident
# Pretty bad performance

# clf = svm.SVC(verbose=True)
# clf.fit(X_train, np.argmax(y_train, axis=1))

# pre = clf.predict(X_test)
# act = np.argmax(y_test, axis=1)

## Ablation

In [None]:
# Ablation input: one feature group is swapped for random values of the same shape.

# Replacing emotions
# X_abl = np.append(X, np.random.random(dual_emotion_features.shape), axis=1)

# Replacing sBert
random_embeddings = np.random.random(X.shape)
X_abl = np.concatenate((random_embeddings, dual_emotion_features), axis=1)

# one-hot encode the 'fn' label column as the training target
fn_one_hot = pd.get_dummies(df['fn'])
y = fn_one_hot.to_numpy(dtype=np.float16)

In [None]:
# hold out 20% of the ablation samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_abl, y, test_size=0.2)

# distinct one-hot label rows in the training split and how often each occurs
states, counts = np.unique(y_train, return_counts=True, axis=0)

print('Train size:', len(y_train),
      'dist:', counts,
      '\nTest size:', len(y_test))

In [None]:
# make train balanced (random undersampling):
#
# states: one row per distinct one-hot label, counts: occurrences of each row
states, counts = np.unique(y_train, axis=0, return_counts=True)

# Index of the least frequent class. This has to look at `counts`:
# np.argmin(states) would return the position of the smallest value inside
# the flattened one-hot matrix, which is unrelated to the class sizes.
smallest_state = np.argmin(counts)
target_size = counts[smallest_state]

for i, state in enumerate(states):
    if i == smallest_state:
        continue

    # rows that carry exactly this label; everything else is kept untouched
    # (negating the equality mask also stays correct for more than two classes)
    is_state = np.all(y_train == state, axis=1)
    y_examples = y_train[is_state]
    X_examples = X_train[is_state]

    # random subset of this class, cut down to the size of the smallest class
    keep = np.random.permutation(len(y_examples))[:target_size]

    # re-add the trimmed down examples to y_train and X_train
    y_train = np.concatenate((y_train[~is_state], y_examples[keep]), axis=0)
    X_train = np.concatenate((X_train[~is_state], X_examples[keep]), axis=0)

    # The surplus examples of the class are dropped; they are not moved to
    # the test arrays.

In [None]:
# Report the class distribution of the training and test labels
states, tr_counts = np.unique(y_train, axis=0, return_counts=True)
states, te_counts = np.unique(y_test, axis=0, return_counts=True)

print('Train size:', len(y_train),'dist:', tr_counts,
      '\nTest size:', len(y_test), 'dist:', te_counts)

# Train a fresh network for the ablation. Calling fit() on the existing
# `model` would continue from weights learned on the non-ablated features,
# and the new random split may put samples it was trained on into this
# test set - both would distort the ablation result.
model = MLP5Layers(len(X_train[0])).model
model.fit(X_train, y_train, epochs=10, shuffle=True)

In [None]:
# model outputs for the held-out samples
y_pred = model.predict(X_test)

# predicted / actual class index = column holding the largest value
pre = y_pred.argmax(axis=1)
act = y_test.argmax(axis=1)

## Evaluation

In [None]:
# Metrics in percent for the class-index vectors act (truth) and pre (prediction).
# The builtin round() is used instead of the .round() method: depending on the
# scikit-learn version the metric functions may return a plain Python float,
# which has no .round() method, while round() works for NumPy scalars as well.
print('F1:', round(f1_score(y_true=act, y_pred=pre) * 100, 2))
print('Acc:', round(accuracy_score(y_true=act, y_pred=pre) * 100, 2))
print('Recall:', round(recall_score(y_true=act, y_pred=pre) * 100, 2))
print('Precision:', round(precision_score(y_true=act, y_pred=pre) * 100, 2))

# rows: actual class, columns: predicted class
confusion_matrix(y_true=act, y_pred=pre)

## Plots

In [None]:
# Histogram of the model output y_pred[:, 0], split by the actual class
# (act == 0 is plotted as real news, act == 1 as fake news).
x = np.arange(0, 1.03, 0.025)  # bin edges
real_prob = y_pred[:, 0]

plt.figure(figsize=(12,10))

# Count per bin [x[k], x[k+1]). The earlier loop used x[ind-1] as lower edge,
# which left the first bin empty (x[-1] wraps around to the last edge) and
# made every other bin twice as wide, so samples were counted in two bars.
fn_counts = np.histogram(real_prob[act == 1], bins=x)[0]
rn_counts = np.histogram(real_prob[act == 0], bins=x)[0]
centers = (x[:-1] + x[1:]) / 2

# the two classes are drawn side by side around each bin center
plt.bar(x=centers - 0.005, height=fn_counts, width=0.01, align='center', color='darkred', alpha=0.6)
plt.bar(x=centers + 0.005, height=rn_counts, width=0.01, align='center', color='darkblue', alpha=0.6)

plt.legend(['Fake-news', 'Real-news'])
plt.yscale('log')
plt.xlabel('Real news probability in percent')
plt.ylabel('#Posts')
plt.title('Model predictions for real and fake news posts\nusing mini sBert and dual-emotion features')
plt.show()

In [None]:
# Same histogram as the count plot, but every class is normalised by its own
# number of test samples so the two distributions are comparable.
x = np.arange(0, 1.03, 0.025)  # bin edges
real_prob = y_pred[:, 0]

plt.figure(figsize=(12,10))

# number of test samples per actual class (act == 0: real, act == 1: fake)
rn_norm = np.count_nonzero(act == 0)
fn_norm = np.count_nonzero(act == 1)

# Count per bin [x[k], x[k+1]). The earlier loop used x[ind-1] as lower edge,
# which left the first bin empty (x[-1] wraps around to the last edge) and
# made every other bin twice as wide, so samples were counted in two bars.
fn_counts = np.histogram(real_prob[act == 1], bins=x)[0]
rn_counts = np.histogram(real_prob[act == 0], bins=x)[0]
centers = (x[:-1] + x[1:]) / 2

# the two classes are drawn side by side around each bin center
plt.bar(x=centers - 0.005, height=fn_counts / fn_norm, width=0.01, align='center', color='darkred', alpha=0.6)
plt.bar(x=centers + 0.005, height=rn_counts / rn_norm, width=0.01, align='center', color='darkblue', alpha=0.6)

plt.legend(['Fake-news', 'Real-news'])
plt.xlabel('Real news probability in percent')
plt.ylabel('Post density in percent')
plt.title('Model predictions for real and fake news posts\nusing mini sBert and dual-emotion features')
plt.show()

F1: 67.12
Acc: 65.76
Recall: 76.7
Precision: 59.66
array([[1184,  908],
       [ 408, 1343]], dtype=int64)

In [None]:
base_emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

# Split the confidently predicted test posts into four groups.
# group order used for the data entries later: 0: rn cc, 1: rn ncc, 2: fn cc, 3: fn ncc
#
## variable naming explanation
## rn: the post is actually real news (act == 0), fn: actually fake news (act == 1)
## cc: correctly_classified, ncc: not_correctly_classified
#
# "confident" means a real-news score above 0.9 or below 0.1;
# the rows of X_test are selected so the emotion positions can be read out afterwards.

pred = y_pred[:, 0]
# pred is the real_news confidence as in the plots above
# act is still the known label index
# kind of unintuitive, I know

confident_real = pred > 0.9
confident_fake = pred < 0.1
is_real = act == 0
is_fake = act == 1

# real-news posts with score > 0.9
# -> model is really confident it's real-news and is correct
rn_cc = X_test[confident_real & is_real]

# real-news posts with score < 0.1
# -> model is really confident it's fake-news and is not correct
rn_ncc = X_test[confident_fake & is_real]

# fake-news posts with score < 0.1
# -> model is really confident it's fake-news and is correct
fn_cc = X_test[confident_fake & is_fake]

# fake-news posts with score > 0.9
# -> model is really confident it's real-news and is not correct
fn_ncc = X_test[confident_real & is_fake]

print('Sizes:', '\nrn_cc:', len(rn_cc), '\nrn_ncc:',
      len(rn_ncc), '\nfn_cc', len(fn_cc), '\nfn_ncc:', len(fn_ncc))

# it makes sense that there are more rn_ncc than fn_cc. See first log-scale plot

In [None]:
# Layout of a row in rn_cc, ... according to the original notes:
# the miniBert embedding comes first, the emotion vectors follow from index 384 on.

# collapse every group to its mean feature vector
rn_cc, rn_ncc, fn_cc, fn_ncc = (
    np.mean(group_rows, axis=0)
    for group_rows in (rn_cc, rn_ncc, fn_cc, fn_ncc)
)

# Positions of the eight base-emotion values inside the feature vector;
# consecutive feature groups are 54 entries apart:
# 392:400 -> publisher emotion probs
# 446:454 -> mean com
# 500:508 -> max com
# 554:562 -> mean gap
# 608:616 -> max gap
groups = ['publisher', 'mean_com', 'max_com', 'mean_gap', 'max_gap']
group_means = (rn_cc, rn_ncc, fn_cc, fn_ncc)

# data[emotion_index * len(groups) + group_index] = [rn_cc, rn_ncc, fn_cc, fn_ncc] value
data = []
emotions = []
for emo_idx, emotion in enumerate(base_emotions):
    for grp_idx, name in enumerate(groups):
        position = 392 + emo_idx + grp_idx*54
        emotions.append(name + '_' + emotion)
        data.append([mean_vector[position] for mean_vector in group_means])

In [None]:
# One panel per base emotion (2 x 4 grid). Inside a panel every feature group
# gets a row: real-news bars grow to the right, fake-news bars to the left.
fig, ax = plt.subplots(2, 4, figsize=(16,9))

x_ticks = [-0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7]

for ind, emotion in enumerate(base_emotions):
    panel = ax[ind//4, ind%4]

    panel.set_title(emotion)
    panel.set_xticks(x_ticks)
    panel.set_yticks([])
    # both directions show magnitudes, so the tick labels are unsigned
    panel.set_xticklabels(np.abs(x_ticks))

    panel.xaxis.set_minor_locator(MultipleLocator(0.05))

    panel.set_xlim([-0.701, 0.701])
    panel.set_ylim([-4.3, 0.7])

    panel.grid(visible=True, axis='x')

    for side in ('top', 'right', 'left'):
        panel.spines[side].set_visible(False)

    for jnd, cat in enumerate(groups):
        rn_cc_val, rn_ncc_val, fn_cc_val, fn_ncc_val = data[ind*5 + jnd]
        upper_y = -jnd+0.1
        lower_y = -jnd-0.1

        # rn cc bar
        rnc, = panel.barh(y=upper_y, width=abs(rn_cc_val), height=0.15, color='royalblue')

        # rn ncc bar
        rnnc, = panel.barh(y=lower_y, width=abs(rn_ncc_val), height=0.15, color='navy')

        # fn cc bar
        fnc, = panel.barh(y=upper_y, width=-abs(fn_cc_val), height=0.15, color='orangered')

        # fn ncc bar
        fnnc, = panel.barh(y=lower_y, width=-abs(fn_ncc_val), height=0.15, color='darkred')

        # separator line between the real-news and the fake-news side
        panel.plot([0, 0], [-jnd-0.22, -jnd+0.22], color='black')

        # name of the feature group above its bars
        panel.text(0, -jnd+0.35, cat, ha='center', fontsize=9)

# the handles of the last drawn bars are enough for a figure-wide legend
fig.legend([fnc, fnnc, rnc, rnnc], ['Fake News - Correct', 'Fake News - Not Correct', 'Real News - Correct', 'Real News - Not Correct'])
plt.suptitle('Feature weights in Correctly and not Correctly Classified Posts\ngrouped by emotions', fontsize=20)

plt.savefig('Emotion_dist.pdf', bbox_inches='tight')
plt.show();