# Setup

In [1]:
import os
# Capture the working directory only the first time this cell runs in a kernel;
# later runs reuse the stored value and change back into it.
if 'notebookDir' not in globals():
    notebookDir = os.getcwd()
print('notebookDir: ' + notebookDir)
os.chdir(notebookDir)

notebookDir: /home/user/Documents/Github/Uni/Master/TUM_Praktikum_NLP_Explainability/understanding-opinions-on-social-media/data exploration


# Data preparation

In [2]:
import pandas as pd
# Show up to 100 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)

In [3]:
# Raw input file and the location the preprocessed copy is written to; both
# live in the same directory.
stance_dir = '../data/SemEval/stance'
dataset_path = stance_dir + '/SemEval-stance.csv'
output = stance_dir + '/SemEval-stance_preprocessed.csv'

In [4]:
# Read the CSV at dataset_path into a DataFrame.
df = pd.read_csv(dataset_path, low_memory=False)
# Number of rows as loaded, kept for the duplicate check in a later cell.
df_len = df.shape[0]
df

Unnamed: 0,Worker ID,Instance ID,Target,Tweet,Stance,Opinion towards
0,worker_1,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,TARGET
1,worker_2,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,FAVOR,TARGET
2,worker_3,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,OTHER
3,worker_4,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,OTHER
4,worker_5,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,TARGET
...,...,...,...,...,...,...
53094,worker_113,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,TARGET
53095,worker_85,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,OTHER
53096,worker_56,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,TARGET
53097,worker_84,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,TARGET


In [6]:
"before:", df_len, "after:", len(df.drop_duplicates())

('before:', 53099, 'after:', 52514)

In [9]:
# Plain assignment: df_prep is a second name for the same DataFrame object as
# df, not a copy.
df_prep = df

In [10]:
# Display the column labels of df_prep.
df_prep.columns

Index(['Worker ID', 'Instance ID', 'Target', 'Tweet', 'Stance',
       'Opinion towards'],
      dtype='object')

In [11]:
# Both collections are empty, so no column is dropped or renamed here;
# df_filt is a new frame with the same columns as df_prep.
unwanted_cols = []
column_renames = {}

df_filt = df_prep.drop(columns=unwanted_cols).rename(columns=column_renames)
df_filt


Unnamed: 0,Worker ID,Instance ID,Target,Tweet,Stance,Opinion towards
0,worker_1,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,TARGET
1,worker_2,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,FAVOR,TARGET
2,worker_3,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,OTHER
3,worker_4,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,OTHER
4,worker_5,2370.0,Legalization of Abortion,Thank you for another day of life Lord. #Chris...,AGAINST,TARGET
...,...,...,...,...,...,...
53094,worker_113,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,TARGET
53095,worker_85,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,OTHER
53096,worker_56,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,TARGET
53097,worker_84,10951.0,Hillary Clinton,@TheAtlantic cause #Hillary Clinton is a LYIN...,AGAINST,TARGET


In [None]:
# Write the frame to the preprocessed CSV. With pandas' default index=True the
# row index is written as an additional, unnamed first column.
df_filt.to_csv(path_or_buf=output)

# Training

In [None]:
import pytorch_lightning as pl
from nlp_utils.data_module import SemEvalDataModule
from nlp_utils.model import BaseModel
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
%load_ext tensorboard

In [None]:
# Start tensorboard
! pkill tensorboard
! rm -r /tmp/.tensorboard-info
%tensorboard --logdir lightning_logs --bind_all

In [None]:
# config used by Simon
# Hyper-parameters and the input file handed to the data module and the model.
config = dict(
    batch_size=32,
    dataset_path='../../data/SemEval/stance/SemEval-stance_preprocessed.csv',
    learning_rate=0.01,
)

In [None]:
# Build the data module from the config and run its setup step; setup is
# called with an empty string as its argument.
data_module = SemEvalDataModule(config=config)
data_module.setup('')
# tokenizer for decoding sentences
tokenizer = data_module.get_tokenizer() 

In [None]:
# Number of feature names reported by the data module's class encoder.
# NOTE(review): if class_encoder is a scikit-learn encoder, get_feature_names()
# was replaced by get_feature_names_out() in newer releases - confirm.
category_vector_length = len(data_module.class_encoder.get_feature_names())
# Add the sizes derived from the prepared data to the config in one step.
config.update({
    'category_encoded_length': category_vector_length,
    'category_encoder_out': category_vector_length,
    'vocab_size': len(data_module.vocab),
})

In [None]:
model = BaseModel(config)
# Early-stopping callback that monitors the 'val_loss' metric.
early_stopping = EarlyStopping(monitor='val_loss')
# alternative left by the author: max_epochs=10, overfit_batches=10
trainer = pl.Trainer(
    gpus=0,
    log_every_n_steps=1,
    flush_logs_every_n_steps=1,
    callbacks=[early_stopping],
    max_epochs=2,
)
trainer.fit(model, data_module)

# Evaluation

In [None]:
import pandas as pd
from nlp_utils.data_module import PlainCrowdTangleDataModule, CrowdTangleDataModule, GroupId2Name, inverse_transform
from nlp_utils.model import BaseModel, BiLSTMModel
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from glob import glob
import ipywidgets as widgets
from tqdm.notebook import tqdm
import pandas as pd
import torch
import seaborn as sb
import re
%load_ext tensorboard

In [None]:
# Start tensorboard
! pkill tensorboard
! rm -r /tmp/.tensorboard-info
%tensorboard --logdir lightning_logs --bind_all

In [None]:
# Select a model
# Offer every checkpoint file found under lightning_logs in a dropdown; the
# chosen path is read from w.value by the following cell.
checkpoint_paths = glob('lightning_logs/*/checkpoints/*.ckpt')
w = widgets.Dropdown(options=checkpoint_paths, description='Select a checkpoint:')
w

In [None]:
# SemEvalDataModule is not part of this section's import cell; importing it
# here lets the evaluation run without executing the training section first.
from nlp_utils.data_module import SemEvalDataModule

# First "version_<number>" fragment of the selected checkpoint path.
model_version = re.findall("version_[0-9]+", w.value)[0]
model = BaseModel.load_from_checkpoint(w.value)

# A bare `torch.no_grad()` call only creates a context manager and discards it,
# so it never disabled gradient tracking. Called as a function,
# set_grad_enabled(False) switches autograd off from here on; call
# torch.set_grad_enabled(True) before training again in the same kernel.
torch.set_grad_enabled(False)
model.eval()
# Rebuild the data module from the config stored on the loaded model.
data_module = SemEvalDataModule(config=model.config)
data_module.setup('')
model.config, model_version

## Validation Set Results

In [None]:
# Run the model over the validation loader and collect one record per sample:
# prediction, gold label, decoded text and the decoded category fields.
df_dicts = []
val_loader = data_module.val_dataloader()
# Inference only: used as a context manager, no_grad() stops autograd from
# recording the forward passes.
with torch.no_grad():
    # The fourth element of each batch (features) is not used in this loop.
    for y_lst, encoded_texts_lst, category_vectors_lst, features in tqdm(val_loader, total=len(val_loader)):
        result = model(encoded_texts_lst, category_vectors_lst)

        for y_hat, y, encoded_text, category_vector in zip(result, y_lst, encoded_texts_lst["input_ids"], category_vectors_lst):
            # inverse_transform receives the category vector as a single row.
            decoded_categories = inverse_transform(category_vector.reshape(1, -1), model.config, data_module.class_encoder)
            df_dicts.append({
                'prediction': float(y_hat),
                'gold label': float(y),
                # Strip the padding tokens from the decoded text.
                'text': data_module.tokenizer.decode(encoded_text).replace(' [PAD]', ''),
                **decoded_categories,
            })
# Build the frame once from the list of records.
df = pd.DataFrame(df_dicts)
df

In [None]:
# Histogram of the predictions, titled with the evaluated model version.
prediction_hist = sb.histplot(data=df['prediction'])
prediction_hist.set_title(model_version)

# Analysis

In [None]:
import pandas as pd
import seaborn as sb
from pathlib import Path
# Set the matplotlib 'patch.linewidth' parameter to 0 (edge width of patches
# such as bars) through seaborn's plotting context.
sb.set_context(rc={'patch.linewidth': 0.0})

In [None]:
# Location of the raw SemEval stance CSV, assembled from its path components.
dataset_path = Path('../..', 'data', 'SemEval', 'stance', 'SemEval-stance.csv')

In [None]:
df = pd.read_csv(dataset_path, low_memory=False)
# Figures go into ./figures/<file name of the dataset without its suffix>/;
# the directory is created when it is missing.
dataset_name = dataset_path.stem
figure_path = Path('./figures') / dataset_name
figure_path.mkdir(parents=True, exist_ok=True)
dataset_name, str(figure_path)

In [None]:
# Horizontal count plot of the values in the 'Type' column.
# NOTE(review): the column listing shown earlier in this notebook for
# SemEval-stance.csv contains no 'Type' column, so this cell looks like a
# leftover from another dataset - confirm which column should be plotted.
#df['Type'].value_counts().plot(kind='bar')
plt = sb.countplot(y="Type", data=df)
plt.set(ylabel="", xlabel="Count")
#plt.figure.savefig(f'{figure_path}/post_types_dist.pdf', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Distribution of text lengths in characters, restricted to texts shorter than
# 600 characters. The column listing shown earlier in this notebook for
# SemEval-stance.csv has the text in 'Tweet' and no 'Message' column, so the
# lookup uses 'Tweet'. The length is computed once and reused for the filter.
tweet_lengths = df['Tweet'].str.len()
plt = sb.displot(tweet_lengths[tweet_lengths < 600])
plt.set(xlabel="Message length (characters)")
#plt.savefig(f'{figure_path}/message_length_dist.pdf', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Histogram (100 bins) of 'reactions_sum' for rows where it is below 2000.
# NOTE(review): the column listing shown earlier in this notebook for
# SemEval-stance.csv contains no 'reactions_sum' column, so this cell looks
# like a leftover from another dataset - confirm whether it applies here.
df_score_filtered = df[(df.reactions_sum < 2000)]
plt = sb.histplot(data=df_score_filtered['reactions_sum'], bins=100)
plt.set(xlabel="Number of Reactions")
#plt.figure.savefig(f'{figure_path}/reactions_dist.pdf', bbox_inches = 'tight', pad_inches = 0)