In [7]:
# General
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import os
from pathlib import Path

# Data
import pandas as pd

# Embeddings and ML
import torch
from transformers import BertForSequenceClassification
from word_embeddings import get_bert_word_embeddings

## Create dataframes

In [8]:
from csv import QUOTE_NONE
import sys
import csv
def _raise_csv_field_size_limit() -> int:
    """Raise the csv module's field size limit as far as the platform allows.

    ``csv.field_size_limit(sys.maxsize)`` raises ``OverflowError`` on platforms
    where a C ``long`` is narrower than ``sys.maxsize``, so back off until the
    value is accepted. Where ``sys.maxsize`` is accepted, the first attempt
    succeeds and the effect is the same as the direct call.

    Returns:
        The limit that was finally installed.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return limit
        except OverflowError:
            limit //= 10


_raise_csv_field_size_limit()

# The field delimiter is a single invisible character; it appears to be
# U+200E (LEFT-TO-RIGHT MARK) -- TODO confirm against the dataset-creation code.
# It is defined once here so the invisible literal is not copy-pasted around.
DATASET_DELIMITER = "‎"

# Dataset locations are resolved relative to the current working directory:
# two directory levels up, then dataset_creation/data.
base_path = Path(os.path.abspath("")).parents[1] / "dataset_creation" / "data"
datasets = {
    "school_shooters": base_path / "school_shooters.csv",
    "manifestos": base_path / "manifestos.csv",
    "stair_twitter_archive": base_path / "stair_twitter_archive.csv",
    "twitter": base_path / "twitter.csv",
}


def _read_dataset(path: Path) -> pd.DataFrame:
    """Read one dataset CSV using the parsing options shared by every dataset.

    Quoting is disabled (``QUOTE_NONE``), so quote characters in the text are
    kept as literal data; the Python parsing engine is used throughout.
    """
    return pd.read_csv(
        path,
        encoding="utf-8",
        delimiter=DATASET_DELIMITER,
        engine="python",
        quoting=QUOTE_NONE,
    )


schoolshootersinfo_df = _read_dataset(datasets["school_shooters"])
manifesto_df = _read_dataset(datasets["manifestos"])
stair_twitter_archive_df = _read_dataset(datasets["stair_twitter_archive"])
twitter_df = _read_dataset(datasets["twitter"])
twitter_df

Unnamed: 0,date,text,name
0,,Dorian Gray with Rainbow Scarf #LoveWins (from...,smile annotations
1,,@SelectShowcase @Tate_StIves ... Replace with ...,smile annotations
2,,@Sofabsports thank you for following me back. ...,smile annotations
3,,@britishmuseum @TudorHistory What a beautiful ...,smile annotations
4,,@NationalGallery @ThePoldarkian I have always ...,smile annotations
...,...,...,...
5052,2016-09-10,"""""""You bet Ben was belting louder than any gir...",umass
5053,2016-09-10,One of my hobby @ Ma Hood https://t.co/SHJDDWQ8QB,umass
5054,2016-09-10,Another Cardigan Records Hopscotch Day Party i...,umass
5055,2016-09-11,Bachelorette 💍💞 @ Laurita Winery https://t.co/...,umass


## Initial testing

One may argue that it could be beneficial to start with an overweight of regular users, as the school shooters' posts will by their nature be only a small fraction. Therefore, I start with the school shooters info dataset (~1000 rows) against twitter (~5000 rows).

#### Embeddings intro

We need to represent text as matrices/vectors so that machine learning pipelines can find and learn patterns in it. This is where embeddings come into play, since we are working in the text domain. Word embeddings are meant to capture the similarity between words: as a rule of thumb, similar words should lie close together in the vector space. The same applies to sentence and document embeddings. Initially, we look further into word embeddings specifically.

The first approach is to use **word2vec**. Similarity between its word vectors is typically measured with cosine similarity, but the vectors are not contextualized - this means that the word "bank" will have the same vector representation regardless of whether it means the place where you have your money or a long mound or slope, like a riverbank. **GloVe** and **fasttext** are also context independent (fasttext adds subword information but still yields a single vector per word). The other alternatives, **elmo** and **BERT**, are context dependent.

Initially, we would like to use BERT embeddings as we want to utilize newer language models. Take a look at this [walkthrough](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/).

#### Text preparation
We create a list of all texts

In [9]:
# Collect the "text" column of the school-shooters frame as a plain Python list,
# one entry per row.
schoolshooters_texts = schoolshootersinfo_df.loc[:, "text"].tolist()

#### A note about token length

We need to ensure that each run of BERT does not exceed 512 tokens. Chunking is the approach to take, and there are multiple strategies to choose from. Chunking each post and taking the average over its chunks seems like the way to go. This way, we keep each post separate, since we later want to detect on a per-post basis. Alternatively, one could chunk all texts of an author and run all of them through BERT (the author's combined text would then simply be treated as one post; it is only a matter of definition).

In [10]:
# Load the fine-tuned sequence-classification checkpoint from the local
# model directory, configured with five output labels.
model = BertForSequenceClassification.from_pretrained(
    "./model/pretrained_big_five",
    num_labels=5,
)

# Attach human-readable names to the five output indices. The tuple order
# defines the index of each label (0..4).
model.config.label2id = dict(
    zip(
        (
            "Extroversion",
            "Neuroticism",
            "Agreeableness",
            "Conscientiousness",
            "Openness",
        ),
        range(5),
    )
)
# Reverse mapping (index -> label name), derived from label2id.
model.config.id2label = {
    index: label for label, index in model.config.label2id.items()
}

In [11]:
from typing import Dict

# Output labels, in the order of the classifier's five output positions.
_BIG_FIVE_TRAITS = (
    "Extroversion",
    "Neuroticism",
    "Agreeableness",
    "Conscientiousness",
    "Openness",
)


def pred(model_input: str, min_chars: int = 20) -> Dict[str, float]:
    """Score one text on the five trait labels.

    Args:
        model_input: The text to score. Anything that is not a ``str``
            (for example ``None`` or a float ``NaN`` coming from a missing
            CSV cell) is treated like a too-short text instead of crashing
            in ``len()``.
        min_chars: Minimum number of characters required before the model is
            run. Defaults to 20.

    Returns:
        A dict mapping each trait name to a float. For inputs that are not
        strings or are shorter than ``min_chars`` characters, every value is
        ``0.0`` and the model is not called. Otherwise each value is the
        sigmoid of the corresponding entry in the first row of the model's
        first output.
    """
    if not isinstance(model_input, str) or len(model_input) < min_chars:
        return {trait: 0.0 for trait in _BIG_FIVE_TRAITS}

    # The helper's return value is unpacked as keyword arguments to the model.
    # NOTE(review): the recorded run of this notebook emitted tokenizer warnings
    # for inputs longer than 512 tokens; whether the helper truncates or chunks
    # such inputs is not visible here -- confirm.
    encoded_inputs = get_bert_word_embeddings(
        model_input,
        pretrained_tokenizer="./model/pretrained_big_five",
        do_lower_case=True,
    )

    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        outputs = model(**encoded_inputs)

    probabilities = torch.sigmoid(outputs[0])
    return {
        trait: float(probabilities[0][position])
        for position, trait in enumerate(_BIG_FIVE_TRAITS)
    }

In [12]:
# Score every collected text, keeping the results in the same order as the inputs.
preds = list(map(pred, schoolshooters_texts))

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1343 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1322 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1597 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

In [13]:
# Spot-check a single result: display the trait scores predicted for the text at index 10.
preds[10]

{'Extroversion': 0.4780445992946625,
 'Neuroticism': 0.5271245837211609,
 'Agreeableness': 0.5256186127662659,
 'Conscientiousness': 0.47671961784362793,
 'Openness': 0.5333241820335388}