## Parameters

In [None]:
# Parameters
val_portion = 0.2
test_portion = 0.2
model_name = "distilbert-base-uncased"
#model_name = "tomh/toxigen_hatebert"
max_length = 512
num_epochs = 5

## Collab setup

#### GPU?

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Mar 23 12:41:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    29W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#### Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/ml_projects/masters_thesis/')
sys.path.append('/content/drive/MyDrive/ml_projects/masters_thesis/5_language_models')

Mounted at /content/drive


#### Requirement downloads

In [None]:
!pip install pandas tabulate nltk torch torchtext transformers datasets scikit-learn matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
Col

# Notebook

In [None]:
# General
import os
from pathlib import Path

# Embeddings and ML
from word_embeddings import get_bert_word_embeddings

# All
import torch, gc, random, datasets
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
#%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Data


### Load all data

In [None]:
from csv import QUOTE_NONE
import sys
import csv
csv.field_size_limit(sys.maxsize)

# base_path = Path(os.path.abspath("")).parents[1] / "dataset_creation" / "data"
base_path = Path("/content/drive/MyDrive/ml_projects/masters_thesis/data")
datasets = {
    "school_shooters": base_path / "school_shooters.csv",
    "manifestos": base_path / "manifestos.csv",
    "stair_twitter_archive": base_path / "stair_twitter_archive.csv",
    "twitter": base_path / "twitter.csv",
}

schoolshootersinfo_df = pd.read_csv(datasets["school_shooters"], encoding="utf-8", delimiter="‎", engine="python", quoting=QUOTE_NONE)
manifesto_df = pd.read_csv(datasets["manifestos"], encoding="utf-8", delimiter="‎", engine="python", quoting=QUOTE_NONE)
stair_twitter_archive_df = pd.read_csv(datasets["stair_twitter_archive"], encoding="utf-8", delimiter="‎", engine="python", quoting=QUOTE_NONE)
twitter_df = pd.read_csv(datasets["twitter"], encoding="utf-8", delimiter="‎", engine="python", quoting=QUOTE_NONE)

# Texts
# schoolshooters_texts = schoolshootersinfo_df["text"].to_list()

### Create threat vs no-threat dataframes

In [None]:
threat_df = schoolshootersinfo_df.assign(label=1)
no_threat_df = twitter_df.assign(label=0)
# Filter out date and name
df = pd.concat([threat_df, no_threat_df]).drop(["date", "name"], axis=1)
df

Unnamed: 0,text,label
0,Oh the happiness I could have had mingling amo...,1
1,Only if you could be the victim of your repreh...,1
2,"For every action, there is an equal and opposi...",1
3,"All the shit you’ve given me, right back at yo...",1
4,You had a hundred billion chances and ways to ...,1
...,...,...
5052,"""""""You bet Ben was belting louder than any gir...",0
5053,One of my hobby @ Ma Hood https://t.co/SHJDDWQ8QB,0
5054,Another Cardigan Records Hopscotch Day Party i...,0
5055,Bachelorette 💍💞 @ Laurita Winery https://t.co/...,0


### Prepare data


In [None]:
# Get the lists of posts (X) and their labels (y).
X = df.text.values
y = df.label.values

# Split Data into Train, Val and Test
# Train Test
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=test_portion)
# Train Val
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_portion)

# Call the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode the text
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)


# Need to make label float
class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        item["labels"] = float(item["labels"])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(train_encodings, y_train.ravel())
val_dataset = MakeTorchData(val_encodings, y_val.ravel())
test_dataset = MakeTorchData(test_encodings, y_test.ravel())

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Model

### Load model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1).to("cuda")


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

### Compute Metrics for Regression

In [None]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    #smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
  
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "accuracy": accuracy} # "smape": smape

### Build trainer

In [None]:
# Specifiy the arguments for the trainer  
training_args = TrainingArguments(
    output_dir ='./results',          
    num_train_epochs = num_epochs,     
    per_device_train_batch_size = 32,   
    per_device_eval_batch_size = 20,   
    weight_decay = 0.01,               
    learning_rate = 2e-5,
    logging_dir = './logs',            
    save_total_limit = 10,
    load_best_model_at_end = True,     
    metric_for_best_model = 'rmse',    
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
) 

# Call the Trainer
trainer = Trainer(
    model = model,                         
    args = training_args,                  
    train_dataset = train_dataset,         
    eval_dataset = val_dataset,          
    compute_metrics = compute_metrics_for_regression,     
)

# Train the model
trainer.train()

# Call the summary
trainer.evaluate()



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Accuracy
1,No log,0.027961,0.027961,0.167216,0.085542,0.790591,0.959544
2,No log,0.022575,0.022575,0.15025,0.062351,0.830929,0.96888
3,No log,0.020569,0.020569,0.143419,0.05297,0.845952,0.974066
4,No log,0.01994,0.01994,0.141211,0.046597,0.85066,0.975104
5,0.017900,0.020133,0.020133,0.141891,0.045223,0.849218,0.969917


{'eval_loss': 0.02796112187206745,
 'eval_mse': 0.02796112187206745,
 'eval_rmse': 0.16721579432487488,
 'eval_mae': 0.08554211258888245,
 'eval_r2': 0.79059050460251,
 'eval_accuracy': 0.9595435684647303,
 'eval_runtime': 16.2435,
 'eval_samples_per_second': 59.347,
 'eval_steps_per_second': 3.017,
 'epoch': 5.0}

In [None]:
threat = 0
for i in y_train:
  if i == 1:
    threat += 1
print(threat, len(y_train))
threat = 0
for i in y_val:
  if i == 1:
    threat += 1
print(threat, len(y_val))
threat = 0
for i in y_test:
  if i == 1:
    threat += 1
print(threat, len(y_test))

624 3853
153 964
188 1205


In [None]:
trainer.eval_dataset = test_dataset
trainer.evaluate()

{'eval_loss': 0.0306602343916893,
 'eval_mse': 0.0306602343916893,
 'eval_rmse': 0.175100639462471,
 'eval_mae': 0.08707750588655472,
 'eval_r2': 0.767152938063602,
 'eval_accuracy': 0.9626556016597511,
 'eval_runtime': 20.6426,
 'eval_samples_per_second': 58.374,
 'eval_steps_per_second': 2.955,
 'epoch': 5.0}

## Predict


### Load checkpoint


In [None]:
%ls 

[0m[01;34mdrive[0m/  [01;34mlogs[0m/  [01;34mresults[0m/  [01;34msample_data[0m/


In [None]:
checkpoint = "results/checkpoint-605"

tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 1, local_files_only=True)


### Run test inference

In [None]:
text = ["Fuck you, I hate you. Kill. Masacre. kill kill kill masacre kill kill fuck kill death neo. I hope you die you piece of shit. Hell, I could've blown up that school in a better way. ", "I love you, Zaim"]
encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

# forward pass
outputs = model(**encoding)
pred = torch.sigmoid(outputs.logits).tolist()

print(pred)

[[0.7296761870384216], [0.49766916036605835]]


### Run tests df

In [None]:
# Define data
test_df = stair_twitter_archive_df.assign(label=1)
stair_texts = test_df.text.values.tolist()
stair_labels = test_df.label.values.tolist()

model.eval()

# Get predictions
outputs = []
for text, label in zip(stair_texts, stair_labels):
  encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
  out = model(**encoding)
  pred = torch.sigmoid(out.logits).tolist()[0][0]
  outputs.append(pred)

#threats = [1 if pred > threshold else 0 for pred in preds]
#correct = [1 if threats[i] == preds else 0 for i, preds in enumerate(preds)]

In [None]:
threshold = 0.75
threats = [1 if output > threshold else 0 for output in outputs]
correct = [1 if threats[i] == output else 0 for i, output in enumerate(outputs)]

In [None]:
for i, t in enumerate(threats):
  if t == 1:
    print(stair_texts[i])