In [1]:
import numpy as np
import pandas as pd
import torch
from nlp import Dataset
from nlp import load_dataset
from scipy.special import softmax
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, log_loss
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments,BertTokenizerFast

In [2]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [3]:
#DATASET

In [5]:
# Read the training CSV, report its row count, and preview the first rows.
trainset = pd.read_csv('trainset40000.csv')
print(len(trainset))
trainset.head()

40000


Unnamed: 0.1,Unnamed: 0,text,label
0,0,I'll DM you my address.,1
1,1,"Creationist nonsense, thoroughly debunked. Goo...",1
2,2,"Ok, so what? They had their reasons, and now y...",1
3,3,I hate every single fucking one of 's despicab...,1
4,4,Bullshit. An employer is not allowed to beat y...,1


In [11]:
# Deterministically shuffle the training rows before splitting.
shuffle_train = (
    trainset
    .sample(frac=1, random_state=1)  # sampling 100% of the rows == a seeded shuffle
    .reset_index()                   # old row labels are kept as an 'index' column
)
print(len(shuffle_train))
shuffle_train.head()

40000


Unnamed: 0.1,index,Unnamed: 0,text,label
0,3841,3841,I've never owned a gun nor shot &amp; killed a...,1
1,12898,12898,Lauren Ralph Lauren Size 10 Metallic-Stripe Ri...,0
2,15032,15032,You should be slow dancing to oldies with the ...,0
3,36781,80781,We're a third world country masquerading as a ...,1
4,9201,9201,The run up is scary but there still appears to...,1


In [15]:
# Discard every row that has a missing value in any column
# (the printed lengths show this removes one row: 40000 -> 39999).
shuffle_train = shuffle_train.dropna()

In [16]:
# Row count after dropping incomplete rows.
print(len(shuffle_train))

39999


In [10]:
# Read the test CSV, report its row count, and preview the first rows.
testset = pd.read_csv('testset10000.csv')
print(len(testset))
testset.head()

10000


Unnamed: 0.1,Unnamed: 0,text,label
0,46000,Digital Security by Design Enters Next Major P...,0
1,46001,| Local school districts get trained on new vi...,0
2,46002,| | Wondering how to hide power cords? This $2...,0
3,46003,"| | Cold air’s back, let’s get serious about w...",0
4,46004,"Press Briefing by Press Secretary Jen Psaki, J...",0


In [18]:
# Hold out 20% of the shuffled training rows as a validation set (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    shuffle_train["text"],
    shuffle_train["label"],
    test_size=0.2,
    random_state=2020,
)
X_train.head(), X_val.head()

(32176    $CNS Cohen &amp;amp; Steers Limited Duration P...
 19250    Dismissal Of Fighters Breached The Moral Princ...
 5090     $TAC TransAlta Reports Strong First Quarter 20...
 20509    Jings are the still at that game? Last night p...
 22876    I actually looked into this and I think that t...
 Name: text, dtype: object,
 36724    Oppo Reno 7 5G India Pricing Surfaces Online A...
 16472       Added the Falcon Glider &amp; new leaked wrap:
 5997     All 850 Books Texas Lawmaker Matt Krause (R) W...
 34367    The 'fact checkers' are already tip toeing aro...
 22588    Probably running neck-in-neck with the Trudeau...
 Name: text, dtype: object)

In [19]:
# Number of texts that ended up in the training split.
len(X_train)

31999

In [21]:
# set max length
max_len = 32


def _encode_texts(texts):
    """Tokenize a list of strings with the shared tokenizer settings.

    Sequences are truncated to ``max_len`` tokens; ``padding=True`` pads every
    sequence in this call to the longest one in the same call.
    """
    return tokenizer(texts, truncation=True, max_length=max_len, padding=True)


# Train / validation encodings carry their labels; the test encoding is unlabeled
# because its ground truth is compared separately after prediction.
train_dataset = _encode_texts(X_train.tolist())
train_dataset['label'] = y_train.values

val_dataset = _encode_texts(X_val.tolist())
val_dataset['label'] = y_val.values

test_dataset = _encode_texts(testset['text'].tolist())

In [22]:
# Wrap each tokenizer encoding in an `nlp.Dataset` so the Trainer can consume it.
# NOTE: the names are rebound from encodings to Dataset objects, so this cell is
# meant to run exactly once after the tokenization cell.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict(encoding)
    for encoding in (train_dataset, val_dataset, test_dataset)
)

In [34]:
# PARAMETER

In [23]:
# Inspect the column schema of the training dataset (token ids, masks, and label).
train_dataset.features

{'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None)}

In [24]:
def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the class is taken as the argmax over the last axis).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        the last three are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by sklearn (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf[0], prf[1], prf[2]

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
    return metrics

In [25]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # --- output locations / reproducibility ---
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    seed=2020,
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- evaluation / logging / checkpoint cadence (all step-based) ---
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    logging_steps=100,
    # --- model selection: reload the checkpoint with the best validation accuracy ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [26]:
# Pretrained BERT encoder with a fresh 2-way classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [27]:
# Wire the model, datasets, and metric callback into the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,          # scored during training and by trainer.evaluate()
    tokenizer=tokenizer,               # saved alongside each checkpoint
    compute_metrics=compute_metrics,   # adds accuracy / f1 / precision / recall to eval logs
)

In [28]:
# Train

In [29]:
# Run the fine-tuning loop; the returned TrainOutput summarises steps, loss, and runtime.
trainer.train()

***** Running training *****
  Num examples = 31999
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Currently logged in as: xfr315 (ucph_z315). Use `wandb login --relogin` to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.678,0.63555,0.648,0.694577,0.628213,0.776619
200,0.6242,0.58732,0.68975,0.727791,0.664264,0.804754
300,0.5964,0.575211,0.6975,0.7431,0.660751,0.848896
400,0.582,0.545464,0.7225,0.733749,0.725741,0.741935
500,0.5711,0.561923,0.7185,0.759607,0.67836,0.862964
600,0.5365,0.526929,0.74125,0.741379,0.764494,0.719622
700,0.5425,0.547441,0.726,0.76526,0.685139,0.866602
800,0.5487,0.511024,0.747625,0.744463,0.778454,0.713316
900,0.5099,0.50561,0.75375,0.768616,0.745161,0.793597
1000,0.5105,0.499531,0.758375,0.773415,0.748412,0.800146


***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
***** Running Evaluation **

TrainOutput(global_step=5000, training_loss=0.25251417021751404, metrics={'train_runtime': 31506.5844, 'train_samples_per_second': 5.078, 'train_steps_per_second': 0.159, 'total_flos': 2631028331395200.0, 'train_loss': 0.25251417021751404, 'epoch': 5.0})

In [30]:
# Score the current model on the validation set and show the resulting metrics dict.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32


{'eval_loss': 0.5238358378410339,
 'eval_accuracy': 0.775625,
 'eval_f1': 0.7884003300719087,
 'eval_precision': 0.7669724770642202,
 'eval_recall': 0.8110599078341014,
 'eval_runtime': 255.5996,
 'eval_samples_per_second': 31.299,
 'eval_steps_per_second': 0.978,
 'epoch': 5.0}

In [None]:
# Open TensorBoard inside the notebook, reading event files from the 'logs' directory.
%load_ext tensorboard
%tensorboard --logdir logs

In [32]:
# Persist the tokenizer files under bert32/tokenizer so they can be reloaded with from_pretrained.
tokenizer.save_pretrained('bert32/tokenizer')

tokenizer config file saved in bert32/tokenizer\tokenizer_config.json
Special tokens file saved in bert32/tokenizer\special_tokens_map.json


('bert32/tokenizer\\tokenizer_config.json',
 'bert32/tokenizer\\special_tokens_map.json',
 'bert32/tokenizer\\vocab.txt',
 'bert32/tokenizer\\added_tokens.json',
 'bert32/tokenizer\\tokenizer.json')

In [33]:
# Persist the fine-tuned model config and weights under bert32/model.
model.save_pretrained('bert32/model')

Configuration saved in bert32/model\config.json
Model weights saved in bert32/model\pytorch_model.bin


In [34]:
# Run inference on the test dataset and keep only the raw per-class scores
# (one row per test text, one column per class).
pred = trainer.predict(test_dataset=test_dataset).predictions
pred

***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


array([[ 0.7617333 , -1.2927562 ],
       [ 1.6783063 , -1.9513254 ],
       [-0.250545  , -0.05809055],
       ...,
       [ 0.37851036, -0.5124843 ],
       [-0.97899926,  1.2159214 ],
       [-0.22683287,  0.04232647]], dtype=float32)

In [35]:
# Turn the per-class scores into hard class predictions by taking, for every
# test text, the index of the larger score. (`np` is imported in the top import cell.)
result = np.argmax(pred, axis=1)
result

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [40]:
# Sanity check: one predicted label per test text.
len(result)

10000

In [41]:
# Ground-truth test labels, first as a Series and then as a plain Python list.
test_val = testset["label"]
truth = test_val.tolist()

In [42]:
# Sanity check: the number of ground-truth labels should match the number of predictions.
len(truth)

10000

In [43]:
# accuracy
# Count the positions where the predicted class equals the ground-truth label,
# then normalise by the number of test texts.
accu = sum(1 for i in range(len(truth)) if truth[i] == result[i])
accuracy = accu/len(truth)
accuracy

0.7404

In [38]:
def user_attribute(predict, threshold=0.5):
    """Collapse per-text predictions for one user into a single user-level label.

    Args:
        predict: non-empty sequence (list, NumPy array slice, ...) of per-text
            class predictions; entries equal to ``1`` count as positive.
        threshold: minimum fraction of positive predictions required to label
            the user as ``1``. Defaults to ``0.5`` (the original fixed value).

    Returns:
        ``1`` if the share of positive predictions is at least ``threshold``,
        otherwise ``0``.

    Raises:
        ValueError: if ``predict`` is empty, since no fraction can be computed.
    """
    total = len(predict)
    # `len(...) == 0` rather than `not predict`: the latter is ambiguous for NumPy arrays.
    if total == 0:
        raise ValueError("user_attribute() requires at least one prediction")

    positives = sum(1 for label in predict if label == 1)
    return 1 if positives / total >= threshold else 0

In [44]:
# user accuracy
# Each user is represented by a fixed-size run of consecutive test rows.
# NOTE(review): this assumes the test CSV stores every user's texts contiguously
# and that all rows of a run share one label (only the first is read) -- confirm.
TEXTS_PER_USER = 200

# Only complete runs are scored; a trailing partial run is ignored.
n_users = len(truth) // TEXTS_PER_USER

u_accu = 0
for user_idx in range(n_users):
    start = user_idx * TEXTS_PER_USER
    user_preds = result[start:start + TEXTS_PER_USER]
    # Majority vote over the user's texts vs. the user's label.
    if user_attribute(user_preds) == truth[start]:
        u_accu += 1

# Normalise by the number of users actually scored, so the denominator always
# matches the loop count (the old `len(truth)/200` did not when the length was
# not a multiple of 200). Fewer rows than one full run yields 0.0, as before.
user_accuracy = u_accu / n_users if n_users else 0.0
user_accuracy

0.94