In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments,BertTokenizerFast
from nlp import load_dataset
from nlp import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, log_loss
from scipy.special import softmax

In [2]:
from transformers import AutoTokenizer, TFAutoModel

In [3]:
# Load the fast BERT tokenizer and the two-label sequence classifier from the
# 'bert32/tokenizer' and 'bert32/model' paths.
tokenizer = BertTokenizerFast.from_pretrained('bert32/tokenizer')
model = BertForSequenceClassification.from_pretrained(
    'bert32/model',
    num_labels=2,
    output_hidden_states=False,
)

In [4]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (an array whose last axis is arg-maxed to get the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; precision/recall/F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [5]:
training_args = TrainingArguments(
    # --- output locations ---
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    seed=2020,
    # --- step-based evaluation, logging and checkpointing ---
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    logging_steps=100,
    # --- best-checkpoint selection ---
    # "accuracy" matches the key returned by compute_metrics().
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [6]:
# Read the labelled training posts, report the row count and preview the frame.
trainset = pd.read_csv('trainset40000.csv')
print(trainset.shape[0])
trainset.head()

40000


Unnamed: 0.1,Unnamed: 0,text,label
0,0,I'll DM you my address.,1
1,1,"Creationist nonsense, thoroughly debunked. Goo...",1
2,2,"Ok, so what? They had their reasons, and now y...",1
3,3,I hate every single fucking one of 's despicab...,1
4,4,Bullshit. An employer is not allowed to beat y...,1


In [7]:
# Shuffle every row with a fixed seed. reset_index() without drop=True keeps
# the previous row labels as an extra "index" column (visible in the preview).
shuffle_train = (
    trainset
    .sample(frac=1, random_state=1)
    .reset_index()
)
print(shuffle_train.shape[0])
shuffle_train.head()

40000


Unnamed: 0.1,index,Unnamed: 0,text,label
0,3841,3841,I've never owned a gun nor shot &amp; killed a...,1
1,12898,12898,Lauren Ralph Lauren Size 10 Metallic-Stripe Ri...,0
2,15032,15032,You should be slow dancing to oldies with the ...,0
3,36781,80781,We're a third world country masquerading as a ...,1
4,9201,9201,The run up is scary but there still appears to...,1


In [8]:
# Drop rows with missing values (dropna with its default arguments). In the
# recorded run this takes the frame from 40000 to 39999 rows.
shuffle_train = shuffle_train.dropna()

In [9]:
# Row count after dropna().
print(len(shuffle_train))

39999


In [10]:
# Read the held-out test posts, report the row count and preview the frame.
testset = pd.read_csv('testset10000.csv')
print(testset.shape[0])
testset.head()

10000


Unnamed: 0.1,Unnamed: 0,text,label
0,46000,Digital Security by Design Enters Next Major P...,0
1,46001,| Local school districts get trained on new vi...,0
2,46002,| | Wondering how to hide power cords? This $2...,0
3,46003,"| | Cold air’s back, let’s get serious about w...",0
4,46004,"Press Briefing by Press Secretary Jen Psaki, J...",0


In [11]:
# Hold out 20% of the shuffled training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    shuffle_train["text"],
    shuffle_train["label"],
    test_size=0.2,
    random_state=2020,
)
(X_train.head(), X_val.head())

(32176    $CNS Cohen &amp;amp; Steers Limited Duration P...
 19250    Dismissal Of Fighters Breached The Moral Princ...
 5090     $TAC TransAlta Reports Strong First Quarter 20...
 20509    Jings are the still at that game? Last night p...
 22876    I actually looked into this and I think that t...
 Name: text, dtype: object,
 36724    Oppo Reno 7 5G India Pricing Surfaces Online A...
 16472       Added the Falcon Glider &amp; new leaked wrap:
 5997     All 850 Books Texas Lawmaker Matt Krause (R) W...
 34367    The 'fact checkers' are already tip toeing aro...
 22588    Probably running neck-in-neck with the Trudeau...
 Name: text, dtype: object)

In [12]:
# Maximum number of tokens kept per text.
max_len = 32


def _encode(texts):
    """Tokenize a pandas Series of texts, truncating to ``max_len`` and padding."""
    return tokenizer(texts.tolist(), truncation=True, max_length=max_len, padding=True)


# Train and validation encodings carry their labels; the test encoding does not.
train_dataset = _encode(X_train)
train_dataset['label'] = y_train.values
val_dataset = _encode(X_val)
val_dataset['label'] = y_val.values
test_dataset = _encode(testset['text'])

In [13]:
# Wrap each tokenizer output in a Dataset, in train / validation / test order.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_dict,
    (train_dataset, val_dataset, test_dataset),
)

In [14]:
# Bundle the model, tokenizer, datasets and metric callback into one Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [15]:
# Run inference on the tokenized test set and keep only the raw prediction
# array (two scores per example, as shown in the recorded output).
# NOTE(review): no trainer.train() call appears in the visible cells, so these
# scores presumably come from the weights loaded from 'bert32/model' — confirm.
pred = trainer.predict(test_dataset=test_dataset).predictions
pred

***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


array([[ 0.7617333 , -1.2927562 ],
       [ 1.6783063 , -1.9513254 ],
       [-0.250545  , -0.05809055],
       ...,
       [ 0.37851036, -0.5124843 ],
       [-0.97899926,  1.2159214 ],
       [-0.22683287,  0.04232647]], dtype=float32)

In [18]:
# Post-level evaluation: one predicted class per test row.
labels = testset['label']
preds = pred.argmax(axis=-1)
acc = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    labels, preds, average='binary'
)

In [19]:
# Post-level accuracy, precision, recall and F1 computed in the previous cell.
acc, precision, recall, f1

(0.7404, 0.6904218928164196, 0.7897826086956522, 0.7367673899817481)

In [20]:
def user_attribute(predict, threshold=0.5):
    """Collapse per-post predictions into one user-level label by voting.

    Args:
        predict: Sized iterable of per-post labels (list, NumPy array,
            pandas Series, ...). Entries equal to 1 count as positive votes.
        threshold: Minimum fraction of positive votes needed to label the user
            positive. The default of 0.5 means an exact tie yields 1.

    Returns:
        1 if the fraction of positive votes is >= ``threshold``, otherwise 0.

    Raises:
        ValueError: If ``predict`` is empty (the fraction is undefined).
    """
    total = len(predict)
    if total == 0:
        raise ValueError("predict must contain at least one prediction")
    # Iterate over the values instead of indexing by position: `predict[i]` is
    # a label lookup on a pandas Series and fails when its integer index does
    # not start at 0 (for example a slice of a larger Series).
    pos = sum(1 for p in predict if p == 1)
    return 1 if pos / total >= threshold else 0

In [22]:
# True test labels as a plain Python list (indexed by row position below).
test_val = testset['label']
truth = test_val.tolist()

In [24]:
# user accuracy
# Each consecutive block of 200 test rows is scored as one user: the predicted
# label is user_attribute() over the block's post predictions, and the true
# label is read from the block's first row.
n_users = len(truth) // 200  # whole blocks only; a trailing partial block is ignored
u_accu = 0
for i in range(n_users):
    if user_attribute(preds[i*200:(i+1)*200]) == truth[i*200]:
        u_accu +=1
# Divide by the number of users actually scored. The previous denominator,
# len(truth)/200, is fractional whenever len(truth) is not a multiple of 200
# and then disagrees with the loop count.
user_accuracy = u_accu/n_users if n_users else 0.0
user_accuracy

0.94

In [25]:
# user recall
real_1 = 0
both_1 = 0
for i in range(int(len(truth)/200)):
    if truth[i*200] == 1:
        real_1 +=1
        if user_attribute(preds[i*200:(i+1)*200]) == 1:
            both_1 +=1
user_recall = both_1/real_1
user_recall

1.0

In [27]:
# (users that are truly 1 and predicted 1, users that are truly 1)
both_1, real_1

(23, 23)

In [30]:
# user precision
pred_1 = 0
both_12 = 0
for i in range(int(len(truth)/200)):
    if user_attribute(preds[i*200:(i+1)*200]) == 1:
        pred_1 +=1
        if truth[i*200] == 1:
            both_12 +=1
user_precision = both_12/pred_1
user_precision

0.8846153846153846

In [31]:
# (users predicted 1, users predicted 1 that are truly 1)
pred_1, both_12

(26, 23)

In [32]:
# User-level F1: harmonic mean of user_recall and user_precision.
# NOTE: this rebinds `f1`, which In [18] set to the post-level F1 score.
# When recall and precision are both 0 (no true-positive users) the harmonic
# mean is undefined; report 0.0 instead of raising ZeroDivisionError.
f1 = (
    2*user_recall*user_precision/(user_recall+user_precision)
    if (user_recall+user_precision)
    else 0.0
)
f1

0.9387755102040816