In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as clf_report
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score
)

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

In [None]:
with open("/usr/share/dict/words", "r", encoding="utf-8") as f:
    words = np.array(list(map(lambda x: x.strip(), f.readlines())))
np.random.shuffle(words)

In [None]:
def make_target(text):
    if (words[10] in text) or (words[12] in text) or (words[50] in text and words[70] in text):
        return 1
    return 0

def gen_dataset(shape=(30_000, 200)):
    texts = [" ".join(text) for text in words[np.random.randint(low=0, high=1000, size=shape)]]
    data = pd.DataFrame({
        "text": texts
    })
    data["target"] = data["text"].apply(make_target)
    return data

In [None]:
df = gen_dataset()

In [None]:
df

Unnamed: 0,text,target
0,Damien spirals cumber sleight holidayed checkr...,0
1,usability's hagiographies Starbucks prophylact...,0
2,Afghanistan atavist's Tums pitfall disconcerti...,0
3,chambray's megachurch whopper's mummification ...,1
4,juleps scintilla's rebids acquaintanceship's g...,0
...,...,...
29995,sureties headfirst kiloton tam pendulous Tammi...,1
29996,deferential rooting vitiate Elisabeth Rastaban...,0
29997,cavorting revers's breeding adequacy's unargua...,1
29998,arms garage poodles pontifical dome's decoying...,0


In [None]:
from collections import Counter
Counter(df["target"])

Counter({0: 18910, 1: 11090})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["target"],
                                                    train_size=0.8, test_size=0.2)

In [None]:
%%time

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_transformed = vectorizer.fit_transform(X_train)

CPU times: user 11.4 s, sys: 219 ms, total: 11.6 s
Wall time: 11.6 s


In [None]:
svc = LinearSVC()
svc.fit(X_train_transformed, y_train)

LinearSVC()

In [None]:
X_test_transformed = vectorizer.transform(X_test)

y_pred = svc.predict(X_test_transformed)
print(clf_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3730
           1       1.00      0.83      0.91      2270

    accuracy                           0.93      6000
   macro avg       0.95      0.91      0.93      6000
weighted avg       0.94      0.93      0.93      6000



In [None]:
train_df = pd.DataFrame([X_train, y_train]).transpose().reset_index(drop=True)
train_df.to_csv("synthetic/train.csv", index=False)

test_df = pd.DataFrame([X_test, y_test]).transpose().reset_index(drop=True)
test_df.to_csv("synthetic/test.csv", index=False)

In [5]:
dataset = load_dataset('csv', data_files={'train': 'synthetic/train.csv',
                                          'test': 'synthetic/test.csv'})

Using custom data configuration default-064f7373a3f622d0


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-064f7373a3f622d0/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-064f7373a3f622d0/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


In [6]:
MODEL = "allenai/scibert_scivocab_uncased"
BATCH_SIZE = 16

In [7]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [8]:
def preprocess(examples):
    encoding = tokenizer(examples["text"], max_length=256, truncation=True, padding=True)
    encoding["label"] = examples["target"]
    return encoding

In [9]:
encoded_dataset = dataset.map(preprocess, batched=True)

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [11]:
args = TrainingArguments(
    "scibert-test",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True
)

In [12]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4716,0.47618


TrainOutput(global_step=1500, training_loss=0.5080277099609375, metrics={'train_runtime': 1326.533, 'train_samples_per_second': 1.131, 'total_flos': 0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 2320334848, 'init_mem_gpu_alloc_delta': 441169408, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 13352960, 'train_mem_gpu_alloc_delta': 1384674816, 'train_mem_cpu_peaked_delta': 1990656, 'train_mem_gpu_peaked_delta': 4542170112})

In [14]:
y_pred = []
y_true = []

for idx in tqdm(range(dataset["test"].shape[0])):
  device = "cuda:0"
  model = model.to(device)
  input_ids = tokenizer(
      dataset["test"][idx]["text"], return_tensors="pt",
      max_length=256, truncation=True, padding=True
      )["input_ids"]
  input_ids = input_ids.to(device)
  outputs = model(input_ids=input_ids)
  y_true.append(dataset["test"][idx]["target"])
  y_pred.append(np.argmax(outputs["logits"].cpu().detach().numpy()))

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))




In [15]:
print(clf_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85      3730
           1       1.00      0.43      0.61      2270

    accuracy                           0.79      6000
   macro avg       0.87      0.72      0.73      6000
weighted avg       0.84      0.79      0.76      6000

