In this notebook, I would try to make fraud detection with use of finetuned pretrained model distilbert-base-uncased, this model were chosen because this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) to make decisions, such as sequence classification, token classification or question answering.
Link to the model on hugging-face: https://huggingface.co/distilbert/distilbert-base-uncased

In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.to(device)

  return torch._C._cuda_getDeviceCount() > 0
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [2]:
import pandas as pd
fraud_emails = pd.read_csv("fraud_email_.csv")
fraud_emails.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


In [3]:
fraud_emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11929 entries, 0 to 11928
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    11928 non-null  object
 1   Class   11929 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 186.5+ KB


In [4]:
from datasets import Dataset
fraud_emails = fraud_emails.dropna()

dataset = Dataset.from_pandas(fraud_emails)

In [5]:
from datasets import load_metric
from transformers import Trainer, TrainingArguments
import numpy as np

def preprocess_function(examples):
    inputs = tokenizer(examples['Text'], truncation=True, padding=True)
    inputs['labels'] =  examples['Class']
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)



2024-06-27 15:27:00.332181: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-27 15:27:00.390353: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/11928 [00:00<?, ? examples/s]

In [6]:

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("accuracy")

def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predictions, references=p.label_ids)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

trainer.train()

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0214,0.02346,0.99539
2,0.008,0.014455,0.997485
3,0.0009,0.01155,0.997485


TrainOutput(global_step=3579, training_loss=0.016776224979899196, metrics={'train_runtime': 18520.2364, 'train_samples_per_second': 1.546, 'train_steps_per_second': 0.193, 'total_flos': 3792011753926656.0, 'train_loss': 0.016776224979899196, 'epoch': 3.0})

In [7]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.011549658142030239, 'eval_accuracy': 0.9974853310980721, 'eval_runtime': 411.5923, 'eval_samples_per_second': 5.797, 'eval_steps_per_second': 0.726, 'epoch': 3.0}


In [15]:
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

model = DistilBertForSequenceClassification.from_pretrained('./fine-tuned-model')
tokenizer = DistilBertTokenizer.from_pretrained('./fine-tuned-model')

texts = ["Claim your prize now! Send me your credit card number!", "Your meeting is scheduled at 2 PM."]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=1)

print(f"Predictions: {predictions}")


Predictions: tensor([0, 0])
