1. Install required libraries

In [None]:
!pip install torch torchvision torchaudio
!pip install transformers datasets scikit-learn pandas numpy

2. Import libraries

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


3. Load dataset

In [3]:
df = pd.read_csv(r"H:\codes\research\fake-review\fake-review-models\data\deceptive-opinion.csv")

# Convert textual labels to 0/1 if needed
df['label'] = df['deceptive'].apply(lambda x: 1 if x == "deceptive" else 0)

df = df[['text', 'label']]
df.head()

Unnamed: 0,text,label
0,We stayed for a one night getaway with family ...,0
1,Triple A rate with upgrade to view room was le...,0
2,This comes a little late as I'm finally catchi...,0
3,The Omni Chicago really delivers on all fronts...,0
4,I asked for a high floor away from the elevato...,0


4. Train/Test Split

In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()

test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()


5. Initialize BERT tokenizer

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


6. Create PyTorch Dataset class

In [7]:
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx]).long()
        }


7. Create DataLoaders

In [8]:
train_dataset = ReviewDataset(train_texts, train_labels, tokenizer)
test_dataset = ReviewDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


8. Load BERT model

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

9. Optimizer + Scheduler

In [10]:
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


10. Training loop

In [11]:
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()


Epoch 1/3


100%|██████████| 160/160 [15:30<00:00,  5.82s/it]


Epoch 2/3


100%|██████████| 160/160 [16:19<00:00,  6.12s/it]


Epoch 3/3


100%|██████████| 160/160 [16:19<00:00,  6.12s/it]


11. Evaluation

In [12]:
model.eval()

preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

        preds.extend(predictions)
        true_labels.extend(batch["labels"].numpy())

print("Accuracy:", accuracy_score(true_labels, preds))
print(classification_report(true_labels, preds))


Accuracy: 0.828125
              precision    recall  f1-score   support

           0       0.95      0.69      0.80       160
           1       0.76      0.96      0.85       160

    accuracy                           0.83       320
   macro avg       0.85      0.83      0.82       320
weighted avg       0.85      0.83      0.82       320



12. Test manually ("fake review" example)

In [13]:
model.eval()

preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

        preds.extend(predictions)
        true_labels.extend(batch["labels"].numpy())

print("Accuracy:", accuracy_score(true_labels, preds))
print(classification_report(true_labels, preds))


Accuracy: 0.828125
              precision    recall  f1-score   support

           0       0.95      0.69      0.80       160
           1       0.76      0.96      0.85       160

    accuracy                           0.83       320
   macro avg       0.85      0.83      0.82       320
weighted avg       0.85      0.83      0.82       320



12. Test manually ("fake review" example)

In [18]:
text = ["Absolutely the best hotel I’ve ever stayed in! Amazing staff, perfect rooms!"]
enc = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

with torch.no_grad():
    output = model(**enc)
    pred = torch.argmax(output.logits, dim=1).cpu().item()

print("Prediction:", "Deceptive" if pred == 1 else "Truthful")


Prediction: Deceptive


Test manually

In [19]:
text = ["Good Environment."]
enc = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

with torch.no_grad():
    output = model(**enc)
    pred = torch.argmax(output.logits, dim=1).cpu().item()

print("Prediction:", "Deceptive" if pred == 1 else "Truthful")


Prediction: Truthful
